In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, kruskal


In [None]:
# Read each country's cleaned dataset and tag every row with its country of
# origin, so rows stay attributable once the frames are stacked together.
benin = pd.read_csv("../data/benin_clean.csv").assign(Country="Benin")
sierra_leone = pd.read_csv("../data/sierra_leone_clean.csv").assign(
    Country="Sierra Leone"
)
togo = pd.read_csv("../data/togo_clean.csv").assign(Country="Togo")

# Stack the three frames into one long DataFrame with a fresh 0..n-1 index.
df = pd.concat(
    [benin, sierra_leone, togo],
    ignore_index=True,
)



In [None]:
# Draw one boxplot per irradiance metric to compare its distribution across
# the three countries.
metrics = ["GHI", "DNI", "DHI"]
for metric in metrics:
    fig, ax = plt.subplots(figsize=(8, 6))
    # seaborn >= 0.13 deprecates passing `palette` without `hue`. Mapping hue
    # to the x variable keeps one colour per country; legend=False suppresses
    # the legend, which would only repeat the x tick labels.
    sns.boxplot(
        data=df,
        x="Country",
        y=metric,
        hue="Country",
        palette="Set2",
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Boxplot of {metric} by Country")
    ax.set_ylabel(f"{metric} (W/m²)")
    ax.set_xlabel("Country")
    ax.grid(True, linestyle="--", alpha=0.5)
    plt.show()
    # Release the figure explicitly so figures do not pile up across iterations.
    plt.close(fig)



In [None]:
# Per-country mean, median and standard deviation of each irradiance metric,
# rounded to two decimals; the bare last expression renders the table.
summary = (
    df.groupby("Country")[["GHI", "DNI", "DHI"]]
    .agg(["mean", "median", "std"])
    .round(2)
)
summary

In [None]:
# Compare GHI across the three countries.
# Missing readings are dropped first: f_oneway propagates NaN by default, so a
# single missing value would turn the whole result into NaN.
ghi_samples = [
    frame["GHI"].dropna() for frame in (benin, sierra_leone, togo)
]

# One-way ANOVA: tests whether the group means are equal.
f_stat, p_value_anova = f_oneway(*ghi_samples)

print(f"ANOVA p-value: {p_value_anova:.4f}")

# Kruskal-Wallis is rank-based and does not rely on ANOVA's normality and
# equal-variance assumptions, so it serves as a cross-check on the result.
h_stat, p_value_kruskal = kruskal(*ghi_samples)

print(f"Kruskal-Wallis p-value: {p_value_kruskal:.4f}")

In [None]:
# Rank countries by mean GHI (highest first) and show them as horizontal bars.
avg_ghi = df.groupby("Country")["GHI"].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(6, 4))
# seaborn >= 0.13 deprecates passing `palette` without `hue`. Mapping hue to
# the country labels keeps one colour per bar; legend=False suppresses the
# legend, which would only repeat the y tick labels.
sns.barplot(
    x=avg_ghi.values,
    y=avg_ghi.index,
    hue=avg_ghi.index,
    palette="YlOrBr",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Average GHI (W/m²)")
ax.set_title("Average GHI by Country")
ax.grid(axis="x", linestyle="--", alpha=0.5)
fig.tight_layout()
plt.show()



### 📌 Key Observations
- **Togo** shows the highest median and average GHI, suggesting strong solar potential.
- **Benin** presents moderate irradiance with relatively low variability.
- **Sierra Leone** has the lowest GHI but the highest DHI, meaning a larger share of its irradiance arrives as diffuse light; this suggests potential for systems that perform well under diffuse conditions.

**Statistical Test:**  
- One-way ANOVA on GHI: p-value = 0.0002 (below the conventional 0.05 threshold) → the differences in mean GHI between countries are statistically significant.
