# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kruskal
import numpy as np
import os

# Seed NumPy's global random number generator with a fixed value (42)
np.random.seed(42)

# Ensure the folder that receives the saved figures is present
os.makedirs('plots', exist_ok=True)

# Read each cleaned per-country CSV and tag every row with its country name
benin_df = pd.read_csv('data/benin_clean.csv').assign(Country='Benin')
sierra_leone_df = pd.read_csv(
    'data/sierraleone-bumbuna_clean.csv'
).assign(Country='Sierra Leone')
togo_df = pd.read_csv('data/togo-dapaong_qc_clean.csv').assign(Country='Togo')

# Stack the three frames into one, renumbering the index from 0
country_frames = [benin_df, sierra_leone_df, togo_df]
df = pd.concat(country_frames, ignore_index=True)

# 1. Boxplots for GHI, DNI, DHI
# One panel per irradiance column, all in a single row, each showing the
# per-country distribution of that column.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, irradiance_col in zip(axes, ['GHI', 'DNI', 'DHI']):
    sns.boxplot(x='Country', y=irradiance_col, data=df, hue='Country', ax=ax)
    ax.set_title(f'{irradiance_col} by Country')

fig.tight_layout()
fig.savefig('plots/cross_country_boxplots.png')
# Show before closing: once a figure is closed there is nothing left for
# plt.show() to display.
plt.show()
plt.close(fig)

# 2. Summary Table
# Build one record per (metric, country) pair holding the mean, median and
# standard deviation of that metric for that country.
metrics = ['GHI', 'DNI', 'DHI']
countries = ['Benin', 'Sierra Leone', 'Togo']

# Filter the combined frame once per country rather than once per pair
rows_by_country = {name: df[df['Country'] == name] for name in countries}

summary_data = []
for metric in metrics:
    for country in countries:
        values = rows_by_country[country][metric]
        record = {'Country': country, 'Metric': metric}
        record['Mean'] = values.mean()
        record['Median'] = values.median()
        record['Std'] = values.std()
        summary_data.append(record)

summary_df = pd.DataFrame(summary_data)

# Wide view for the console: one row per metric, one column per
# (statistic, country) combination
wide_summary = summary_df.pivot(
    index='Metric', columns='Country', values=['Mean', 'Median', 'Std']
)
print("\nSummary Table:")
print(wide_summary)

# Export summary table (long format) to CSV
summary_df.to_csv('data/summary_metrics.csv', index=False)

# 3. Statistical Testing (Kruskal-Wallis on GHI)
# Missing readings are dropped first: scipy's kruskal propagates NaN by
# default, so a single missing GHI value would make both the statistic and
# the p-value NaN and push the comparison below into the
# "no significant difference" branch regardless of the data.
ghi_benin = benin_df['GHI'].dropna()
ghi_sierra_leone = sierra_leone_df['GHI'].dropna()
ghi_togo = togo_df['GHI'].dropna()

stat, p_value = kruskal(ghi_benin, ghi_sierra_leone, ghi_togo)
print("\nKruskal-Wallis Test for GHI:")
print(f"Statistic: {stat:.4f}, p-value: {p_value:.4f}")
# 0.05 is the significance threshold used for the verdict printed below
if p_value < 0.05:
    print("Significant difference in GHI between countries (p < 0.05).")
else:
    print("No significant difference in GHI between countries (p >= 0.05).")

# 4. Bar Chart for Average GHI
# Mean GHI per country, ordered from highest to lowest so the bars read as a
# ranking.
avg_ghi = df.groupby('Country')['GHI'].mean().sort_values(ascending=False)
plt.figure(figsize=(8, 6))
# Colours are assigned by bar position (rank), not by country name
avg_ghi.plot(kind='bar', color=['#1f77b4', '#ff7f0e', '#2ca02c'])
plt.title('Average GHI by Country')
plt.xlabel('Country')
plt.ylabel('Average GHI (W/m²)')
plt.tight_layout()
plt.savefig('plots/avg_ghi_ranking.png')
# Show before closing: once the figure is closed there is nothing left for
# plt.show() to display.
plt.show()
plt.close()


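# Recorded cell output (appears stale: it names 'data/sierraleon-bumbuna_clean.csv',
# whereas the load statement reads 'data/sierraleone-bumbuna_clean.csv'; re-run the cell to refresh):
# FileNotFoundError: [Errno 2] No such file or directory: 'data/sierraleon-bumbuna_clean.csv'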

# 5. Key Observations (Markdown Cell)
"""
### Key Observations
- **Benin vs. Sierra Leone vs. Togo GHI**: Benin may show the highest median GHI, indicating strong solar potential, but possibly also higher variability (confirm against the boxplot spread and the Std column of the summary table), which would suggest less consistent conditions.
- **DNI and DHI Differences**: Sierra Leone might have lower DNI but higher DHI than the other two countries, which would point to a larger share of diffuse radiation, possibly due to cloudier conditions.
- **Statistical Significance**: A Kruskal-Wallis p-value below 0.05 indicates that the GHI distribution of at least one country differs significantly from the others (it does not identify which one); this can guide prioritization of countries for solar investment.
"""