In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import f_oneway, kruskal

%matplotlib inline
# Figure defaults shared by every plot in this notebook
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('whitegrid')

# Read each cleaned per-country CSV (Timestamp parsed as datetime) and tag
# every row with its country so the frames stay distinguishable once stacked
benin = pd.read_csv(
    '../data/benin_clean.csv', parse_dates=['Timestamp']
).assign(Country='Benin')
sierraleone = pd.read_csv(
    '../data/sierraleone_clean.csv', parse_dates=['Timestamp']
).assign(Country='Sierra Leone')
togo = pd.read_csv(
    '../data/togo_clean.csv', parse_dates=['Timestamp']
).assign(Country='Togo')

# Stack the three frames into one table with a fresh 0..n-1 index
all_data = pd.concat([benin, sierraleone, togo], ignore_index=True)

# Report row counts per source and overall as a quick check on the load
print('Data loaded successfully!')
for label, frame in (
    ('Benin', benin),
    ('Sierra Leone', sierraleone),
    ('Togo', togo),
    ('Total', all_data),
):
    print(f'{label}: {len(frame)} rows')

# Cross-Country Comparison
## 1. Summary Statistics Table

In [None]:
# Build a Country x (statistic, metric) table holding the mean, median and
# standard deviation of each irradiance column
metrics = ['GHI', 'DNI', 'DHI']

# One filtered frame per country, in the order the rows should be generated
frames_by_country = {
    name: all_data[all_data['Country'] == name]
    for name in ('Benin', 'Sierra Leone', 'Togo')
}

# Long format first: one record per (country, metric) pair
summary_stats = [
    {
        'Country': name,
        'Metric': column,
        'Mean': frame[column].mean(),
        'Median': frame[column].median(),
        'Std Dev': frame[column].std(),
    }
    for name, frame in frames_by_country.items()
    for column in metrics
]

summary_df = pd.DataFrame(summary_stats)

# Reshape to wide format: countries as rows, (statistic, metric) as columns
summary_pivot = summary_df.pivot(
    index='Country',
    columns='Metric',
    values=['Mean', 'Median', 'Std Dev'],
)

print('Summary Statistics by Country:')
display(summary_pivot.round(2))

## 2. Boxplots Comparison

In [None]:
# Boxplots of GHI, DNI, DHI side-by-side (one plot per metric, colored by country)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

metrics = ['GHI', 'DNI', 'DHI']
# Color i is applied to the i-th country in the plotting order below
colors = ['#3498db', '#e74c3c', '#2ecc71']

for idx, metric in enumerate(metrics):
    data_for_plot = []
    labels = []

    for country in ['Benin', 'Sierra Leone', 'Togo']:
        # Missing readings are dropped before the values are handed to boxplot
        country_data = all_data[all_data['Country'] == country][metric].dropna()
        data_for_plot.append(country_data)
        labels.append(country)

    bp = axes[idx].boxplot(data_for_plot, patch_artist=True)
    # Label the ticks explicitly instead of using boxplot(labels=...): that
    # keyword was renamed to `tick_labels` in Matplotlib 3.9 and now raises a
    # deprecation warning, whereas set_xticklabels works on old and new releases.
    axes[idx].set_xticklabels(labels)

    # Color the boxes (patch_artist=True makes them fillable patches)
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

    axes[idx].set_title(f'{metric} Distribution by Country')
    axes[idx].set_ylabel(f'{metric} (W/m²)')
    axes[idx].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# Alternative: the same per-metric comparison drawn with seaborn
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Fix the category order so each palette entry is paired with a known country
# instead of depending on the order in which countries appear in the data
country_order = ['Benin', 'Sierra Leone', 'Togo']

for idx, metric in enumerate(metrics):
    # Passing `palette` without `hue` is deprecated since seaborn 0.13, so the
    # x variable is also assigned to hue. dodge=False keeps one full-width box
    # per country on releases that would otherwise offset the hue levels.
    sns.boxplot(
        data=all_data,
        x='Country',
        y=metric,
        hue='Country',
        order=country_order,
        hue_order=country_order,
        palette=colors,
        dodge=False,
        ax=axes[idx],
    )

    # The hue legend only repeats the x tick labels; remove it if one was drawn
    legend = axes[idx].get_legend()
    if legend is not None:
        legend.remove()

    axes[idx].set_title(f'{metric} Distribution by Country')
    axes[idx].set_ylabel(f'{metric} (W/m²)')
    axes[idx].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 3. Statistical Testing

In [None]:
# Compare GHI across the three countries: normality check, one-way ANOVA,
# and the Kruskal-Wallis test as a non-parametric alternative
benin_ghi = all_data[all_data['Country'] == 'Benin']['GHI'].dropna()
sierraleone_ghi = all_data[all_data['Country'] == 'Sierra Leone']['GHI'].dropna()
togo_ghi = all_data[all_data['Country'] == 'Togo']['GHI'].dropna()

# Check normality (Shapiro-Wilk test on samples of at most 5000 values)
print('Normality Tests (sample of 5000):')
for country, data in [('Benin', benin_ghi), ('Sierra Leone', sierraleone_ghi), ('Togo', togo_ghi)]:
    # Fixed seed: without it a different subset is drawn on every run, so the
    # printed W and p values would change on each Restart & Run All
    sample = data.sample(min(5000, len(data)), random_state=42)
    stat, p_value = stats.shapiro(sample)
    print(f'{country}: W={stat:.4f}, p={p_value:.4f}')

print('\n' + '='*50)

# One-way ANOVA across the three country groups
f_stat, p_value_anova = f_oneway(benin_ghi, sierraleone_ghi, togo_ghi)
print(f'\nOne-way ANOVA Results for GHI:')
print(f'F-statistic: {f_stat:.4f}')
print(f'p-value: {p_value_anova:.4e}')
if p_value_anova < 0.05:
    print('Result: Significant difference between countries (p < 0.05)')
else:
    print('Result: No significant difference between countries (p >= 0.05)')

print('\n' + '='*50)

# Kruskal-Wallis test (non-parametric alternative) on the same three groups
h_stat, p_value_kruskal = kruskal(benin_ghi, sierraleone_ghi, togo_ghi)
print(f'\nKruskal-Wallis Test Results for GHI:')
print(f'H-statistic: {h_stat:.4f}')
print(f'p-value: {p_value_kruskal:.4e}')
if p_value_kruskal < 0.05:
    print('Result: Significant difference between countries (p < 0.05)')
else:
    print('Result: No significant difference between countries (p >= 0.05)')

In [None]:
# Run the same ANOVA / Kruskal-Wallis comparison on the other two metrics
metrics_to_test = ['DNI', 'DHI']

for metric in metrics_to_test:
    # Non-missing values of the current metric, one series per country
    benin_data, sierraleone_data, togo_data = (
        all_data.loc[all_data['Country'] == name, metric].dropna()
        for name in ('Benin', 'Sierra Leone', 'Togo')
    )
    groups = (benin_data, sierraleone_data, togo_data)

    f_stat, p_value_anova = f_oneway(*groups)
    h_stat, p_value_kruskal = kruskal(*groups)

    # One report block per metric: header line, then one line per test
    print(
        f'\n{metric} Statistical Tests:',
        f'  ANOVA: F={f_stat:.4f}, p={p_value_anova:.4e}',
        f'  Kruskal-Wallis: H={h_stat:.4f}, p={p_value_kruskal:.4e}',
        sep='\n',
    )

## 4. Key Observations

### Summary of Key Findings:

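1. **Highest median GHI and its variability** - TODO: name the country with the highest median GHI and state whether it also has the largest standard deviation (see the summary table and boxplots).
2. **Most consistent solar irradiance** - TODO: name the country with the smallest spread (standard deviation / box height) across GHI, DNI, and DHI.
3. **Statistical significance of differences** - TODO: report the ANOVA and Kruskal-Wallis p-values and state whether the between-country differences are significant at the 0.05 level.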

**Note:** Please review the summary statistics, boxplots, and statistical test results above to fill in specific observations.

## 5. Visual Summary: Country Ranking

In [None]:
# Bar chart ranking countries by average GHI (highest first)
country_means = all_data.groupby('Country')['GHI'].mean().sort_values(ascending=False)

# Tie each color to a country (same pairing as the boxplots) rather than to a
# rank position: passing `colors` straight to bar() would color the bars in
# sorted order, so a country's color would depend on where it ranks.
country_colors = dict(zip(['Benin', 'Sierra Leone', 'Togo'], colors))
bar_colors = [country_colors[name] for name in country_means.index]

plt.figure(figsize=(10, 6))
bars = plt.bar(country_means.index, country_means.values, color=bar_colors, alpha=0.7, edgecolor='black')
plt.title('Average GHI by Country', fontsize=14, fontweight='bold')
plt.ylabel('Average GHI (W/m²)', fontsize=12)
plt.xlabel('Country', fontsize=12)
plt.grid(True, alpha=0.3, axis='y')

# Add value labels centered just above the top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f}',
             ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

# Same ranking as text, numbered from 1
print('\nCountry Rankings by Average GHI:')
for rank, (country, value) in enumerate(country_means.items(), 1):
    print(f'{rank}. {country}: {value:.2f} W/m²')

In [None]:
# Ranking for all three metrics: one bar chart per metric, highest mean first
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Tie each color to a country (same pairing as the boxplots). Each panel is
# sorted independently, so coloring by bar position would give one country
# different colors in different panels.
country_colors = dict(zip(['Benin', 'Sierra Leone', 'Togo'], colors))

for idx, metric in enumerate(['GHI', 'DNI', 'DHI']):
    country_means = all_data.groupby('Country')[metric].mean().sort_values(ascending=False)
    bar_colors = [country_colors[name] for name in country_means.index]

    bars = axes[idx].bar(country_means.index, country_means.values,
                        color=bar_colors, alpha=0.7, edgecolor='black')
    axes[idx].set_title(f'Average {metric} by Country', fontsize=12, fontweight='bold')
    axes[idx].set_ylabel(f'{metric} (W/m²)', fontsize=10)
    axes[idx].set_xlabel('Country', fontsize=10)
    axes[idx].grid(True, alpha=0.3, axis='y')

    # Add value labels centered just above the top of each bar
    for bar in bars:
        height = bar.get_height()
        axes[idx].text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.1f}',
                      ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()