In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import f_oneway, kruskal
import os
from pathlib import Path

%matplotlib inline
plt.rcParams['figure.figsize'] = (14, 8)
sns.set_style('whitegrid')
sns.set_palette('husl')

# Define explicit export locations
EXPORT_DIR = Path('../data')
FIGURES_DIR = Path('../figures')
SUMMARY_DIR = Path('../summary')

# Create directories if they don't exist
EXPORT_DIR.mkdir(exist_ok=True)
FIGURES_DIR.mkdir(exist_ok=True)
SUMMARY_DIR.mkdir(exist_ok=True)

print('Export directories created:')
print(f'  Data exports: {EXPORT_DIR.absolute()}')
print(f'  Figures: {FIGURES_DIR.absolute()}')
print(f'  Summary reports: {SUMMARY_DIR.absolute()}')

ModuleNotFoundError: No module named 'pandas'

# Consolidated Cross-Country Solar Data Analysis Dashboard

## Overview
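This notebook compares cleaned solar radiation measurements (GHI, DNI, DHI) from Benin, Sierra Leone, and Togo. It covers summary statistics, distribution plots, monthly GHI trends, significance tests (one-way ANOVA and Kruskal–Wallis), country rankings, and per-country correlation matrices. Tables and figures are exported to `../data`, `../summary`, and `../figures`.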

## 1. Data Loading

In [None]:
# Load the cleaned per-country datasets, label each row with its country,
# and stack them into one long-format frame used by every later cell.
data_dir = Path('../data')

# Timestamp is parsed to datetime at load time.
benin = pd.read_csv(data_dir / 'benin_clean.csv', parse_dates=['Timestamp'])
sierraleone = pd.read_csv(data_dir / 'sierraleone_clean.csv', parse_dates=['Timestamp'])
togo = pd.read_csv(data_dir / 'togo_clean.csv', parse_dates=['Timestamp'])

# Add country labels
benin['Country'] = 'Benin'
sierraleone['Country'] = 'Sierra Leone'
togo['Country'] = 'Togo'

# Combine all data; ignore_index gives the combined frame a fresh 0..n-1 index.
all_data = pd.concat([benin, sierraleone, togo], ignore_index=True)

print('Data loaded successfully!')
for label, frame in (
    ('Benin', benin),
    ('Sierra Leone', sierraleone),
    ('Togo', togo),
    ('Total combined', all_data),
):
    print(f'{label}: {frame.shape[0]:,} rows, {frame.shape[1]} columns')

# Export combined dataset (written next to the inputs, since EXPORT_DIR is '../data').
combined_export_path = EXPORT_DIR / 'all_countries_combined.csv'
all_data.to_csv(combined_export_path, index=False)
# Check mark restored: the original literal was mis-encoded ('‚úì').
print(f'\n✓ Combined dataset exported to: {combined_export_path.absolute()}')

## 2. Summary Statistics Comparison

In [None]:
# Build a long-format table of descriptive statistics: one row per
# (country, metric) pair, then pivot the headline statistics for display.
metrics = ['GHI', 'DNI', 'DHI']
summary_stats = []

for country in ['Benin', 'Sierra Leone', 'Togo']:
    country_data = all_data[all_data['Country'] == country]
    for metric in metrics:
        # Select the column once instead of re-indexing for every statistic.
        values = country_data[metric]
        summary_stats.append({
            'Country': country,
            'Metric': metric,
            'Mean': values.mean(),
            'Median': values.median(),
            'Std Dev': values.std(),
            'Min': values.min(),
            'Max': values.max(),
            'Q25': values.quantile(0.25),
            'Q75': values.quantile(0.75)
        })

summary_df = pd.DataFrame(summary_stats)
# Wide view for on-screen comparison: countries as rows, (statistic, metric) as columns.
summary_pivot = summary_df.pivot(index='Country', columns='Metric', values=['Mean', 'Median', 'Std Dev'])

print('Summary Statistics by Country:')
display(summary_pivot.round(2))

# Export summary statistics (the full long-format table, including Min/Max/quartiles).
summary_export_path = SUMMARY_DIR / 'summary_statistics.csv'
summary_df.to_csv(summary_export_path, index=False)
# Check mark restored: the original literal was mis-encoded ('‚úì').
print(f'\n✓ Summary statistics exported to: {summary_export_path.absolute()}')

## 3. Comparative Visualizations: Boxplots and Violin Plots

In [None]:
# Side-by-side boxplots for all three metrics, one panel per metric.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
metrics = ['GHI', 'DNI', 'DHI']
# One colour per country, in order of appearance in all_data
# (Benin, Sierra Leone, Togo). Later cells reuse `metrics` and `colors`.
colors = ['#3498db', '#e74c3c', '#2ecc71']

for idx, metric in enumerate(metrics):
    # Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
    # x variable is also assigned to hue. dodge=False keeps each box centred
    # on its category, matching the original layout.
    sns.boxplot(data=all_data, x='Country', y=metric, hue='Country',
                palette=colors, dodge=False, ax=axes[idx])
    # The hue legend would only repeat the x tick labels; drop it if present.
    legend = axes[idx].get_legend()
    if legend is not None:
        legend.remove()
    axes[idx].set_title(f'{metric} Distribution by Country', fontsize=14, fontweight='bold')
    # Unit label restored: the original literal was mis-encoded ('W/m¬≤').
    axes[idx].set_ylabel(f'{metric} (W/m²)', fontsize=12)
    axes[idx].set_xlabel('Country', fontsize=12)
    axes[idx].grid(True, alpha=0.3, axis='y')

plt.tight_layout()

# Export figure (saved before plt.show() so the file is written from the live figure).
fig_path = FIGURES_DIR / 'comparative_boxplots.png'
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
print(f'✓ Figure exported to: {fig_path.absolute()}')
plt.show()

In [None]:
# Violin plots to show distribution shapes, one panel per metric.
# Reuses `metrics` and `colors` defined in the boxplot cell above.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

for idx, metric in enumerate(metrics):
    # Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
    # x variable is also assigned to hue. dodge=False keeps each violin
    # centred on its category, matching the original layout.
    sns.violinplot(data=all_data, x='Country', y=metric, hue='Country',
                   palette=colors, dodge=False, ax=axes[idx])
    # The hue legend would only repeat the x tick labels; drop it if present.
    legend = axes[idx].get_legend()
    if legend is not None:
        legend.remove()
    axes[idx].set_title(f'{metric} Distribution Shape by Country', fontsize=14, fontweight='bold')
    # Unit label restored: the original literal was mis-encoded ('W/m¬≤').
    axes[idx].set_ylabel(f'{metric} (W/m²)', fontsize=12)
    axes[idx].set_xlabel('Country', fontsize=12)
    axes[idx].grid(True, alpha=0.3, axis='y')

plt.tight_layout()

# Export figure (saved before plt.show() so the file is written from the live figure).
fig_path = FIGURES_DIR / 'distribution_shapes_violin.png'
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
print(f'✓ Figure exported to: {fig_path.absolute()}')
plt.show()

## 4. Time Series Comparison

In [None]:
# Monthly average comparison of GHI, one line per country.
# NOTE: this adds a 'Month' column to all_data in place; re-running the cell
# simply overwrites it with the same values.
# NOTE(review): only the calendar month is used, so if the data spans more
# than one year the same months are pooled together — confirm that is intended.
all_data['Month'] = pd.to_datetime(all_data['Timestamp']).dt.month
monthly_avg = all_data.groupby(['Country', 'Month'])['GHI'].mean().reset_index()

fig, ax = plt.subplots(figsize=(16, 6))
for country in ['Benin', 'Sierra Leone', 'Togo']:
    country_data = monthly_avg[monthly_avg['Country'] == country]
    ax.plot(country_data['Month'], country_data['GHI'], marker='o', label=country, linewidth=2, markersize=8)

ax.set_title('Monthly Average GHI Comparison Across Countries', fontsize=14, fontweight='bold')
ax.set_xlabel('Month', fontsize=12)
# Unit label restored: the original literal was mis-encoded ('W/m¬≤').
ax.set_ylabel('Average GHI (W/m²)', fontsize=12)
# Show every month 1..12 on the axis even if some have no data.
ax.set_xticks(range(1, 13))
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()

# Export figure (saved before plt.show() so the file is written from the live figure).
fig_path = FIGURES_DIR / 'monthly_ghi_comparison.png'
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
print(f'✓ Figure exported to: {fig_path.absolute()}')
plt.show()

# Export monthly averages
monthly_export_path = EXPORT_DIR / 'monthly_averages_by_country.csv'
monthly_avg.to_csv(monthly_export_path, index=False)
print(f'✓ Monthly averages exported to: {monthly_export_path.absolute()}')

## 5. Statistical Significance Testing

In [None]:
# Statistical tests for all metrics: for each metric, test whether the three
# countries differ, using both a parametric (one-way ANOVA) and a rank-based
# (Kruskal-Wallis) test.
ALPHA = 0.05  # significance threshold applied to both tests
test_results = []

# Boolean row masks are independent of the metric, so build them once.
is_benin = all_data['Country'] == 'Benin'
is_sierraleone = all_data['Country'] == 'Sierra Leone'
is_togo = all_data['Country'] == 'Togo'

for metric in ['GHI', 'DNI', 'DHI']:
    # Missing values are dropped per country before testing.
    benin_data = all_data.loc[is_benin, metric].dropna()
    sierraleone_data = all_data.loc[is_sierraleone, metric].dropna()
    togo_data = all_data.loc[is_togo, metric].dropna()

    # One-way ANOVA
    f_stat, p_value_anova = f_oneway(benin_data, sierraleone_data, togo_data)

    # Kruskal-Wallis test
    h_stat, p_value_kruskal = kruskal(benin_data, sierraleone_data, togo_data)

    test_results.append({
        'Metric': metric,
        'ANOVA_F_statistic': f_stat,
        'ANOVA_p_value': p_value_anova,
        'ANOVA_significant': 'Yes' if p_value_anova < ALPHA else 'No',
        'Kruskal_H_statistic': h_stat,
        'Kruskal_p_value': p_value_kruskal,
        'Kruskal_significant': 'Yes' if p_value_kruskal < ALPHA else 'No'
    })

test_results_df = pd.DataFrame(test_results)

print('Statistical Test Results:')
# Rounded for display only; the exported CSV keeps full precision.
display(test_results_df.round(6))

# Export test results
test_export_path = SUMMARY_DIR / 'statistical_test_results.csv'
test_results_df.to_csv(test_export_path, index=False)
# Check mark restored: the original literal was mis-encoded ('‚úì').
print(f'\n✓ Statistical test results exported to: {test_export_path.absolute()}')

## 6. Country Rankings by Solar Potential

In [None]:
# Ranking visualization for all metrics: one bar chart per metric with the
# countries sorted from highest to lowest mean, plus a long-format rankings table.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

rankings_data = []

# Fix each country's colour so it is the same in every panel and matches the
# boxplot/violin cells (where `colors` follows the Benin, Sierra Leone, Togo
# order). Previously colours were assigned by rank position, so a country
# could change colour from one panel to the next.
country_colors = dict(zip(['Benin', 'Sierra Leone', 'Togo'], colors))

for idx, metric in enumerate(['GHI', 'DNI', 'DHI']):
    country_means = all_data.groupby('Country')[metric].mean().sort_values(ascending=False)
    # Fall back to grey for any country label not in the mapping.
    bar_colors = [country_colors.get(name, 'gray') for name in country_means.index]

    axes[idx].bar(country_means.index, country_means.values,
                  color=bar_colors, alpha=0.7, edgecolor='black', linewidth=1.5)
    axes[idx].set_title(f'Average {metric} by Country', fontsize=14, fontweight='bold')
    # Unit label restored: the original literal was mis-encoded ('W/m¬≤').
    axes[idx].set_ylabel(f'{metric} (W/m²)', fontsize=12)
    axes[idx].set_xlabel('Country', fontsize=12)
    axes[idx].grid(True, alpha=0.3, axis='y')

    # Add value labels above each bar and record the rank (1 = highest mean).
    for i, (country, value) in enumerate(country_means.items()):
        axes[idx].text(i, value, f'{value:.1f}', ha='center', va='bottom',
                       fontsize=11, fontweight='bold')
        rankings_data.append({
            'Metric': metric,
            'Rank': i + 1,
            'Country': country,
            'Average_Value': value
        })

plt.tight_layout()

# Export figure (saved before plt.show() so the file is written from the live figure).
fig_path = FIGURES_DIR / 'country_rankings.png'
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
print(f'✓ Figure exported to: {fig_path.absolute()}')
plt.show()

# Export rankings
rankings_df = pd.DataFrame(rankings_data)
rankings_export_path = SUMMARY_DIR / 'country_rankings.csv'
rankings_df.to_csv(rankings_export_path, index=False)
print(f'\n✓ Rankings exported to: {rankings_export_path.absolute()}')

## 7. Correlation Analysis Comparison

In [None]:
# Correlation heatmaps for each country, drawn side by side for comparison.
corr_cols = ['GHI', 'DNI', 'DHI', 'Tamb', 'RH', 'WS', 'BP']

fig, axes = plt.subplots(1, 3, figsize=(24, 6))

for idx, country in enumerate(['Benin', 'Sierra Leone', 'Togo']):
    country_data = all_data[all_data['Country'] == country][corr_cols]
    corr_matrix = country_data.corr()

    # Pin the colour scale to the full correlation range [-1, 1]. Without
    # vmin/vmax each panel scales to its own data, so the same colour would
    # mean different correlation values in different countries.
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                vmin=-1, vmax=1, center=0,
                square=True, linewidths=1, cbar_kws={'shrink': 0.8}, ax=axes[idx])
    axes[idx].set_title(f'{country} - Correlation Matrix', fontsize=12, fontweight='bold')

plt.tight_layout()

# Export figure (saved before plt.show() so the file is written from the live figure).
fig_path = FIGURES_DIR / 'correlation_comparison.png'
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
# Check mark restored: the original literal was mis-encoded ('‚úì').
print(f'✓ Figure exported to: {fig_path.absolute()}')
plt.show()

## 8. Key Insights Summary

In [None]:
# Generate key insights: a small table of headline findings derived from the
# combined data and from test_results_df (built in the statistical-testing cell).
# The unit strings below were mis-encoded in the original ('W/m¬≤') and are
# restored to 'W/m²' so the exported CSV is readable.
insights = []

# Find country with highest GHI
ghi_ranking = all_data.groupby('Country')['GHI'].mean().sort_values(ascending=False)
top_country_ghi = ghi_ranking.index[0]
top_ghi_value = ghi_ranking.iloc[0]
ghi_std = all_data[all_data['Country'] == top_country_ghi]['GHI'].std()

insights.append({
    'Insight': f'{top_country_ghi} shows highest average GHI ({top_ghi_value:.2f} W/m²)',
    'Variability': f'Std Dev: {ghi_std:.2f} W/m²',
    'Implication': 'Highest solar potential but may require robust system design'
})

# Find most consistent country (smallest GHI standard deviation)
consistency = all_data.groupby('Country')['GHI'].std().sort_values()
most_consistent = consistency.index[0]
consistency_value = consistency.iloc[0]

insights.append({
    'Insight': f'{most_consistent} demonstrates most consistent solar irradiance (Std Dev: {consistency_value:.2f} W/m²)',
    'Variability': 'Low variability',
    'Implication': 'Predictable generation, lower risk, steady returns'
})

# Statistical significance: based on the ANOVA flag only; the Kruskal-Wallis
# result is not consulted here.
significant_tests = test_results_df[test_results_df['ANOVA_significant'] == 'Yes']
if len(significant_tests) > 0:
    metrics_sig = ', '.join(significant_tests['Metric'].tolist())
    insights.append({
        'Insight': f'Countries show statistically significant differences in {metrics_sig}',
        'Variability': 'p < 0.05',
        'Implication': 'Measurably different solar potential between countries'
    })
else:
    insights.append({
        'Insight': 'Countries are statistically similar in solar availability',
        'Variability': 'p >= 0.05',
        'Implication': 'Similar solar potential across all countries'
    })

insights_df = pd.DataFrame(insights)

print('Key Insights:')
display(insights_df)

# Export insights
insights_export_path = SUMMARY_DIR / 'key_insights.csv'
insights_df.to_csv(insights_export_path, index=False)
print(f'\n✓ Key insights exported to: {insights_export_path.absolute()}')

## 9. Export Summary

In [None]:
# List all exported files and verify that each one is actually on disk.
# The original printed a success message unconditionally; now any file that is
# missing (e.g. because an earlier cell failed or was skipped) is reported.
# Bullet and emoji characters were mis-encoded in the original and are restored.
expected_exports = [
    ('\n📊 Data Exports:', [
        ('Combined dataset', EXPORT_DIR / 'all_countries_combined.csv'),
        ('Monthly averages', EXPORT_DIR / 'monthly_averages_by_country.csv'),
    ]),
    ('\n📈 Summary Reports:', [
        ('Summary statistics', SUMMARY_DIR / 'summary_statistics.csv'),
        ('Statistical tests', SUMMARY_DIR / 'statistical_test_results.csv'),
        ('Country rankings', SUMMARY_DIR / 'country_rankings.csv'),
        ('Key insights', SUMMARY_DIR / 'key_insights.csv'),
    ]),
    ('\n🖼️  Figures:', [
        ('Comparative boxplots', FIGURES_DIR / 'comparative_boxplots.png'),
        ('Distribution shapes', FIGURES_DIR / 'distribution_shapes_violin.png'),
        ('Monthly GHI comparison', FIGURES_DIR / 'monthly_ghi_comparison.png'),
        ('Country rankings', FIGURES_DIR / 'country_rankings.png'),
        ('Correlation comparison', FIGURES_DIR / 'correlation_comparison.png'),
    ]),
]

print('='*60)
print('EXPORT SUMMARY')
print('='*60)

missing_exports = []
for heading, entries in expected_exports:
    print(heading)
    for label, export_path in entries:
        print(f'  • {label}: {export_path}')
        if not export_path.exists():
            missing_exports.append(export_path)

print('\n' + '='*60)
if missing_exports:
    print(f'WARNING: {len(missing_exports)} expected export(s) not found:')
    for export_path in missing_exports:
        print(f'  • {export_path}')
else:
    print('All exports completed successfully!')
print('='*60)