In [6]:
# 0. Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # For statistical tests

# Notebook-wide presentation settings.
# Plots: use seaborn's white-grid theme.
sns.set_style('whitegrid')
# Tables: render floats with three decimals and never truncate the column list.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', None)

# Relative locations of the per-country cleaned CSV files
BENIN_CLEANED_FILE = '../data/benin_clean.csv'
SIERRALEONE_CLEANED_FILE = '../data/sierraleone_clean.csv'
TOGO_CLEANED_FILE = '../data/togo_clean.csv'

# --- These files are expected to already exist (produced in Task 2) ---
# --- Make sure the 'data/' directory stays listed in .gitignore ---

# 1. Load Cleaned Data
print("--- Loading Cleaned Data ---")
file_paths = {
    'Benin': BENIN_CLEANED_FILE,
    'SierraLeone': SIERRALEONE_CLEANED_FILE,
    'Togo': TOGO_CLEANED_FILE,
}
dataframes = {}

for country, file_path in file_paths.items():
    # Only the read itself can raise FileNotFoundError, so guard just that call.
    try:
        df = pd.read_csv(file_path, index_col='Timestamp')
    except FileNotFoundError:
        print(f"Error: Cleaned data file for {country} not found at {file_path}. Please run Task 2 for this country first.")
        # Store an empty placeholder so later steps can detect the failed load.
        dataframes[country] = pd.DataFrame()
        continue

    # Success path: convert the Timestamp index to datetimes and tag each row
    # with its country before registering the frame.
    df.index = pd.to_datetime(df.index)
    df['Country'] = country
    dataframes[country] = df
    print(f"Successfully loaded data for {country}. Shape: {df.shape}")

# 2. Combine Data
# A single empty frame (i.e. a failed load) is enough to skip the combination.
if any(frame.empty for frame in dataframes.values()):
    print("\nSkipping comparisons as some country data failed to load.")
    # Downstream sections check df_combined.empty, so always define it.
    df_combined = pd.DataFrame()
else:
    print("\n--- Combining DataFrames ---")
    # ignore_index=False preserves each frame's Timestamp index in the result.
    df_combined = pd.concat(dataframes.values(), ignore_index=False)
    print("Combined Data Shape:", df_combined.shape)
    print("Combined Data Head:")
    print(df_combined.head())
    print("\nCountries in combined data:", df_combined['Country'].unique())

# 3. Metric Comparison
# For each irradiance metric, draw a per-country boxplot and then print a
# per-country summary table (mean, median, standard deviation).
if not df_combined.empty:
    print("\n--- Metric Comparison ---")
    metrics_to_compare = ['GHI', 'DNI', 'DHI']
    # Metrics that are actually present in the combined data. The summary table
    # must use this list: selecting a missing column from a groupby raises
    # KeyError, whereas the boxplot loop below only warns about it.
    available_metrics = [m for m in metrics_to_compare if m in df_combined.columns]

    # Boxplots
    print("Generating Boxplots...")
    for metric in metrics_to_compare:
        if metric in df_combined.columns:
            plt.figure(figsize=(10, 6))
            sns.boxplot(data=df_combined, x='Country', y=metric, palette='viridis')
            plt.title(f'{metric} Distribution Across Countries')
            plt.ylabel(f'{metric} (W/m²)')
            plt.xlabel('Country')
            plt.show()
        else:
            print(f"Warning: Metric '{metric}' not found in combined data.")

    # Summary Table (Mean, Median, Std Dev)
    print("\n--- Summary Table (Mean, Median, Std Dev) ---")
    # Keep only the relevant columns that exist (metrics + Country), then group
    summary_cols = metrics_to_compare + ['Country']
    summary_df = df_combined[[col for col in summary_cols if col in df_combined.columns]]
    if not summary_df.empty and 'Country' in summary_df.columns and available_metrics:
        # Aggregate only the metrics that exist so one missing column does not
        # abort the whole table.
        summary_stats = summary_df.groupby('Country')[available_metrics].agg(['mean', 'median', 'std'])
        print(summary_stats)
    else:
        print("Could not generate summary table due to missing data or columns.")


# 4. Statistical Testing (GHI)
# Compare the GHI samples of the countries with a Kruskal-Wallis H-test.
if not df_combined.empty and 'GHI' in df_combined.columns:
    print("\n--- Statistical Testing (GHI) ---")
    # One NaN-free GHI series per country, in order of first appearance
    ghi_by_country = [
        df_combined.loc[df_combined['Country'] == country, 'GHI'].dropna()
        for country in df_combined['Country'].unique()
    ]

    # The test needs at least two groups, each with more than five observations
    if len(ghi_by_country) < 2 or any(len(arr) <= 5 for arr in ghi_by_country):
        print("Not enough valid GHI data across countries to perform statistical test.")
    else:
        try:
            # Kruskal-Wallis is non-parametric, so it does not require normality.
            # H0: the median GHI is the same across all countries.
            # H1: the median GHI differs for at least one country.
            stat, p_value = stats.kruskal(*ghi_by_country)

            print("Kruskal-Wallis Test on GHI values across countries:")
            print(f"Statistic: {stat:.3f}")
            print(f"P-value: {p_value:.5f}")

            # Interpret the outcome at the 5% significance level
            alpha = 0.05
            if p_value < alpha:
                print(f"Result: The p-value ({p_value:.5f}) is less than the significance level ({alpha}). We reject the null hypothesis.")
                print("Conclusion: There are statistically significant differences in the median GHI values among the countries.")
            else:
                print(f"Result: The p-value ({p_value:.5f}) is greater than the significance level ({alpha}). We fail to reject the null hypothesis.")
                print("Conclusion: There is no statistically significant evidence to suggest differences in the median GHI values among the countries.")

        except ValueError as e:
            print(f"Could not perform Kruskal-Wallis test: {e}. This might happen if one country has no data.")
        except Exception as e:
            # Last-resort handler: report the problem instead of aborting the cell
            print(f"An unexpected error occurred during statistical testing: {e}")


# 5. Key Observations (Markdown cell)
# Add a new Markdown cell below this code cell in your Jupyter notebook.
# Use bullet points to summarize the most important findings from the plots,
# summary table, and statistical test.
# Examples:
# - Country X consistently shows the highest median GHI, indicating potentially better solar resource.
# - Country Y exhibits the largest standard deviation in GHI, suggesting more variable weather or irradiance patterns.
# - The Kruskal-Wallis test (p=...) indicates statistically significant differences in GHI distribution between the countries.


# 6. (Bonus) Visual Summary - Ranking by Average GHI
# Bar chart of the mean GHI per country, highest first.
if not df_combined.empty and 'GHI' in df_combined.columns:
    print("\n--- Visual Summary: Average GHI Ranking ---")
    # Mean GHI per country, sorted in descending order
    avg_ghi_by_country = (
        df_combined.groupby('Country')['GHI']
        .mean()
        .sort_values(ascending=False)
    )
    # One viridis colour per bar
    bar_colors = sns.color_palette('viridis', len(avg_ghi_by_country))

    plt.figure(figsize=(8, 5))
    avg_ghi_by_country.plot(kind='bar', color=bar_colors)
    plt.title('Average GHI Ranking by Country')
    plt.xlabel('Country')
    plt.ylabel('Average GHI (W/m²)')
    plt.xticks(rotation=0)  # horizontal country labels
    plt.tight_layout()
    plt.show()

--- Loading Cleaned Data ---
Error: Cleaned data file for Benin not found at ../data/benin_clean.csv. Please run Task 2 for this country first.
Error: Cleaned data file for SierraLeone not found at ../data/sierraleone_clean.csv. Please run Task 2 for this country first.
Error: Cleaned data file for Togo not found at ../data/togo_clean.csv. Please run Task 2 for this country first.

Skipping comparisons as some country data failed to load.
