# Solar Challenge Week 1 - EDA and Data Cleaning
This notebook demonstrates:
- Loading the dataset
- Basic profiling and statistics
- Reporting missing values and removing Z-score outliers (GHI, DNI, DHI)
- Computing daily mean features, exporting them to CSV, and plotting them

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from scipy import stats

# Add module path for solar_features.py
sys.path.append('../src/analysis')

# Import modular functions
from solar_features import compute_daily_mean, compute_daily_max, merge_daily_features

# Load the pre-cleaned Sierra Leone CSV (path is relative to the notebook's
# working directory).
df = pd.read_csv('../data/sierraleone-cleaned_data.csv')

# Basic profiling
print("Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing values per column:")
print(df.isna().sum())

# --- Outlier detection using Z-scores ---
z_thresh = 3  # Typically 3 standard deviations
numeric_cols = ['GHI', 'DNI', 'DHI']  # Columns to check

# Compute per-column Z-scores. nan_policy='omit' keeps a single missing reading
# from turning the whole column's Z-scores into NaN (the default 'propagate'
# does exactly that, which would make the filter below drop every row).
df_z = df[numeric_cols]
z_scores = stats.zscore(df_z.to_numpy(dtype=float), axis=0, nan_policy='omit')
abs_z_scores = abs(z_scores)

# Keep a row unless at least one checked column is a genuine outlier.
# Comparisons against NaN are False, so a missing value (or a zero-variance
# column, whose Z-scores are NaN) is never counted as an outlier; for
# NaN-free data this is equivalent to requiring |z| < z_thresh in all columns.
filter_mask = ~(abs_z_scores >= z_thresh).any(axis=1)

# Take an independent copy so later processing works on its own frame rather
# than on a slice of df.
df_clean = df[filter_mask].copy()

print(f"\nOriginal rows: {len(df)}, After removing outliers: {len(df_clean)}")

# Daily mean of each irradiance column, computed from the outlier-filtered
# frame by the project helper compute_daily_mean.
df_ghi_daily, df_dni_daily, df_dhi_daily = (
    compute_daily_mean(df_clean, irradiance_col)
    for irradiance_col in ('GHI', 'DNI', 'DHI')
)

# Combine the three per-column results into one daily dataframe.
df_daily = merge_daily_features(
    df_ghi_daily,
    df_dni_daily,
    df_dhi_daily,
)

# Persist the daily features (without the index column) and confirm on stdout.
df_daily.to_csv(
    '../data/sierraleone_daily_features.csv',
    index=False,
)
print("Daily features saved to '../data/sierraleone_daily_features.csv'")

# --- Plots ---

# Time series: one line per daily-mean column, all on the same axes.
daily_mean_series = {
    'GHI': 'GHI_daily_mean',
    'DNI': 'DNI_daily_mean',
    'DHI': 'DHI_daily_mean',
}
fig_ts, ax_ts = plt.subplots(figsize=(12, 6))
for series_label, series_col in daily_mean_series.items():
    ax_ts.plot(df_daily['Date'], df_daily[series_col], label=series_label)
ax_ts.set_xlabel('Date')
ax_ts.set_ylabel('Daily Mean')
ax_ts.set_title('Sierra Leone Daily Solar Features')
ax_ts.legend()
plt.show()

# Pairwise correlation between the three daily-mean columns, drawn as an
# annotated heatmap on its own figure.
heatmap_cols = ['GHI_daily_mean', 'DNI_daily_mean', 'DHI_daily_mean']
corr = df_daily[heatmap_cols].corr()
fig_corr, ax_corr = plt.subplots(figsize=(6, 5))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title("Correlation of Daily Solar Features - Sierra Leone")
plt.show()

# Scatter of daily-mean GHI (x) against daily-mean DNI (y).
fig_scatter, ax_scatter = plt.subplots(figsize=(12, 5))
ax_scatter.scatter(
    df_daily['GHI_daily_mean'],
    df_daily['DNI_daily_mean'],
)
ax_scatter.set_xlabel('GHI Daily Mean')
ax_scatter.set_ylabel('DNI Daily Mean')
ax_scatter.set_title('Scatter Plot: GHI vs DNI - Sierra Leone')
plt.show()

# One histogram per daily-mean column; DataFrame.hist creates the figure, so
# the suptitle below lands on that same (current) figure.
hist_cols = ['GHI_daily_mean', 'DNI_daily_mean', 'DHI_daily_mean']
df_daily.hist(column=hist_cols, figsize=(12, 6))
plt.suptitle("Distribution of Daily Solar Features - Sierra Leone")
plt.show()

