import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore


# Load the Benin (Malanville) measurements from the project's data folder.
df = pd.read_csv('../data/benin-malanville.csv')

# Basic summary. The heading names the dataset that is actually loaded above;
# it previously read "sierraleone-bumbuna", a leftover from another notebook.
print("Summary Statistics benin-malanville :")
print(df.describe())

# Share of missing values per column, expressed in percent
# (the mean of the boolean NaN mask equals NaN count / row count).
print("\nMissing Values (%):")
print(df.isna().mean() * 100)

# Outlier detection using Z-scores.
# Every Z-score is computed on the full, unfiltered frame and rows are dropped
# in one pass afterwards. Filtering inside the loop made each later column's
# mean/std depend on rows already removed for earlier columns, so the outcome
# depended on column order and the stored z_* values no longer described the
# rows that were kept. Assigning a new column to a filtered slice can also
# trigger pandas' SettingWithCopyWarning.
outlier_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
z_cols = ['z_' + col for col in outlier_cols]
for col, z_col in zip(outlier_cols, z_cols):
    # Missing readings are replaced by the column median for scoring only;
    # the original column keeps its NaN values.
    df[z_col] = zscore(df[col].fillna(df[col].median()))

# Keep only rows where every monitored column has |z| < 3.
# .copy() detaches the result from the unfiltered frame.
df = df[(df[z_cols].abs() < 3).all(axis=1)].copy()

# Export cleaned data (ignored in .gitignore).
# The helper z_* columns are written too, as in the previous version.
df.to_csv('../data/malanville_clean.csv', index=False)

# --- VISUALIZATIONS ---

# 1. GHI, DNI, DHI over time
# The CSV is read without date parsing, so 'Timestamp' is presumably a column
# of strings; matplotlib would treat each distinct string as its own category
# (one tick per row) instead of drawing a time axis. Convert to datetimes for
# plotting only, leaving the column in df unchanged.
timestamps = pd.to_datetime(df['Timestamp'])

plt.figure(figsize=(12, 5))
for component in ('GHI', 'DNI', 'DHI'):
    plt.plot(timestamps, df[component], label=component)
plt.title('Solar Irradiance Over Time - Malanville')
plt.xlabel('Timestamp')
plt.ylabel('Irradiance (W/m²)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. RH vs Tamb
# Scatter of relative humidity against ambient temperature, drawn on an
# explicitly created axes object.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='RH', y='Tamb', ax=ax)
ax.set_title('Relative Humidity vs. Ambient Temperature - Malanville')
ax.set_xlabel('Relative Humidity (%)')
ax.set_ylabel('Ambient Temperature (°C)')
fig.tight_layout()
plt.show()

# 3. Cleaning Effect
# Mean ModA / ModB reading for each value of the 'Cleaning' column.
# The axes are created first and handed to pandas via ax=: without it,
# DataFrame.plot opens its own figure, leaving the (6, 4) figure empty and
# drawing the bars at the default size.
fig, ax = plt.subplots(figsize=(6, 4))
cleaning_means = df.groupby('Cleaning')[['ModA', 'ModB']].mean()
cleaning_means.plot(kind='bar', legend=True, rot=0, ax=ax)
ax.set_title('Effect of Cleaning on Module Output - Malanville')
ax.set_ylabel('Irradiance (W/m²)')
fig.tight_layout()
plt.show()

# 4. Correlation Heatmap
# Pairwise correlations between the irradiance, module-temperature, ambient
# temperature and humidity columns, annotated to two decimals.
heatmap_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'Tamb', 'RH']
corr = df[heatmap_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap - Malanville')
fig.tight_layout()
plt.show()

# 5. Bubble Chart
# GHI against Tamb; marker area is twice the RH value and marker colour
# follows the BP column, with a colorbar as the colour key.
fig, ax = plt.subplots(figsize=(10, 6))
bubble_sizes = df['RH'] * 2
scatter = ax.scatter(
    df['GHI'],
    df['Tamb'],
    s=bubble_sizes,
    c=df['BP'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(scatter, ax=ax, label='Barometric Pressure (hPa)')
ax.set_xlabel('GHI (W/m²)')
ax.set_ylabel('Tamb (°C)')
ax.set_title('GHI vs Tamb - Bubble: RH, Color: BP - Malanville')
fig.tight_layout()
plt.show()
