In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Apply seaborn's white-grid look to every figure drawn below.
sns.set(
    style='whitegrid',
)


In [None]:
# Read the raw CSV; the 'Timestamp' column is parsed into datetimes on load.
df = pd.read_csv(
    filepath_or_buffer='../data/benin.csv',
    parse_dates=['Timestamp'],
)
# Preview the first five rows as the cell output.
df.head(5)


In [None]:
# Print descriptive statistics first, then the per-column count of missing values.
for report in (df.describe(), df.isna().sum()):
    print(report)


In [None]:
# Fraction of missing entries per column, scaled to a percentage.
missing_percent = df.isna().mean().mul(100)
# Keep only the columns whose missing share is strictly above 5 %.
high_missing = missing_percent.loc[missing_percent.gt(5)]
print("Columns with >5% missing values:")
print(high_missing)


In [None]:
# Columns screened for outliers (and imputed in the next cell).
cols_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Column-wise z-scores. scipy's default nan_policy='propagate' turns an entire
# column into NaN as soon as it contains a single missing value, which would
# silently hide every outlier in that column (imputation only happens in a later
# cell). 'omit' computes mean/std over the observed values and leaves NaN only
# at the positions that were missing.
df_z = df[cols_to_check].apply(zscore, nan_policy='omit')

# A row is flagged when any screened column lies more than 3 standard
# deviations from its mean; NaN z-scores compare as False and are not flagged.
outliers = (df_z.abs() > 3).any(axis=1)
print(f"Outliers count: {outliers.sum()}")


In [None]:
# Replace missing values in each screened column with that column's median.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on a temporary
# Series, emits a FutureWarning on pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is enabled.
for col in cols_to_check:
    median = df[col].median()
    df[col] = df[col].fillna(median)


In [None]:
# Line plot of the three irradiance columns against the timestamp.
fig, ax = plt.subplots(figsize=(12, 6))
for series_name in ('GHI', 'DNI', 'DHI'):
    # Each series is labelled with its column name for the legend.
    ax.plot(df['Timestamp'], df[series_name], label=series_name)
ax.legend()
ax.set_title('Solar Irradiance Over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Irradiance (W/m²)')
plt.show()


In [None]:
# Line plot of the three irradiance columns against the timestamp.
# NOTE(review): this cell draws exactly the same figure as the preceding
# irradiance cell — confirm whether other series were intended here, or drop it.
fig, ax = plt.subplots(figsize=(12, 6))
for series_name in ('GHI', 'DNI', 'DHI'):
    # Each series is labelled with its column name for the legend.
    ax.plot(df['Timestamp'], df[series_name], label=series_name)
ax.legend()
ax.set_title('Solar Irradiance Over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Irradiance (W/m²)')
plt.show()


In [None]:
# Mean ModA / ModB reading for each distinct value of the 'Cleaning' column.
module_columns = ['ModA', 'ModB']
cleaning_avg = df.groupby('Cleaning')[module_columns].mean()

# Grouped bar chart: one bar group per 'Cleaning' value, one bar per module column.
ax = cleaning_avg.plot.bar()
ax.set_title('Average Module Irradiance Pre/Post Cleaning')
ax.set_ylabel('Irradiance (W/m²)')
plt.show()


In [None]:
# Pairwise correlation between the irradiance and TModA/TModB columns.
corr_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr = df[corr_columns].corr()

# Annotated heatmap of the correlation matrix on a diverging colour map.
ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Scatter of GHI against wind speed, one point per row of df.
ax = sns.scatterplot(data=df, x='WS', y='GHI')
ax.set_title('Wind Speed vs. GHI')
ax.set_xlabel('Wind Speed (m/s)')
ax.set_ylabel('GHI (W/m²)')
plt.show()


In [None]:
# Write the DataFrame in its current state to disk; the row index is not exported.
df.to_csv(
    path_or_buf='../data/benin_clean.csv',
    index=False,
)