In [85]:
import pandas as pd


df = pd.read_csv(r'C:\Users\hp\Music\data\benin-malanville.csv', encoding='latin-1')
print(df.head())



          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-08-09 00:01 -1.2 -0.2 -1.1   0.0   0.0  26.2  93.4  0.0     0.4   
1  2021-08-09 00:02 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.6  0.0     0.0   
2  2021-08-09 00:03 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.7  0.3     1.1   
3  2021-08-09 00:04 -1.1 -0.1 -1.0   0.0   0.0  26.2  93.3  0.2     0.7   
4  2021-08-09 00:05 -1.0 -0.1 -1.0   0.0   0.0  26.2  93.3  0.1     0.7   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.1  122.1      0.0  998         0            0.0   26.3   26.2   
1      0.0    0.0      0.0  998         0            0.0   26.3   26.2   
2      0.5  124.6      1.5  997         0            0.0   26.4   26.2   
3      0.4  120.3      1.3  997         0            0.0   26.4   26.3   
4      0.3  113.2      1.0  997         0            0.0   26.4   26.3   

   Comments  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       NaN  


In [86]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [87]:

df.describe().T
df.isna().mean()[df.isna().mean()>0.05]

Comments    1.0
dtype: float64

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")



df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print("Benin Data Loaded:", df.shape)

# 2. SUMMARY STATS & MISSING VALUES
print("\nSUMMARY STATISTICS")
display(df.describe().T)

print("\nMISSING VALUES (>5%)")
missing = df.isna().mean()
print(missing[missing > 0.05])

# 3. OUTLIER DETECTION (Z-SCORE)
cols = ['GHI','DNI','DHI','ModA','ModB','WS','WSgust']
df_z = df[cols].copy()
z_scores = np.abs(stats.zscore(df_z.select_dtypes(include=np.number)))
df['outlier'] = (z_scores > 3).any(axis=1)

print(f"\nOutliers detected: {df['outlier'].sum()} rows")

# 4. CLEANING
df_clean = df[~df['outlier']].copy()
df_clean[cols] = df_clean[cols].fillna(df_clean[cols].median())



# 5. EXPORT CLEANED DATA
df_clean.to_csv('../data/benin_clean.csv', index=False)
print("benin_clean.csv SAVED")

Benin Data Loaded: (525600, 19)

SUMMARY STATISTICS


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Timestamp,525600.0,2022-02-07 12:00:30.000000512,2021-08-09 00:01:00,2021-11-08 06:00:45,2022-02-07 12:00:30,2022-05-09 18:00:15,2022-08-09 00:00:00,
GHI,525600.0,240.559452,-12.9,-2.0,1.8,483.4,1413.0,331.131327
DNI,525600.0,167.187516,-7.8,-0.5,-0.1,314.2,952.3,261.710501
DHI,525600.0,115.358961,-12.6,-2.1,1.6,216.3,759.2,158.691074
ModA,525600.0,236.589496,0.0,0.0,4.5,463.7,1342.3,326.894859
ModB,525600.0,228.883576,0.0,0.0,4.3,447.9,1342.3,316.536515
Tamb,525600.0,28.179683,11.0,24.2,28.0,32.3,43.8,5.924297
RH,525600.0,54.487969,2.1,28.8,55.1,80.1,100.0,28.073069
WS,525600.0,2.121113,0.0,1.0,1.9,3.1,19.5,1.603466
WSgust,525600.0,2.809195,0.0,1.3,2.6,4.1,26.6,2.02912



MISSING VALUES (>5%)
Comments    1.0
dtype: float64

Outliers detected: 7740 rows
benin_clean.csv SAVED


In [89]:
#TIME SERIES
plt.figure(figsize=(14,5))
df_clean.set_index('Timestamp')[['GHI','DNI','DHI']].resample('D').mean().plot()
plt.title('Daily Solar Irradiance – Benin')
plt.ylabel('W/m²')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()


plt.show()


<Figure size 1400x500 with 0 Axes>

In [90]:
#CLEANING IMPACT
clean_impact = df_clean.groupby('Cleaning')[['ModA','ModB']].mean()
clean_impact.plot(kind='bar', title='ModA/ModB: Clean vs Dirty')
plt.ylabel('Average Reading')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()

plt.show()

In [91]:
#CORRELATION HEATMAP
plt.figure(figsize=(10,8))
sns.heatmap(df_clean[['GHI','DNI','DHI','TModA','TModB','Tamb','RH']].corr(),
            annot=True, cmap='coolwarm', center=0)
plt.title('Solar Metrics Correlation – Benin')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()

In [92]:
#SCATTER: Wind vs GHI
plt.figure(figsize=(8,6))
sns.scatterplot(data=df_clean, x='WS', y='GHI', hue='Cleaning', alpha=0.6)
plt.title('Wind Speed vs GHI (color = cleaning event)')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()

In [93]:
#WIND ROSE (SIMPLE)
plt.figure(figsize=(8,8))
ax = plt.subplot(111, polar=True)
directions = np.radians(df_clean['WD'])
speeds = df_clean['WS']
ax.scatter(directions, speeds, c=speeds, cmap='viridis', alpha=0.7)
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
plt.title('Wind Rose – Benin')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()

In [94]:
#HISTOGRAMS
sns.set(rc={'figure.figsize':(12,4)})
df_clean['GHI'].hist(bins=50, alpha=0.7, label='GHI')
df_clean['WS'].hist(bins=50, alpha=0.7, label='WS')
plt.legend()
plt.title('Distribution: GHI & Wind Speed')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()

In [95]:
#BUBBLE CHART: GHI vs Temp (bubble = Humidity)
plt.figure(figsize=(10,7))
sns.scatterplot(data=df_clean, x='Tamb', y='GHI', size='RH',
                sizes=(20, 200), alpha=0.6, hue='RH', palette='Blues')
plt.title('GHI vs Temperature – Bubble = Relative Humidity')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()

In [96]:
# Create a figure with subplots for multiple box plots
plt.figure(figsize=(12, 6))

# Box plot for Temperature (Tamb)
plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot
df.boxplot(column='Tamb')
plt.title('Temperature (Tamb) Box Plot')

# Box plot for Humidity (RH)
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
df.boxplot(column='RH')
plt.title('Humidity (RH) Box Plot')

# Show the plots
plt.tight_layout()
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()

In [97]:

sns.set(rc={'figure.figsize':(12,4)})
df.set_index('Timestamp')[['GHI','DNI','DHI']].plot(); plt.title('Benin Irradiance')
plt.savefig('../data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()
sns.heatmap(df[['GHI','DNI','DHI','Tamb','RH']].corr(), annot=True, cmap='coolwarm')
plt.savefig('../notebooks/data/benin_graph.png', format='png', dpi=200)
plt.close()
plt.show()


