In [25]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore


# Load the raw measurements CSV (decoded as latin-1) into the working frame `df`.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the repository so the notebook runs on other machines.
raw_csv_path = r'C:\Users\hp\Music\data\sierraleone-bumbuna.csv'
df = pd.read_csv(raw_csv_path, encoding='latin-1')


In [26]:
# Convert the Timestamp column to pandas datetimes so later cells can
# index and resample by time.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps

In [27]:
# Summary statistics, one row per column. Only the last expression of a cell is
# rendered automatically, so the table is shown explicitly with display().
display(df.describe().T)

# Share of missing values per column, keeping only columns above 5 % missing.
missing_share = df.isna().mean()
missing_share[missing_share > 0.05]

Comments    1.0
dtype: float64

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

# 1. TIMESTAMP PARSING
# Re-parsing is idempotent; it keeps this cell runnable on its own even if the
# earlier conversion cell was skipped.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print("Sierra Leone Data Loaded:", df.shape)

# 2. SUMMARY STATS & MISSING VALUES
print("\nSUMMARY STATISTICS")
display(df.describe().T)

print("\nMISSING VALUES (>5%)")
missing = df.isna().mean()
print(missing[missing > 0.05])

# 3. OUTLIER DETECTION (Z-SCORE)
cols = ['GHI','DNI','DHI','ModA','ModB','WS','WSgust']
df_z = df[cols].copy()
# nan_policy='omit' computes each column's mean/std from its non-missing values.
# With the default ('propagate') a single NaN turns the whole column's z-scores
# into NaN, which would silently disable outlier detection for that column.
# Missing entries still yield NaN z-scores, and `NaN > 3` is False, so a missing
# value is never flagged as an outlier by itself.
z_input = df_z.select_dtypes(include=np.number).to_numpy(dtype=float)
z_scores = np.abs(stats.zscore(z_input, nan_policy='omit'))
# A row is an outlier when any monitored column lies more than 3 std from its mean.
df['outlier'] = (z_scores > 3).any(axis=1)

print(f"\nOutliers detected: {df['outlier'].sum()} rows")

# 4. CLEANING
# Drop flagged rows first, then impute remaining gaps with the per-column median
# of the retained rows (so the medians are not influenced by the outliers).
df_clean = df[~df['outlier']].copy()
df_clean[cols] = df_clean[cols].fillna(df_clean[cols].median())

# 5. EXPORT CLEANED DATA
df_clean.to_csv('../data/sierraleone_clean.csv', index=False)
print("sierraleone_clean.csv SAVED")

Benin Data Loaded: (525600, 19)

SUMMARY STATISTICS


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Timestamp,525600.0,2022-04-30 12:00:30.000000768,2021-10-30 00:01:00,2022-01-29 06:00:45,2022-04-30 12:00:30,2022-07-30 18:00:15,2022-10-30 00:00:00,
GHI,525600.0,201.957515,-19.5,-2.8,0.3,362.4,1499.0,298.49515
DNI,525600.0,116.376337,-7.8,-0.3,-0.1,107.0,946.0,218.652659
DHI,525600.0,113.720571,-17.9,-3.8,-0.1,224.7,892.0,158.946032
ModA,525600.0,206.643095,0.0,0.0,3.6,359.5,1507.0,300.896893
ModB,525600.0,198.114691,0.0,0.0,3.4,345.4,1473.0,288.889073
Tamb,525600.0,26.319394,12.3,23.1,25.3,29.4,39.9,4.398605
RH,525600.0,79.448857,9.9,68.7,85.4,96.7,100.0,20.520775
WS,525600.0,1.146113,0.0,0.0,0.8,2.0,19.2,1.239248
WSgust,525600.0,1.691606,0.0,0.0,1.6,2.6,23.9,1.617053



MISSING VALUES (>5%)
Comments    1.0
dtype: float64

Outliers detected: 16292 rows
benin_clean.csv SAVED


In [29]:
#TIME SERIES
# Daily means of the three irradiance components.
daily_irradiance = (
    df_clean.set_index('Timestamp')[['GHI', 'DNI', 'DHI']]
    .resample('D')
    .mean()
)

# Draw on an explicit Axes: without `ax=`, pandas opens its own figure and the
# one created beforehand stays empty ("Figure ... with 0 Axes").
fig, ax = plt.subplots(figsize=(14, 5))
daily_irradiance.plot(ax=ax)
ax.set_title('Daily Solar Irradiance – sierraleone')
ax.set_ylabel('W/m²')

# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_daily_irradiance.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

<Figure size 1400x500 with 0 Axes>

In [30]:
#CLEANING IMPACT
# Mean ModA/ModB reading for each value of the 'Cleaning' flag.
clean_impact = df_clean.groupby('Cleaning')[['ModA','ModB']].mean()

fig, ax = plt.subplots()
clean_impact.plot(kind='bar', title='ModA/ModB: Clean vs Dirty', ax=ax)
ax.set_ylabel('Average Reading')

# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_cleaning_impact.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

In [31]:
#CORRELATION HEATMAP
# Pairwise correlation of irradiance, module temperature, ambient temperature
# and humidity columns on the cleaned data.
corr_columns = ['GHI','DNI','DHI','TModA','TModB','Tamb','RH']

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_clean[corr_columns].corr(),
            annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Solar Metrics Correlation – sierraleone')

# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_correlation_heatmap.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

In [32]:
#SCATTER: Wind vs GHI
# One point per cleaned row, coloured by the 'Cleaning' flag.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_clean, x='WS', y='GHI', hue='Cleaning', alpha=0.6, ax=ax)
ax.set_title('Wind Speed vs GHI (color = cleaning event)')

# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_wind_vs_ghi.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

In [33]:
#WIND ROSE (SIMPLE)
# Polar scatter: angle = wind direction, radius and colour = wind speed.
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})

# NOTE(review): assumes 'WD' is recorded in degrees — confirm against the data dictionary.
directions = np.radians(df_clean['WD'])
speeds = df_clean['WS']
ax.scatter(directions, speeds, c=speeds, cmap='viridis', alpha=0.7)

# Compass layout: zero angle at the top (north), angles increasing clockwise.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
ax.set_title('Wind Rose – sierraleone')

# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_wind_rose.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

In [34]:
#HISTOGRAMS
# GHI and WS live on very different scales (GHI reaches ~1500, WS stays below ~20
# in the summary table), so each gets its own panel instead of sharing one x-axis.
# The figure size is passed explicitly rather than via sns.set(), which would
# also reset the seaborn theme chosen earlier in the notebook.
fig, (ax_ghi, ax_ws) = plt.subplots(1, 2, figsize=(12, 4))

df_clean['GHI'].hist(bins=50, alpha=0.7, label='GHI', ax=ax_ghi)
ax_ghi.set_xlabel('GHI')
ax_ghi.legend()

df_clean['WS'].hist(bins=50, alpha=0.7, label='WS', ax=ax_ws)
ax_ws.set_xlabel('WS')
ax_ws.legend()

fig.suptitle('Distribution: GHI & Wind Speed')
fig.tight_layout()

# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_histograms.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

In [35]:
#BUBBLE CHART: GHI vs Temp (bubble = Humidity)
# Relative humidity drives both the marker size and the marker colour.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=df_clean, x='Tamb', y='GHI', size='RH',
                sizes=(20, 200), alpha=0.6, hue='RH', palette='Blues', ax=ax)
ax.set_title('GHI vs Temperature – Bubble = Relative Humidity')

# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_ghi_temp_bubble.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

In [36]:
# Side-by-side box plots of ambient temperature and relative humidity.
# These use `df` (the frame before outlier rows are dropped), as in the original cell.
fig, (ax_temp, ax_rh) = plt.subplots(1, 2, figsize=(12, 6))  # 1 row, 2 columns

# Box plot for Temperature (Tamb)
df.boxplot(column='Tamb', ax=ax_temp)
ax_temp.set_title('Temperature (Tamb) Box Plot')

# Box plot for Humidity (RH)
df.boxplot(column='RH', ax=ax_rh)
ax_rh.set_title('Humidity (RH) Box Plot')

fig.tight_layout()
# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_boxplots.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

In [37]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Irradiance components over the full record, plotted from `df` (not `df_clean`).
# The figure size is passed explicitly rather than via sns.set(), which would
# also reset the seaborn theme for the rest of the session.
fig, ax = plt.subplots(figsize=(12, 4))
df.set_index('Timestamp')[['GHI','DNI','DHI']].plot(ax=ax)
ax.set_title('Sierra Leone Irradiance')  # the data loaded in this notebook is Sierra Leone, not Benin
fig.savefig('../data/sierraleone_graph.png', format='png', dpi=200)
# Show before closing; closing first leaves nothing to render.
plt.show()
plt.close(fig)

# Correlation of irradiance, ambient temperature and humidity on the same frame.
fig, ax = plt.subplots(figsize=(12, 4))
sns.heatmap(df[['GHI','DNI','DHI','Tamb','RH']].corr(), annot=True, cmap='coolwarm', ax=ax)
# Dedicated file name so other plots in this notebook do not overwrite it.
fig.savefig('../notebooks/data/sierraleone_raw_correlation.png', format='png', dpi=200)
plt.show()
plt.close(fig)