In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
import pandas as pd

# Relative path to the raw CSV that the rest of the notebook analyses.
url = "../data/benin-malanville.csv"

# Read the file once, report its (rows, columns) shape, then preview it.
df = pd.read_csv(url)
raw_shape = df.shape
print(raw_shape)
df.head()

In [None]:
# The bare last expression renders the preview as the cell's rich output.
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would append a stray "None".
print("Dataset Information:")
df.info()

In [None]:
# Default describe() summary, printed as plain text under a header line.
print("the description of the dataset:")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Count of NaN entries in every column of the frame.
print("Missing values:")
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows.
total_rows = len(df)
missing_percentage = df.isna().sum() * 100 / total_rows

# Keep only the columns where more than 5 % of the values are missing.
high_null_columns = missing_percentage.loc[missing_percentage > 5]
print(high_null_columns)

In [None]:
# Missing-value counts restricted to the key measurement columns.
columns_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
key_measurements = df.loc[:, columns_to_check]
missing_report = key_measurements.isna().sum()
print("Missing values:\n", missing_report)

In [None]:
# Report, per key column, how many rows hold a value below zero.
for col in ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']:
    negative_rows = df.loc[df[col] < 0]
    n_negative = len(negative_rows)
    print(f"{col}: {n_negative} negative values")


In [None]:
# Flag rows where any key column lies more than 3 standard deviations from
# its column mean.
columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# This cell runs before the missing values are imputed. With the default
# nan_policy='propagate', one NaN in a column makes every z-score of that
# column NaN, so no outlier could be flagged there. 'omit' computes the mean
# and standard deviation from the non-missing values instead; the missing
# entries themselves still yield NaN z-scores, which compare False below.
z = np.abs(stats.zscore(df[columns], nan_policy='omit'))

# A row counts as an outlier as soon as one of its columns exceeds |z| > 3.
z_outliers_mask = (z > 3).any(axis=1)
print("Z-score outlier rows:", int(z_outliers_mask.sum()))

In [None]:
# Impute missing values in the key columns with each column's median.
for col in ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']:
    median_value = df[col].median()
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that form operates on an intermediate
    # object, triggers a chained-assignment FutureWarning on pandas 2.x and
    # leaves df unchanged once Copy-on-Write is enabled.
    df[col] = df[col].fillna(median_value)
    print(f"Filled missing values in {col} with median: {median_value}")

In [None]:
# Persist the current state of df next to the raw file, without the
# positional index column.
clean_csv_path = "../data/benin_clean.csv"
df.to_csv(clean_csv_path, index=False)

In [None]:
# Parse the Timestamp column into datetime values so it can serve as a time
# axis in the plots below.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps

In [None]:
import matplotlib.pyplot as plt

# One shared axis with a line per series, all plotted against Timestamp.
fig, ax = plt.subplots(figsize=(14, 6))
for col in ['GHI', 'DNI', 'DHI', 'Tamb']:
    ax.plot(df['Timestamp'], df[col], label=col)

ax.set_xlabel('Timestamp')
ax.set_ylabel('Value')
ax.set_title('Line Chart of GHI, DNI, DHI, Tamb vs. Timestamp')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Monthly trends:
# Solar irradiance (GHI, DNI, DHI) peaks in spring and summer and drops in
# winter because of reduced daylight and increased cloudiness.
# Daily trends:
# Irradiance rises during the day and falls at night. DNI shows sharper
# daytime peaks than GHI or DHI, reflecting periods of clear skies. Ambient
# temperature (Tamb) follows the same pattern: it rises during the day and
# cools at night.
# Anomalies:
# Invalid irradiance values are caused by sensor or logging errors. Sudden
# spikes or dips in irradiance or temperature can result from sensor faults,
# changes in cloud cover, or data-recording problems.

In [None]:
import matplotlib.pyplot as plt

# Compare the average module readings before and after cleaning by reloading
# the raw file and the cleaned file written earlier in this notebook.
df_original = pd.read_csv("../data/benin-malanville.csv")
df_cleaned = pd.read_csv("../data/benin_clean.csv")

# Tag every row with the dataset it came from.
# NOTE(review): if the raw file already carries a 'Cleaning' column, it is
# overwritten here (only in these two freshly loaded copies, not in df).
df_original['Cleaning'] = 'pre-clean'
df_cleaned['Cleaning'] = 'post-clean'

# Stack both tagged datasets so they can be grouped by the flag.
df_combined = pd.concat([df_original, df_cleaned], ignore_index=True)

# Group the combined frame. The previous code grouped the working frame `df`,
# which never received the pre/post flag, so df_combined went unused and the
# chart could not show the pre/post comparison its title announces.
# reindex() puts the bars in chronological order (pre, then post) instead of
# the alphabetical order groupby produces.
grouped = (
    df_combined.groupby('Cleaning')[['ModA', 'ModB']]
    .mean()
    .reindex(['pre-clean', 'post-clean'])
)

grouped.plot(kind='bar', figsize=(8, 5), color=['skyblue', 'salmon'])
plt.title('Average ModA & ModB Pre/Post-Clean')
plt.ylabel('Average Sensor Reading')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the irradiance and TModA/TModB columns,
# drawn as an annotated heatmap.
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df.loc[:, corr_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap: GHI, DNI, DHI, TModA, TModB')
fig.tight_layout()
plt.show()


In [None]:
# Each entry describes one scatter plot:
# (printed label, x column, y column, figure title, x-axis label, y-axis label)
ghi_axis_label = 'Global Horizontal Irradiance (GHI)'
scatter_specs = [
    ("WS vs. GHI", 'WS', 'GHI',
     'WS vs. GHI', 'Wind Speed (WS)', ghi_axis_label),
    ("WSgust vs. GHI", 'WSgust', 'GHI',
     'WSgust vs. GHI', 'Wind Gust (WSgust)', ghi_axis_label),
    ("WD vs. GHI", 'WD', 'GHI',
     'Wind Direction (WD) vs. GHI', 'Wind Direction (WD)', ghi_axis_label),
    ("RH vs. Tamb", 'RH', 'Tamb',
     'Relative Humidity (RH) vs. Ambient Temperature (Tamb)',
     'Relative Humidity (%)', 'Ambient Temperature (°C)'),
    ("RH vs. GHI", 'RH', 'GHI',
     'Relative Humidity (RH) vs. GHI', 'Relative Humidity (%)', ghi_axis_label),
]

# Draw the plots one after another, each in its own 6x4 figure.
for label, x_col, y_col, title, x_label, y_label in scatter_specs:
    print(label)
    plt.figure(figsize=(6, 4))
    sns.scatterplot(data=df, x=x_col, y=y_col, alpha=0.3)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()


In [None]:
# Wind rose: mean wind speed inside twelve 30-degree direction sectors.
bins = np.arange(0, 361, 30)
labels = [f"{b}°-{b+30}°" for b in bins[:-1]]

# The sectors are half-open [left, right), so a reading of exactly 360° would
# fall outside the last bin and be dropped. 360° is the same heading as 0°,
# so directions are folded into [0, 360) before binning.
df['WD_bin'] = pd.cut(df['WD'] % 360, bins=bins, labels=labels, right=False)

# observed=False keeps all twelve sectors in the result even when a sector has
# no readings; the positional 'angle' assignment below depends on that length.
ws_by_dir = df.groupby('WD_bin', observed=False)['WS'].mean().reset_index()

# ax.bar centres each bar on the given angle, so use the sector midpoints.
# Using the left edges would rotate the whole rose by half a sector (15°).
sector_centres_deg = (bins[:-1] + bins[1:]) / 2
ws_by_dir['angle'] = np.deg2rad(sector_centres_deg)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, polar=True)
ax.bar(ws_by_dir['angle'], ws_by_dir['WS'], width=np.pi/6, bottom=0, color='skyblue', edgecolor='black')

# Compass convention: 0° at the top (north), angles increasing clockwise.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
ax.set_title('Wind Rose: Average Wind Speed by Direction', va='bottom')
plt.tight_layout()
plt.show()


In [None]:
# Each entry describes one histogram:
# (printed label, column, bar colour, figure title, x-axis label)
histogram_specs = [
    ("Histogram for GHI", 'GHI', 'orange',
     'Histogram of Global Horizontal Irradiance (GHI)', 'GHI'),
    ("Histogram for WS", 'WS', 'skyblue',
     'Histogram of Wind Speed (WS)', 'Wind Speed (m/s)'),
]

# 50-bin histogram with a KDE overlay for each column, one figure per column.
for label, column, colour, title, x_label in histogram_specs:
    print(label)
    plt.figure(figsize=(6, 4))
    sns.histplot(df[column], bins=50, kde=True, color=colour)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()


In [None]:
# --- Correlation of relative humidity with temperature and irradiance ---
cols = ['RH', 'Tamb', 'GHI', 'DNI', 'DHI']
correlation_matrix = df[cols].corr()
print("Correlation with RH:\n", correlation_matrix['RH'])
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix: RH, Tamb, GHI, DNI, DHI')
plt.tight_layout()
plt.show()

# --- Side-by-side scatter plots of RH against Tamb and GHI ---
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
sns.scatterplot(data=df, x='RH', y='Tamb', alpha=0.3, ax=axes[0])
axes[0].set_title('RH vs. Tamb')
sns.scatterplot(data=df, x='RH', y='GHI', alpha=0.3, ax=axes[1])
axes[1].set_title('RH vs. GHI')
plt.tight_layout()
plt.show()

# --- Average Tamb and GHI per humidity band ---
# include_lowest=True closes the first interval on the left, so a reading of
# exactly 0 lands in 'Low' instead of being dropped as NaN.
df['RH_bin'] = pd.cut(
    df['RH'],
    bins=[0, 30, 60, 90, 100],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)
# observed=False states explicitly that empty bands stay in the result, rather
# than relying on the implicit default for categorical groupers.
grouped = df.groupby('RH_bin', observed=False)[['Tamb', 'GHI']].mean()
print("Average Tamb and GHI by RH bin:\n", grouped)
grouped.plot(kind='bar', figsize=(8, 5), color=['salmon', 'orange'])
plt.title('Average Tamb and GHI by RH Level')
plt.ylabel('Average Value')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- RH, Tamb and GHI as time series on a shared axis ---
df.set_index('Timestamp')[['RH', 'Tamb', 'GHI']].plot(figsize=(14, 6), alpha=0.7)
plt.title('RH, Tamb, and GHI Over Time')
plt.ylabel('Value')
plt.tight_layout()
plt.show()


In [None]:
# Bubble chart of GHI against Tamb; marker area encodes relative humidity,
# min-max scaled to the range 0-300.
bubble_size = df['RH']
rh_min = bubble_size.min()
rh_max = bubble_size.max()
size_scaled = (bubble_size - rh_min) / (rh_max - rh_min) * 300

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['Tamb'], df['GHI'], s=size_scaled, alpha=0.4, c='orange', edgecolors='w')
ax.set_xlabel('Ambient Temperature (Tamb)')
ax.set_ylabel('Global Horizontal Irradiance (GHI)')
ax.set_title('GHI vs. Tamb with Bubble Size = RH')
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Summarise the object-dtype (categorical/text) columns.
print("the description of the categorical columns:")

# Timestamp was converted to datetime earlier in the notebook, so the frame
# may have no object-dtype columns left. describe(include=['object']) raises
# ValueError when nothing matches, so that case is reported instead.
object_columns = df.select_dtypes(include=['object']).columns
if len(object_columns) == 0:
    print("No object-dtype columns to describe.")
else:
    print(df.describe(include=['object']))