# In [None]:


#Import Libraries

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

import plotly.express as px

from scipy import stats

import os



# Load the raw CSV; the "Timestamp" column is parsed into datetimes on read.
df = pd.read_csv(
    filepath_or_buffer="benin-malanville.csv",
    parse_dates=["Timestamp"],
)

# Summary statistics: describe() output for the loaded frame.
print("Summary Statistics:")
summary_stats = df.describe()
display(summary_stats)



# Missing-value report: count of NaN entries per column.
missing = df.isna().sum()

# Boolean masks over the per-column counts: any missing at all, and
# missing in more than 5% of the rows.
has_missing = missing > 0
over_five_percent = missing > 0.05 * len(df)

print("\nMissing Values Report:")
print(missing[has_missing])
print("\nColumns with >5% Missing Values:")
print(missing[over_five_percent])





# Outlier detection: flag rows where any key column has |z-score| > 3.
key_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Only rows with every key column present take part in the z-score
# computation; rows with a missing key value are left out of the count.
complete_rows = df[key_cols].dropna()
z_scores = np.abs(stats.zscore(complete_rows))
outliers = (z_scores > 3).any(axis=1)

outlier_count = outliers.sum()
print(f"\nOutlier Rows Detected: {outlier_count}")



# Clean: fill the gaps in every key column with that column's own median.
for col_name in key_cols:
    df[col_name] = df[col_name].fillna(df[col_name].median())

# Save the cleaned data (the output is meant to be excluded from Git via
# .gitignore). The target directory is created first if it does not exist.
clean_dir = "data/data"
clean_path = "data/data/benin_clean.csv"
os.makedirs(clean_dir, exist_ok=True)
df.to_csv(path_or_buf=clean_path, index=False)





# Time series analysis: GHI, DNI and DHI against the timestamp.
#
# DataFrame.plot() opens its own figure when no `ax` is passed, so a
# preceding plt.figure(figsize=...) only leaves a blank figure behind and
# its size never reaches the real plot. The size is therefore passed to
# plot() directly and the label is set on the axes that plot() returns.
irradiance_ax = df.set_index('Timestamp')[['GHI', 'DNI', 'DHI']].plot(
    figsize=(15, 6),
    title="Irradiance Over Time",
    alpha=0.8,
)
irradiance_ax.set_ylabel("W/m²")
plt.show()





# Index the frame by timestamp so each series is drawn against time.
df_ts = df.set_index('Timestamp')

# Three stacked panels that share a single x-axis.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(15, 10), sharex=True)
ax1, ax2, ax3 = axes

# One irradiance component per panel: (axes, column, panel title).
panels = (
    (ax1, 'GHI', "GHI Over Time"),
    (ax2, 'DNI', "DNI Over Time"),
    (ax3, 'DHI', "DHI Over Time"),
)
for panel_ax, column, panel_title in panels:
    df_ts[column].plot(ax=panel_ax, title=panel_title, alpha=0.8)
    panel_ax.set_ylabel("W/m²")
    panel_ax.grid(True)

# Only the bottom panel gets an explicit x-axis label.
ax3.set_xlabel("Timestamp")

# Tighten the spacing between panels before showing the figure.
plt.tight_layout()
plt.show()



# Clean: replace missing values in the key columns with the per-column median.
column_medians = df[key_cols].median()
df[key_cols] = df[key_cols].fillna(column_medians)

# Save the cleaned data (the output is meant to be excluded from Git via
# .gitignore). The "data" directory is created first if it does not exist.
clean_dir = "data"
clean_path = "data/benin_clean.csv"
os.makedirs(clean_dir, exist_ok=True)
df.to_csv(path_or_buf=clean_path, index=False)





# Time series analysis: GHI, DNI and DHI against the timestamp.
#
# DataFrame.plot() opens its own figure when no `ax` is passed, so a
# preceding plt.figure(figsize=...) only leaves a blank figure behind and
# its size never reaches the real plot. The size is therefore passed to
# plot() directly and the label is set on the axes that plot() returns.
irradiance_ax = df.set_index('Timestamp')[['GHI', 'DNI', 'DHI']].plot(
    figsize=(15, 6),
    title="Irradiance Over Time",
    alpha=0.8,
)
irradiance_ax.set_ylabel("W/m²")
plt.show()



# Ambient temperature against the timestamp, drawn on a fresh 12x4 figure.
plt.figure(figsize=(12, 4))
tamb_by_time = df.set_index('Timestamp')['Tamb']
tamb_by_time.plot(color='orange', title="Ambient Temperature Over Time")
plt.ylabel("°C")
plt.show()





# Monthly pattern: average irradiance and temperature per calendar month.
# Adds a 'Month' column (month number taken from the timestamp) to df.
df['Month'] = df['Timestamp'].dt.month

monthly_cols = ['GHI', 'DNI', 'DHI', 'Tamb']
monthly_avg = df.groupby('Month')[monthly_cols].mean()

monthly_avg.plot.bar(
    figsize=(12, 6),
    title='Monthly Average of Solar & Temperature Data',
)
plt.ylabel("Average Value")
plt.show()





# Cleaning impact: mean ModA / ModB reading per value of the 'Cleaning'
# column. The whole step is skipped when the dataset has no such column.
if 'Cleaning' in df:
    # Missing flags become their own 'Unknown' group instead of being dropped
    # by groupby.
    df['Cleaning'] = df['Cleaning'].fillna('Unknown')

    module_cols = ['ModA', 'ModB']
    cleaning_effect = df.groupby('Cleaning')[module_cols].mean()

    cleaning_effect.plot.bar(title='Cleaning Impact on ModA and ModB')
    plt.ylabel("Module Reading")
    plt.show()







# Correlation heatmap for the irradiance and module-temperature columns.
plt.figure(figsize=(10, 6))

corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df[corr_cols].corr()

# Each cell is annotated with its coefficient, printed to two decimals.
sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()









# Relationship plots: three scatter charts, each built and shown in turn.
scatter_specs = (
    {'x': 'WS', 'y': 'GHI', 'color': 'WSgust', 'title': "WS vs GHI"},
    {'x': 'RH', 'y': 'Tamb', 'title': "RH vs Tamb"},
    {'x': 'RH', 'y': 'GHI', 'title': "RH vs GHI"},
)

shown_figures = []
for spec in scatter_specs:
    scatter_fig = px.scatter(df, **spec)
    scatter_fig.show()
    shown_figures.append(scatter_fig)

# Keep the figures reachable under their usual names.
fig1, fig2, fig3 = shown_figures



import plotly.io as pio
import plotly.express as px

# Make 'browser' the default plotly renderer for every following .show()
# call (use 'notebook' instead when running inside a notebook).
pio.renderers.default = 'browser'

# Wind speed vs GHI, coloured by gust speed.
fig1 = px.scatter(
    data_frame=df, x='WS', y='GHI', color='WSgust', title="WS vs GHI"
)
fig1.show()

# Relative humidity vs ambient temperature.
fig2 = px.scatter(data_frame=df, x='RH', y='Tamb', title="RH vs Tamb")
fig2.show()

# Relative humidity vs GHI.
fig3 = px.scatter(data_frame=df, x='RH', y='GHI', title="RH vs GHI")
fig3.show()





# Wind rose approximation (simplified radial bar chart): mean wind speed
# per 30° wind-direction sector. Skipped when either column is absent.
if 'WD' in df.columns and 'WS' in df.columns:
    wind_df = df[['WD', 'WS']].dropna()

    # Wrap directions onto [0, 360) before binning so a reading of exactly
    # 360° lands in the 0° sector instead of creating a separate bar that
    # overlaps it on the polar axis. assign() builds a new frame, which
    # avoids pandas' chained-assignment warning on the derived subset.
    wind_df = wind_df.assign(WD_bin=((wind_df['WD'] % 360) // 30) * 30)

    rose_data = wind_df.groupby('WD_bin')['WS'].mean().reset_index()

    fig = px.bar_polar(
        rose_data,
        r='WS',
        theta='WD_bin',
        title="Wind Rose Approximation",
        color='WS',
        color_continuous_scale='Viridis',
    )
    fig.show()

	

	

import plotly.express as px
import plotly.io as pio

# Set the default renderer to something that works in your environment.
pio.renderers.default = 'browser'  # Try 'notebook' or 'notebook_connected' if in Jupyter

# Wind rose plot (simplified): mean wind speed per 30° wind-direction
# sector. Skipped when either column is absent.
if 'WD' in df.columns and 'WS' in df.columns:
    wind_df = df[['WD', 'WS']].dropna()

    # Wrap directions onto [0, 360) before binning so a reading of exactly
    # 360° lands in the 0° sector instead of creating a separate bar that
    # overlaps it on the polar axis. assign() builds a new frame, which
    # avoids pandas' chained-assignment warning on the derived subset.
    wind_df = wind_df.assign(WD_bin=((wind_df['WD'] % 360) // 30) * 30)

    rose_data = wind_df.groupby('WD_bin')['WS'].mean().reset_index()

    fig = px.bar_polar(
        rose_data,
        r='WS',
        theta='WD_bin',
        color='WS',
        title="Wind Rose Approximation",
        color_continuous_scale='Viridis',
    )
    fig.show()

	

# Histograms: 30-bin distributions of GHI and wind speed in one figure.
hist_cols = ['GHI', 'WS']
df[hist_cols].hist(figsize=(12, 5), bins=30)
plt.suptitle("Histograms of GHI and WS")
plt.show()





# Temperature vs relative humidity, with each point coloured by its GHI.
fig4 = px.scatter(
    data_frame=df,
    x='RH',
    y='Tamb',
    color='GHI',
    title="RH vs Tamb (Color: GHI)",
)
fig4.show()



# Bubble chart: GHI vs Tamb, with bubble size and colour both driven by RH.
#
# RH is not among the columns imputed earlier, and plotly rejects NaN
# marker sizes, so rows without an RH value are left out of this chart.
bubble_df = df.dropna(subset=['RH'])

fig5 = px.scatter(
    bubble_df,
    x='Tamb',
    y='GHI',
    size='RH',
    color='RH',
    title="Bubble Chart: GHI vs Tamb (Size=RH)",
    size_max=20,
)
fig5.show()



# Notebook output: FileNotFoundError: [Errno 2] No such file or directory: 'benin-malanville.csv'