1. Set up environment

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from windrose import WindroseAxes


In [None]:
# Read the Benin (Malanville) CSV from the sibling data directory into `df`
data_path = '../data/benin-malanville.csv'
df = pd.read_csv(data_path)
# Trailing expression: the notebook renders the first rows as the cell output
df.head()


2. Summary Statistics

In [None]:
# Column-wise descriptive statistics from DataFrame.describe(), kept in
# `summary_stats` and printed as plain text.
summary_stats = df.describe()
print(summary_stats)

Data Quality Check

In [None]:
# Build the boolean null mask once; it feeds both the per-column counts and the plot
null_mask = df.isnull()
missing_values = null_mask.sum()
print("Missing Values:\n", missing_values)

# Heatmap of the mask: rows are records, columns are fields, highlighted cells are nulls
heat_ax = sns.heatmap(null_mask, cbar=False, cmap='viridis')
heat_ax.set_title("Missing Values Heatmap")
plt.show()


Check for Outliers

In [None]:
# One boxplot per measurement column; `columns_to_check` is reused by the
# negative-value check in the following cell.
columns_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
for col in columns_to_check:
    # seaborn returns the Axes it drew on, so the title is set on that object directly
    box_ax = sns.boxplot(data=df[col])
    box_ax.set_title(f"Boxplot of {col}")
    plt.show()


In [None]:
# Count, per column in `columns_to_check`, how many readings are below zero
is_negative = df[columns_to_check] < 0
negative_values = is_negative.sum()
print("Negative Values:\n", negative_values)

Time series analysis

In [None]:
# Convert the 'DateTime' column with pd.to_datetime so it can be used as the time axis
df['DateTime'] = pd.to_datetime(df['DateTime'])

# Overlay the three irradiance series and ambient temperature on one 12x6 figure
ts_fig, ts_ax = plt.subplots(figsize=(12, 6))
timestamps = df['DateTime']
for col in ['GHI', 'DNI', 'DHI', 'Tamb']:
    ts_ax.plot(timestamps, df[col], label=col)
ts_ax.legend()
ts_ax.set_title("Time Series Analysis")
ts_ax.set_xlabel("Time")
ts_ax.set_ylabel("Values")
plt.show()


Correlation Analysis


In [None]:
# Pairwise correlations (DataFrame.corr defaults) between the selected measurements
corr_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust', 'Tamb']
correlation_matrix = df[corr_columns].corr()

# Annotated heatmap with a diverging palette
corr_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
corr_ax.set_title("Correlation Matrix")
plt.show()


Pair Plot

In [None]:
# Scatter-matrix of irradiance, ambient temperature and wind speed.
# NOTE(review): pairplot draws every row of df; this may be slow on a large frame.
pair_columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'WS']
sns.pairplot(df[pair_columns])
plt.show()

Wind Analysis

In [None]:
# Wind rose built from 'WD' (wind direction) and 'WS' (wind speed).
# Both columns are read straight from df; no existence check is performed here,
# so a missing column surfaces as a KeyError.
ax = WindroseAxes.from_ax()
wind_direction = df['WD']
wind_speed = df['WS']
ax.bar(wind_direction, wind_speed, normed=True, opening=0.8, edgecolor='white')
ax.set_legend()
ax.set_title("Wind Rose")
plt.show()


Temperature Analysis

In [None]:
# Relative humidity vs ambient temperature, with points coloured by GHI
scatter_ax = sns.scatterplot(data=df, x='RH', y='Tamb', hue='GHI')
scatter_ax.set_title("RH vs Tamb vs GHI")
plt.show()


Histograms

In [None]:
# One 30-bin histogram per variable, each shown as its own figure
for col in ['GHI', 'DNI', 'DHI', 'WS', 'Tamb']:
    df[col].plot.hist(bins=30, alpha=0.7, title=f"Histogram of {col}")
    plt.show()


Z-Score Analysis

In [None]:
# Calculate per-column Z-scores for the irradiance measurements.
# nan_policy='omit' makes scipy ignore missing readings when computing the mean
# and standard deviation; with the default ('propagate') a single NaN turns the
# whole column into NaN and the outlier count below silently becomes 0.
z_scores = df[['GHI', 'DNI', 'DHI']].apply(zscore, nan_policy='omit')

# Flag data points with |Z-score| > 3 (NaN entries compare False, so they are not counted)
outliers = (np.abs(z_scores) > 3).sum()
print("Outliers flagged:\n", outliers)


Bubble Charts

In [None]:
# Bubble chart: GHI on x, Tamb on y, marker area from RH, marker colour from WS
bubbles = plt.scatter(
    df['GHI'],
    df['Tamb'],
    s=df['RH'],
    alpha=0.6,
    c=df['WS'],
    cmap='viridis',
)
# Pass the scatter's mappable explicitly rather than relying on the "current image"
plt.colorbar(bubbles, label="Wind Speed")
plt.title("Bubble Chart: GHI vs Tamb vs RH")
plt.xlabel("GHI")
plt.ylabel("Tamb")
plt.show()


Data cleaning 

In [None]:
# Drop columns that contain no data at all
df = df.dropna(axis=1, how='all')

# Fill remaining gaps with each column's mean.
# numeric_only=True restricts the mean to numeric columns: since pandas 2.0 the
# default is False, and df.mean() raises TypeError if any text column is present.
# Non-numeric columns are therefore left untouched by this fill.
df = df.fillna(df.mean(numeric_only=True))

# Clamp negative readings to zero in the irradiance / module columns
non_negative_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB']
df[non_negative_cols] = df[non_negative_cols].clip(lower=0)
