# Togo – Solar Radiation Measurement Data: Exploratory Data Analysis

In [None]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns

## Data Profiling

In [None]:
# Load the raw measurements into a pandas DataFrame indexed by Timestamp.
#
# - encoding='latin1': the file is not valid UTF-8 (byte 0xb2, i.e. '²' in
#   Latin-1, raised UnicodeDecodeError). Per the original note, a cleaner
#   alternative is to remove the first line after the headers from the file.
# - parse_dates/index_col: the time-series cells below need a DatetimeIndex
#   (e.g. `df_clean.index.month`), so parse Timestamp while reading.
# - NOTE(review): this notebook analyses Togo and exports
#   'togo_dapaong_clean.csv', but it previously read the Benin file
#   ('../data/benin-malanville_qc.csv'). Confirm the Togo file name below.
df = pd.read_csv(
    '../data/togo-dapaong_qc.csv',
    encoding='latin1',
    parse_dates=['Timestamp'],
    index_col='Timestamp',
)

# Quick look at the first and last rows
print(df.head())
print("*"*100)
print(df.tail())
print("*"*100)

In [None]:
# Column dtypes and non-null counts, followed by the (rows, columns) shape
_rule = "*" * 100
df.info()
print(_rule)
print("The shape of the dataset is", df.shape)
print(_rule)

## Summary Statistics & Missing-Value Report

In [None]:
# Descriptive statistics for the numeric columns
print("*" * 100)
summary = df.describe()
print(summary)

In [None]:
# List any column with more than 5% null values.
# Count nulls once, then keep only the columns above the 5% threshold.
null_counts = df.isna().sum()
columns_with_nulls = null_counts[null_counts > 0.05 * len(df)]
print(columns_with_nulls)

## Outlier Detection & Basic Cleaning

In [None]:
numeric_columns = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]

# Compute the absolute Z score of every value, column by column.
# nan_policy='omit' ignores missing values when computing each column's mean
# and standard deviation; with the default ('propagate') a single NaN would
# turn the whole column's z-scores into NaN and no outlier would be flagged.
z_scores = np.abs(
    stats.zscore(
        df[numeric_columns].to_numpy(dtype=float),
        axis=0,
        nan_policy='omit',
    )
)
# A row is flagged when any of its columns has |z| > 3
# (NaN z-scores compare as False, so missing values are never flagged).
outlier_flags = (z_scores > 3).any(axis=1)
# Make sure we got one flag per row
print(outlier_flags.shape)
print("Number of flagged outliers", outlier_flags.sum())


In [None]:
# Replace flagged outlier rows and impute missing values with the column median.

for col in numeric_columns:
    # Median of the column as it stands before any replacement
    col_median = df[col].median()
    # Overwrite the values of rows flagged as outliers
    df.loc[outlier_flags, col] = col_median
    # Fill the remaining missing values, which the original cell never did
    df[col] = df[col].fillna(col_median)

# Drop the 'Comments' column; df_clean is a new DataFrame, df keeps the column
df_clean = df.drop(columns=['Comments'])
df_clean

In [None]:
# Write the cleaned DataFrame (index included) to CSV and report the location
path = "../data/togo_dapaong_clean.csv"
df_clean.to_csv(path_or_buf=path)
print("Clean DataFrame save to ", path)

## Time Series Analysis

In [None]:
# Line charts of GHI, DNI, DHI and Tamb against Timestamp.
chart_cols = ["GHI", "DNI", "DHI", "Tamb"]

# Convert the index to datetimes so the x-axis is a real time axis even when
# Timestamp was read as plain strings (no-op if it is already datetime).
timestamps = pd.to_datetime(df_clean.index)

plt.figure(figsize=(12, 6))

for i, col in enumerate(chart_cols):
    # Random RGB colour for each series
    color = np.random.rand(3,)
    # 2x2 grid: exactly one cell per chart (a 4x2 grid left half the figure empty)
    plt.subplot(2, 2, i + 1)
    plt.plot(timestamps, df_clean[col], label=col, color=color)
    plt.title("Time Series of " + col)
    plt.xlabel('Timestamp')
    plt.ylabel(col)
plt.tight_layout()
plt.show()

In [None]:
# Columns averaged per month (defined here so the cell does not depend on
# a `columns` variable that is only created in a later cell).
monthly_cols = ["GHI", "DNI", "DHI", "Tamb"]

# Extract month from the index; pd.to_datetime makes this work even if the
# Timestamp index was loaded as strings.
df_clean['month'] = pd.to_datetime(df_clean.index).month

# Monthly averages
monthly_data = df_clean.groupby('month')[monthly_cols].mean()

# Plot: DataFrame.plot creates its own figure, so size it here instead of
# calling plt.figure first (which left an empty figure behind).
# marker='o' replaces 'M', which is not a valid matplotlib marker.
ax = monthly_data.plot(kind='line', marker='o', figsize=(12, 6))
ax.set_title("Monthly Trends in Solar Radiation and Temperature")
ax.set_xlabel("Month")
ax.set_ylabel("Average Value")
ax.grid(True)
plt.show()

## Cleaning Impact

In [None]:
# Mean ModA / ModB reading for each value of the Cleaning flag
module_cols = ['ModA', 'ModB']
grouped_means = df_clean.groupby('Cleaning')[module_cols].mean()
cleaning_flag = grouped_means.reset_index()

# Long format (one row per Cleaning/Module pair) so seaborn can group bars by hue
cleaning_melted = cleaning_flag.melt(
    id_vars='Cleaning',
    value_vars=module_cols,
    var_name='Module',
    value_name='Average Irradiance',
)

# Grouped bar chart: one bar per Cleaning value for each module
plt.figure(figsize=(8, 6))
sns.barplot(
    data=cleaning_melted,
    x='Module',
    y='Average Irradiance',
    hue='Cleaning',
    palette='Set2',
)
plt.title("Impact of Cleaning on Module Irradiance")
plt.ylabel("Average Irradiance (W/m²)")
plt.xlabel("Sensor Module")
plt.legend(title='Cleaning (0=No, 1=Yes)')
plt.show()

## Correlation & Relationship Analysis

In [None]:
# Pairwise correlation of the irradiance and module-temperature columns.
# Uses df_clean, like the rest of the analysis cells, instead of df.
heatmap_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
correlation_matrix = df_clean[heatmap_cols].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap of Solar Radiation and Temperature")
plt.show()

In [None]:
# Pairwise scatter plots of GHI against wind, humidity and temperature columns,
# with a kernel density estimate on the diagonal.
scatter_data = df_clean[['GHI', 'WS', 'WD', 'WSgust', 'RH', 'Tamb']]
# pandas is imported as `pd` in this notebook; the bare name `pandas` is undefined.
pd.plotting.scatter_matrix(scatter_data, figsize=(12, 12), diagonal='kde')
plt.suptitle('Scatter plots: Wind Conditions and Solar Irradiance', y=1.02)
plt.show()

## Wind & Distribution Analysis

In [None]:

# Polar axes take angles in radians, so convert WD from degrees first
df_clean['WD_rad'] = np.deg2rad(df_clean['WD'])

wind_direction = df_clean['WD_rad']
wind_speed = df_clean['WS']

plt.figure(figsize=(7, 7))
ax = plt.subplot(111, polar=True)
# Angle = wind direction, radius and colour = wind speed
ax.scatter(wind_direction, wind_speed, c=wind_speed, cmap='viridis', alpha=0.75)
# Put 0 at the top ('N') and make angles increase clockwise
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
ax.set_title('Polar Plot of Wind Speed and Direction')
plt.show()

In [None]:
# Distribution of GHI, DNI, DHI and Tamb: one histogram with KDE per column
columns = ['GHI', 'DNI', 'DHI', 'Tamb']
plt.figure(figsize=(15, 10))
for position, name in enumerate(columns, start=1):
    # Random RGB colour for this histogram
    rgb = np.random.rand(3,)
    plt.subplot(2, 2, position)
    sns.histplot(
        df_clean[name],
        bins=30,
        kde=True,
        edgecolor='black',
        color=rgb,
    )
    plt.title("Histogram of " + name)
    plt.xlabel(name)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()


## Temperature Analysis

In [None]:
# Two scatter plots with relative humidity (RH) on the x-axis:
#   1. RH vs. temperature (Tamb), coloured by solar radiation (GHI)
#   2. RH vs. solar radiation (GHI), coloured by temperature (Tamb)
# Each entry: (y column, hue column, title, y-axis label)
rh_plots = [
    (
        'Tamb',
        'GHI',
        "Relative Humidity vs. Temperature with Solar Radiation (GHI) as Hue",
        "Temperature (°C)",
    ),
    (
        'GHI',
        'Tamb',
        "Relative Humidity vs. Solar Radiation (GHI) with Temperature as Hue",
        "Solar Radiation (GHI) (W/m²)",
    ),
]

for y_col, hue_col, title, y_label in rh_plots:
    plt.figure(figsize=(10, 5))
    sns.scatterplot(data=df_clean, x='RH', y=y_col, hue=hue_col, alpha=0.6)
    plt.title(title)
    plt.xlabel("Relative Humidity (%)")
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()


## Bubble Chart

In [None]:
# GHI vs. Tamb bubble charts: bubble size = RH or BP, colour = wind speed (WS).
# Sizes and colours are read from df_clean, the same frame as x and y, so every
# visual channel comes from one DataFrame.

# GHI vs. Tamb with bubble size = RH (scaled by 10 for marker area).

plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    data=df_clean,
    x='GHI',
    y='Tamb',
    s=df_clean['RH'] * 10,
    c=df_clean['WS'],
    alpha=0.6,
    cmap='viridis',
)
plt.title("GHI vs. Tamb with Bubble Size = RH")
# Pass the scatter explicitly so the colorbar is tied to this plot
plt.colorbar(scatter, label='Wind Speed (WS)')
plt.xlabel("Global Horizontal Irradiance (GHI) (W/m²)")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.show()

# GHI vs. Tamb with bubble size = BP (scaled by 0.1 for marker area).

plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    data=df_clean,
    x='GHI',
    y='Tamb',
    s=df_clean['BP'] * 0.1,
    c=df_clean['WS'],
    alpha=0.6,
    cmap='viridis',
)
plt.title("GHI vs. Tamb with Bubble Size = BP")
plt.colorbar(scatter, label='Wind Speed (WS)')
plt.xlabel("Global Horizontal Irradiance (GHI) (W/m²)")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.show()

## References
- The issue I first encountered - https://stackoverflow.com/questions/55563399/how-to-solve-unicodedecodeerror-utf-8-codec-cant-decode-byte-0xff-in-positio#72642907
- How to export - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html 