In [None]:
# Conducting EDA on Benin-Malanville data

In [None]:
# Import the pandas library for data analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import os,sys
from pandas.plotting import scatter_matrix 

# Put the parent of the current working directory on the import path (only
# once) so that the `scripts` package can be imported from this notebook.
rpath = os.path.abspath(os.pardir)
if sys.path.count(rpath) == 0:
    sys.path.insert(0, rpath)

from scripts.utils import (
    detect_outliers_iqr,
    plot_time_series, 
    plot_correlation_analysis, 
    plot_wind_analysis, 
    plot_temperature_analysis, 
    plot_histograms, 
    calculate_z_scores, 
    plot_bubble_chart, 
    clean_dataset
)


In [None]:
# Load the Benin (Malanville) dataset from CSV into a DataFrame.
# The path is relative to the notebook's working directory.
benin_csv_path = '../data/benin-malanville.csv'
df_benin = pd.read_csv(benin_csv_path)

# Understanding the DataFrame / Data

In [None]:
# Preview the top of the dataset (head() returns the first 5 rows by default)

df_benin.head()

In [None]:
# Preview the bottom of the dataset (tail() returns the last 5 rows by default)

df_benin.tail()

In [None]:
# Print a structural summary of the DataFrame:
#   - every column with its non-null count and dtype
#   - the index range (number of rows)
#   - the number of columns
#   - approximate memory usage

df_benin.info()

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple

df_benin.shape

In [None]:
# Count the missing (NaN / None) entries in every column

null_values = df_benin.isna().sum()
null_values

In [None]:
# Data summary: descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments describe() reports only the numeric columns when the
# frame contains any.

df_benin.describe()

# Summary statistics

In [None]:
# Calculate the mean, median, standard deviation, and
# other statistical measures for each numeric column to understand the data distribution.

In [None]:
# Mean of every numeric column

numeric_df = df_benin.select_dtypes(include='number')
numeric_df.mean()

In [None]:
# Median of every numeric column

numeric_df = df_benin.select_dtypes(include='number')
numeric_df.median()

In [None]:
# Standard deviation of every numeric column

numeric_df = df_benin.select_dtypes(include='number')
numeric_df.std()

In [None]:
# Range (maximum minus minimum) of every numeric column

numeric_df = df_benin.select_dtypes(include='number')
column_max = numeric_df.max()
column_min = numeric_df.min()
range_value = column_max - column_min
range_value

In [None]:
# Variance of every numeric column

numeric_df = df_benin.select_dtypes(include='number')
variance_values = numeric_df.var()
variance_values

In [None]:
# Skewness of every numeric column

numeric_df = df_benin.select_dtypes(include='number')
numeric_df.skew()

In [None]:
# Quartiles (25th, 50th and 75th percentiles) of every numeric column

numeric_df = df_benin.select_dtypes(include='number')
quartile_levels = [0.25, 0.50, 0.75]
quantiles = numeric_df.quantile(quartile_levels)
quantiles

# Data Quality Test/Check

In [None]:
# Data-quality check: number of missing values per column

df_benin.isna().sum()

In [None]:
# Incorrect entries: negative values in the GHI, DNI and DHI columns are
# counted as incorrect.

# Count the negatives of all three columns in a single vectorised pass
negative_counts = df_benin[['GHI', 'DNI', 'DHI']].lt(0).sum()
incorrect_entries_GHI = negative_counts['GHI']
incorrect_entries_DNI = negative_counts['DNI']
incorrect_entries_DHI = negative_counts['DHI']

# Report the counts
print("Incorrect entries in GHI:", incorrect_entries_GHI)
print("Incorrect entries in DNI:", incorrect_entries_DNI)
print("Incorrect entries in DHI:", incorrect_entries_DHI)

In [None]:
# Outliers

# Apply the detect_outliers_iqr helper to each column, in the same order as
# the report below. What the helper returns (a count or the outlying values)
# is defined in scripts.utils, not in this notebook.
outliers_ModA, outliers_ModB, outliers_WS, outliers_WSgust = (
    detect_outliers_iqr(df_benin[column])
    for column in ('ModA', 'ModB', 'WS', 'WSgust')
)

# Report the results
print("Outliers in ModA:", outliers_ModA)
print("Outliers in ModB:", outliers_ModB)
print("Outliers in WS:", outliers_WS)
print("Outliers in WSgust:", outliers_WSgust)

# Time Series Analysis

In [None]:
# Calls the plot_time_series helper imported from scripts.utils on the Benin frame.
# Which columns it plots is defined in that module, not in this notebook.
plot_time_series(df_benin)

# Correlation Analysis

In [None]:
# Calls the plot_correlation_analysis helper imported from scripts.utils on the Benin frame.
# The columns it correlates are chosen inside the helper, not in this notebook.
plot_correlation_analysis(df_benin)

# Wind Analysis

In [None]:
# Calls the plot_wind_analysis helper imported from scripts.utils on the Benin frame.
# NOTE(review): presumably uses the wind columns (e.g. WS, WSgust) — confirm in the helper.
plot_wind_analysis(df_benin)

# Temperature Analysis 

In [None]:
# Calls the plot_temperature_analysis helper imported from scripts.utils on the Benin frame.
# The temperature columns it uses are chosen inside the helper, not in this notebook.
plot_temperature_analysis(df_benin)

#### Histograms

In [None]:
# Calls the plot_histograms helper imported from scripts.utils on the Benin frame.
# The set of columns and the binning are chosen inside the helper, not in this notebook.
plot_histograms(df_benin)

#### Z-Scores Analysis

In [None]:
# The calculate_z_scores helper returns two values, unpacked here.
# NOTE(review): presumably a frame of z-scores and the entries flagged as
# outliers — confirm against scripts.utils.
z_scores_df, outliers = calculate_z_scores(df_benin)

#### Bubble Charts

In [None]:
# Plotting GHI vs Tamb with bubble size representing RH (per the chart title)
plot_bubble_chart(df_benin, 'GHI', 'Tamb', 'RH', 'GHI vs Tamb with Bubble Size Representing RH')

# Plotting GHI vs Tamb with bubble size representing BP
# NOTE(review): this label is printed after the first plot_bubble_chart call,
# so it only precedes the second chart.
print('Bubble Chart for Benin:')
plot_bubble_chart(df_benin, 'GHI', 'Tamb', 'BP', 'GHI vs Tamb with Bubble Size Representing BP')

#### Data Cleaning

In [None]:
# Run the clean_dataset helper from scripts.utils on the Benin frame and keep
# the result under a separate name.
# NOTE(review): whether clean_dataset also modifies df_benin in place is not
# visible here — confirm in scripts.utils before reusing df_benin afterwards.
cleaned_benin_data = clean_dataset(df_benin)

# Show the cleaned result as the cell output
cleaned_benin_data