## EDA of totaldemand dataset
#### Modified: 2024-04-01

#### Import libraries

In [1]:
import pandas as pd
from ydata_profiling import ProfileReport  # One-stop-shop for initial profile of a dataset

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#### Set variables (*filepath, input file name, etc.*)

In [8]:
# Directory holding the raw CSV extracts (Windows-style relative path, kept verbatim)
fpath = r"..\..\data\all_raw_files"

# One total-demand CSV per region
fname_nsw_tdemand = "totaldemand_nsw.csv"  # New South Wales
fname_qld_tdemand = "totaldemand_qld.csv"  # Queensland
fname_vic_tdemand = "totaldemand_vic.csv"  # Victoria
fname_sa_tdemand = "totaldemand_sa.csv"    # South Australia

#### Function for reading data

In [4]:
def read_data(datafile):
    """Load a comma-separated file into a DataFrame.

    Parameters
    ----------
    datafile : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, with column names taken from the file's header row.
    """
    return pd.read_csv(datafile, sep=',')

#### Function for initial data analysis and exploration

In [None]:
def _save_and_close(fig, figure_dir, figure_file_name):
    """Write ``fig`` to ``figure_file_name`` inside ``figure_dir``, then release the figure."""
    import os

    fig.savefig(os.path.join(figure_dir, figure_file_name))
    # Close explicitly: pyplot keeps every figure alive until it is closed.
    plt.close(fig)


def _plot_feature_grid(data_df, features, figure_dir, figure_file_name):
    """Save a two-row grid: a histogram per feature on top, a box plot split by 'sex' below."""
    cols = len(features)
    fig, axes = plt.subplots(2, cols, figsize=(cols * 5, 2 * 5))
    colours = sns.color_palette()
    for col_idx, feature in enumerate(features):
        # One palette colour per column, shared by the histogram and its box plot.
        colour = colours[(col_idx + 1) % cols]
        sns.histplot(data=data_df, x=feature, kde=True, color=colour, ax=axes[0, col_idx])
        sns.boxplot(data=data_df, x=feature, y="sex", color=colour, ax=axes[1, col_idx])
    _save_and_close(fig, figure_dir, figure_file_name)


def initial_analysis(data_df, figure_dir=None):
    """Print a text overview of ``data_df`` and save distribution/correlation figures.

    The text section (``info``, ``describe`` and a missing-value table) only uses
    generic DataFrame methods.  The plotting section reads the columns 'sex',
    'rings', 'length', 'diameter', 'height', 'whole_w', 'shucked_w', 'viscera_w'
    and 'shell_w'.

    NOTE(review): those columns are not among the DATETIME / TOTALDEMAND /
    REGIONID columns of the total-demand frames loaded in this notebook, so the
    plotting section looks carried over from a different analysis -- confirm
    before calling this on the total-demand data.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame to summarise and plot.
    figure_dir : str or path-like, optional
        Directory the PNG files are written to.  When omitted, a notebook-level
        ``figure_dir`` variable is used if one exists, otherwise the current
        working directory.

    Returns
    -------
    None
        Output is printed, and five PNG files are written to ``figure_dir``.
    """
    if figure_dir is None:
        # The parameter shadows any notebook-level variable of the same name,
        # so look that one up explicitly; '' resolves to the working directory.
        figure_dir = globals().get('figure_dir', '')

    # Get a quick overview of the dataset to compare some attributes to the supplied metadata
    print('Dataset overview:\n-----------------')
    data_df.info()

    # Get basic statistical details about the dataset
    print('\n')
    print('Dataset summary statistics:\n---------------------------')
    print(data_df.describe())

    # Ascertain missing values
    print('\n')
    print('Missing values table:\n---------------------')
    null_values = data_df.isnull().sum()
    percentage_null_values = (null_values / len(data_df)) * 100
    print(pd.concat([null_values, percentage_null_values], axis=1, keys=['Missing Count', 'Percent Missing']))

    ### Determine distribution of each feature
    sns.set()

    # 1. distribution of 'sex'
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.countplot(x=data_df['sex'], ax=ax)
    _save_and_close(fig, figure_dir, '1_distribution_sex.png')

    # 2. distribution of target attribute 'rings'
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(2 * 5, 1 * 5))
    for ax in (hist_ax, box_ax):
        # Fix the x-range before plotting so both panels share the same 0-30 scale.
        ax.set_xticks(range(0, 31, 4))
        ax.set_xlim(0, 30)
    sns.histplot(data=data_df, x="rings", kde=True, ax=hist_ax)
    sns.boxplot(data=data_df, x="rings", y="sex", ax=box_ax)
    _save_and_close(fig, figure_dir, '2_distribution_rings.png')

    # 3. distribution of 'size' attributes: 'length', 'diameter', 'height'
    _plot_feature_grid(
        data_df,
        ["length", "diameter", "height"],
        figure_dir,
        '3_distribution_size.png',
    )

    # 4. distribution of 'weight' attributes: 'whole_w', 'shucked_w', 'viscera_w', 'shell_w'
    _plot_feature_grid(
        data_df,
        ["whole_w", "shucked_w", "viscera_w", "shell_w"],
        figure_dir,
        '4_distribution_weight.png',
    )

    ## Visualise the correlation matrix for numeric attributes
    # Remove categorical feature before creating correlation matrix
    data_numeric_df = data_df.drop(['sex'], axis=1)
    # Explicit figure so the heatmap's size does not depend on whichever figure was drawn last.
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.heatmap(data_numeric_df.corr(), cmap="GnBu", annot=True, ax=ax)
    _save_and_close(fig, figure_dir, '5_corr_map_heatmap.png')

### Analysis

In [10]:
# Read the raw total-demand CSV for each region.
# The path separator is written as an escaped backslash ("\\"): a bare "\{" is not a
# recognised escape sequence and triggers an invalid-escape warning on recent Python versions.
df_nsw_tdemand = read_data(f'{fpath}\\{fname_nsw_tdemand}')
df_qld_tdemand = read_data(f'{fpath}\\{fname_qld_tdemand}')
df_vic_tdemand = read_data(f'{fpath}\\{fname_vic_tdemand}')
df_sa_tdemand = read_data(f'{fpath}\\{fname_sa_tdemand}')

#### Generate Data Profile Report (*using ydata_profiling*)

In [11]:
# Build one profiling report object per regional total-demand frame
profile_nsw_tdemand = ProfileReport(
    df_nsw_tdemand,
    title="Total Demand NSW Data Profile",
)
profile_qld_tdemand = ProfileReport(
    df_qld_tdemand,
    title="Total Demand QLD Data Profile",
)
profile_vic_tdemand = ProfileReport(
    df_vic_tdemand,
    title="Total Demand VIC Data Profile",
)
profile_sa_tdemand = ProfileReport(
    df_sa_tdemand,
    title="Total Demand SA Data Profile",
)

#### Output Data Profile Report to Widgets

In [7]:
# Render the NSW profile report as notebook widgets
profile_nsw_tdemand.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [12]:
# Render the QLD profile report as notebook widgets
profile_qld_tdemand.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [13]:
# Render the VIC profile report as notebook widgets
profile_vic_tdemand.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [14]:
# Render the SA profile report as notebook widgets
profile_sa_tdemand.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [17]:
# Show the dtype of each column (empty brackets are a syntax error; the
# recorded output of this cell is the per-column dtype listing)
df_nsw_tdemand.dtypes

DATETIME        object
TOTALDEMAND    float64
REGIONID        object
dtype: object

In [21]:
# Convert the DATETIME strings (day/month/year hour:minute) to datetime values
# so they can be compared against timestamps and used as a plot axis
df_nsw_tdemand['DATETIME'] = pd.to_datetime(
    df_nsw_tdemand['DATETIME'],
    format='%d/%m/%Y %H:%M',
)

In [28]:
# Keep only the 2021 NSW observations (start inclusive, end exclusive), ordered by time
start_2021 = pd.to_datetime('2021-01-01 00:00:00')
start_2022 = pd.to_datetime('2022-01-01 00:00:00')
in_2021 = (df_nsw_tdemand['DATETIME'] < start_2022) & (df_nsw_tdemand['DATETIME'] >= start_2021)
df = df_nsw_tdemand.loc[in_2021].sort_values('DATETIME', ascending=True)

In [33]:
# Line plot of the filtered NSW demand series, written to a PNG in the working directory
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['DATETIME'], df['TOTALDEMAND'], marker='o', linestyle='-', color='b')
ax.set_xlabel('DateTime')
ax.set_ylabel('TotalDemand NSW')
ax.set_title('Time Series Plot')
fig.savefig('total_demand_nsw.png')