In [1]:
import warnings
import random
import pandas as pd # dataframe manipulation
import numpy as np # linear algebra
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 

# Configure ssl for unverified content so we can load a dataset from an unknown source (github).
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
import os

#train_url = 'https://raw.githubusercontent.com/bryonbaker/datasets/main/SIT720/Ass3/ac_train_data.csv'
train_url = 'https://raw.githubusercontent.com/bryonbaker/datasets/main/SIT720/Ass3/debug_ac_train_data.csv'
test_url = 'https://raw.githubusercontent.com/bryonbaker/datasets/main/SIT720/Ass3/ac_test_data.csv'
train_path = '/opt/app-root/src/datasets/SIT720/Ass3/ac_train_data.csv'
test_path = '/opt/app-root/src/datasets/SIT720/Ass3/ac_test_data.csv'

#
# Work out if the datasets are local. If not use a remote url. Preference is local.
#
print("Load datasets from local or remote resource:")
print("="*50)
if os.path.isfile(train_path):
    print("Training data is local")
    training_data = train_path
else:
    print("Training data is remote. Downloading file from: {}".format(train_url))
    training_data = train_url

if os.path.isfile(test_path):
    print("Test data is local")
    test_data = test_path
else:
    print("Test data is remote. Downloading file from: {}".format(test_url))
    test_data = test_url
print()

# Load the datasets from either local or remote.
train_df = pd.read_csv(training_data)
test_df = pd.read_csv(test_data)

Load datasets from local or remote resource:
Training data is local
Test data is local



In [3]:
# Drop the columns not used in the assignment

# Drop the sequence number (column 0) from the test dataset. Column 0 is unlabeled so use the index number.
train_df = train_df.drop(train_df.columns[[0]],axis=1)
test_df = test_df.drop(test_df.columns[[0]],axis=1)

In [4]:
# Ordinal encode the days of week.

oldCol = "dayofweek"
newCol = "dayofweeknum"
dayMap = {'Sun' : 0, 'Mon': 1, "Tue" : 2, "Wed" : 3, "Thu" : 4, "Fri" : 5, "Sat" : 6 }    # How to map the values

train_df[newCol] = train_df[oldCol].map(dayMap)    # Adds a new column with proper boolean values
train_df = train_df.drop([oldCol], axis=1)    # Drop the old column before renaming the new column to the name just dropped
train_df = train_df.rename(columns={newCol : oldCol})
      
# Ordinal encode the dayof week in the test dataset.
test_df[newCol] = test_df[oldCol].map(dayMap)    # Adds a new column with proper boolean values
test_df = test_df.drop([oldCol], axis=1)    # Drop the old column before renaming the new column to the name just dropped
test_df = test_df.rename(columns={newCol : oldCol})

In [5]:
# Split out the X and y from the datasets
train_X = train_df.drop(['ac'], axis=1)
train_y = train_df["ac"]
test_X = test_df.drop(['ac'], axis=1)
test_y = test_df['ac']

print("Training Dataset")
print(f"{train_X.head()}\n")
print(f"{train_y.head()}\n")

print("Testing Dataset")
print(f"{test_X.head()}\n")
print(f"{test_y.head()}\n")

Training Dataset
    load  hourofday    dif  absdif    max       var   entropy  nonlinear  \
0  2.245          0  0.987   0.987  6.215  3.074549  0.678886   0.052903   
1  2.259          0  0.014   0.014  6.215  3.172867  0.667450   0.054829   
2  2.269          0  0.010   0.010  6.215  3.270112  0.647777   0.056991   
3  2.268          0 -0.001   0.001  6.215  3.303763  0.629227   0.057606   
4  2.270          0  0.002   0.002  6.215  3.302744  0.621295   0.082640   

      hurst  dayofweek  
0  0.994071          0  
1  0.994154          0  
2  0.994220          0  
3  0.994150          0  
4  0.994041          0  

0    0
1    0
2    0
3    0
4    0
Name: ac, dtype: int64

Testing Dataset
    load  hourofday    dif  absdif  max  var  entropy  nonlinear  hurst  \
0  1.869          0  0.000   0.000  0.0  0.0      0.0        0.0    0.0   
1  1.673          0 -0.196   0.196  0.0  0.0      0.0        0.0    0.0   
2  1.660          0 -0.013   0.013  0.0  0.0      0.0        0.0    0.0   


In [6]:
# Decision Tree

# Use TimeSeriesSplit. We will make the time series the equivalent of 4 hour periods (even though the back of the data is not continuous)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix


# Define the split details.\
n_splits = 10
test_size = int(train_X.shape[0]/11)
print("Number of splits: {}".format(n_splits))

tscv = TimeSeriesSplit(n_splits, test_size=test_size)

model = DecisionTreeClassifier()
average_acc = []

i = 1
for train_index, test_index in tscv.split(train_X):
    X_tr, X_val = train_X.iloc[train_index], train_X.iloc[test_index]
    y_tr, y_val = train_y.iloc[train_index], train_y.iloc[test_index]
    
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)

    # Find the performance info
    accuracy = metrics.accuracy_score(y_val, y_pred)
    precision = metrics.precision_score(y_val, y_pred, zero_division=0)
    recall = metrics.recall_score(y_val, y_pred, zero_division=1)
    f1 = metrics.f1_score(y_val, y_pred, zero_division=1)
    
    # Get the performance details
    average_acc.append(accuracy)
    print(f"Split: {i}: Accuracy: {accuracy} {precision} {recall} {f1}")
    cm = confusion_matrix(y_val,y_pred)
    cr = classification_report(y_val,y_pred)
    print(cm)
    print(cr)
    i+=1
    
print("Average Accuracy: {}".format(np.mean(average_acc)))

Number of splits: 10
Split: 1: Accuracy: 0.9781956075209354 0.97556142668428 0.9787939032471835 0.9771749917300694
[[19422   444]
 [  384 17724]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     19866
           1       0.98      0.98      0.98     18108

    accuracy                           0.98     37974
   macro avg       0.98      0.98      0.98     37974
weighted avg       0.98      0.98      0.98     37974

Split: 2: Accuracy: 0.9837257070627271 0.9847786420820128 0.9855358000961076 0.9851570756076472
[[16847   317]
 [  301 20509]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     17164
           1       0.98      0.99      0.99     20810

    accuracy                           0.98     37974
   macro avg       0.98      0.98      0.98     37974
weighted avg       0.98      0.98      0.98     37974

Split: 3: Accuracy: 0.9905461631642702 0.9911807428927163 0.9902047162477

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Split: 8: Accuracy: 0.9953389161004899 0.0 1.0 0.0
[[37797   177]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37974
           1       0.00      0.00      0.00         0

    accuracy                           1.00     37974
   macro avg       0.50      0.50      0.50     37974
weighted avg       1.00      1.00      1.00     37974



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Split: 9: Accuracy: 0.9888081318797072 0.0 1.0 0.0
[[37549   425]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     37974
           1       0.00      0.00      0.00         0

    accuracy                           0.99     37974
   macro avg       0.50      0.49      0.50     37974
weighted avg       1.00      0.99      0.99     37974



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Split: 10: Accuracy: 0.995496918944541 0.0 1.0 0.0
[[37803   171]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37974
           1       0.00      0.00      0.00         0

    accuracy                           1.00     37974
   macro avg       0.50      0.50      0.50     37974
weighted avg       1.00      1.00      1.00     37974

Average Accuracy: 0.9872149365355243


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
