# Assignment 4
* Unit: SIT720
* Name: Bryon Baker
* Student #: 85031775

In [1]:
#!pip3 install pandas
#!pip3 install numpy

In [2]:
import sys
import warnings
import random
import pandas as pd # dataframe manipulation
import numpy as np # linear algebra

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Seed the random nunmber generator to ensure K-Means randomisation works properly.
random.seed(0)

print(f"Seeding random number generator. Random number is: {random.random()}")

Seeding random number generator. Random number is: 0.8444218515250481


# Helper Functions
This section defines any helper functions used in the code below

In [4]:
# Calculate the inverse of log(x+1) and assign any negative value to zero
def inverse_log_transform( vals ):
    vals_x = np.exp(vals)-1
    vals_x[vals_x < 0] = 0
    
    return vals_x

In [5]:
# A helper function to standardise all the values in the dataset with a mean=0 and SD=1
def standardise_dataset( dataset_name, dataset, column_headings ):
    # Standardise all attributes to zero mean and SD 1
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    scaler.fit(dataset)
    result = scaler.transform(dataset)
    # print(f"Transformed Data Frame:\n{SVM_STFWI_df}")
    m = np.mean(result, axis=0)
    s = np.std(result, axis=0)
    #print("Checking transformation")
    #print(f"Column  means:\n{m}")
    #print(f"Column  SD:\n{s}")

    # The StandardScaler converts to an array. Convert back to a DataFrame
    result_df = pd.DataFrame(result)
    result_df.columns = column_headings     # Reassign the column names.
    # print(result_df.head())

    save_dataset( dataset_name, result_df)      # Save the dataset to a file for exploration later
    
    return result_df

In [6]:
# Helper to calculate the RMSE
def calc_rmse( y, y_hat ):
    # Perform the inverse transform of y and h_hat
    yy = np.exp(y)-1
    yy[yy < 0] = 0
    
    yy_hat = np.exp(y_hat)-1
    yy_hat[yy_hat < 0] = 0
        
    rmse = mean_squared_error(yy, yy_hat, squared=False)
    
    return rmse

In [7]:
from pathlib import Path 
    
def save_dataset( filename, dataset ):
    filename = "~/datasets/SIT720/Ass4/transformed/" + filename + ".csv"
    filepath = Path(filename)  
    filepath.parent.mkdir(parents=True, exist_ok=True)
    dataset.to_csv(filepath)

# Load the source dataset
Dataset is stored in public git repo. COnfigure the ssl context in order to access github raw data files.

In [8]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

url = 'https://raw.githubusercontent.com/bryonbaker/datasets/main/SIT720/Ass4/forestfires.csv'
full_df = pd.read_csv(url)
print(f"{full_df.head()}\n")

   X  Y month  day  FFMC   DMC     DC  ISI  temp  RH  wind  rain  area
0  7  5   mar  fri  86.2  26.2   94.3  5.1   8.2  51   6.7   0.0   0.0
1  7  4   oct  tue  90.6  35.4  669.1  6.7  18.0  33   0.9   0.0   0.0
2  7  4   oct  sat  90.6  43.7  686.9  6.7  14.6  33   1.3   0.0   0.0
3  8  6   mar  fri  91.7  33.3   77.5  9.0   8.3  97   4.0   0.2   0.0
4  8  6   mar  sun  89.3  51.3  102.2  9.6  11.4  99   1.8   0.0   0.0



# Pre Process the Data
1. One hot encode the day and month attributes.
2. Ordinal encode the day/month for DT, MR, RF
3. Transform the area to be y = ln(area+1)

## Preprocess area
Reduce the skewness of area as per procedure on page 5.
This will be used to train the model, but it is important to take the inverse log after prediction so as to find the predicted area. 


In [9]:
# Check the dataset has no NaNs.
if full_df.isnull().sum().sum() == 0:
    print("Data does not have any missing values.")
else:
    print("Data has missing values and needs pre processing.")
    print("Stopping....")
    quit() 

Data does not have any missing values.


In [10]:
area_df = pd.DataFrame(full_df["area"])
area_df["xfrmd_area"] = full_df["area"] +1                 # Create a new area column that will contain the ln(area+1)
print(area_df.head())
area_df["xfrmd_area"] = np.log(area_df['xfrmd_area'])      # Take the ln(area+1)
print(area_df.head()) 

   area  xfrmd_area
0   0.0         1.0
1   0.0         1.0
2   0.0         1.0
3   0.0         1.0
4   0.0         1.0
   area  xfrmd_area
0   0.0         0.0
1   0.0         0.0
2   0.0         0.0
3   0.0         0.0
4   0.0         0.0


### 1-of-C Encoding

In [11]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore', drop=None, dtype='int')

In [12]:
# One-hot encode month
transformed = ohe.fit_transform(full_df[['month']])
ohe_month_df = pd.DataFrame(transformed.toarray())
ohe_month_df.columns = ohe.categories_[0]

In [13]:
# One hot encode day
transformed = ohe.fit_transform(full_df[['day']])
ohe_day_df = pd.DataFrame(transformed.toarray())
ohe_day_df.columns = ohe.categories_[0]

In [14]:
# encoded_df holds all the training.test data that has been 1-of-C encoded.
# unencoded
ohe_encoded_df = full_df.drop(['month','day','area'], axis=1)
ohe_encoded_df = pd.concat([ohe_encoded_df, ohe_month_df], axis=1)
ohe_encoded_df = pd.concat([ohe_encoded_df, ohe_day_df], axis=1)
ohe_encoded_df.head(20)

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,nov,oct,sep,fri,mon,sat,sun,thu,tue,wed
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,1,0,0,0,0,0,0
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,1,0,0,0,0,0,0,1,0
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,1,0,0,0,1,0,0,0,0
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,1,0,0,0,0,0,0
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,0,0,1,0,0,0
5,8,6,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,...,0,0,0,0,0,0,1,0,0,0
6,8,6,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,...,0,0,0,0,1,0,0,0,0,0
7,8,6,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,...,0,0,0,0,1,0,0,0,0,0
8,8,6,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,...,0,0,1,0,0,0,0,0,1,0
9,7,5,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,...,0,0,1,0,0,1,0,0,0,0


### Ordinal Encoding

In [15]:
# Ordinal encode month
from sklearn.preprocessing import OrdinalEncoder

ore = OrdinalEncoder(dtype='int')

transformed = ore.fit_transform(full_df[['month']])
ord_month_df = pd.DataFrame(transformed)
ord_month_df.columns = ['month']

In [16]:
# Ordinal encode month
from sklearn.preprocessing import OrdinalEncoder

ore = OrdinalEncoder(dtype='int')

transformed = ore.fit_transform(full_df[['day']])
ord_day_df = pd.DataFrame(transformed)
ord_day_df.columns = ['day']

In [17]:
ord_encoded_df = full_df     # RF and DT need the unencoded dataframe
ord_encoded_df = full_df.drop(['month','day','area'], axis=1)
ord_encoded_df = pd.concat([ord_encoded_df, ord_month_df], axis=1)
ord_encoded_df = pd.concat([ord_encoded_df, ord_day_df], axis=1)
ord_encoded_df.head(20)

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,month,day
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,7,0
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,10,5
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,10,2
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,7,0
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,7,3
5,8,6,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,1,3
6,8,6,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,1,1
7,8,6,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,1,1
8,8,6,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,11,5
9,7,5,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,11,2


In [18]:
del full_df         # Make sure we don't accidentally use full_df later

## Create Sub-Datasets

Now we have all the raw ingredients for all datasets, create the sub datasets that the different models will need.

In [19]:
# Define the sub datasets from the transformed dataset

# Get the 1-of-C encoded column names for month and day
month_cols = ohe_month_df.columns.tolist()
day_cols = ohe_day_df.columns.tolist()

# Construct the dataframe column names
STFWI_encoded_cols = ['X','Y','FFMC','DMC','DC','ISI']
STFWI_encoded_cols = STFWI_encoded_cols + month_cols + day_cols
STM_encoded_cols = ['X','Y','temp','RH','wind','rain']
STM_encoded_cols = STM_encoded_cols + month_cols + day_cols
FWI_encoded_cols = ['FFMC','DMC','DC','ISI']
M_encoded_cols = ['temp','RH','wind','rain']

# Construct the dataframe column names
STFWI_unencoded_cols = ['X','Y','month','day','FFMC','DMC','DC','ISI']
STM_unencoded_cols = ['X','Y','month','day','temp','RH','wind','rain']
FWI_unencoded_cols = ['FFMC','DMC','DC','ISI']
M_unencoded_cols = ['temp','RH','wind','rain']


In [20]:
# create feature tables for MR, SVM, and NN
STFWI_ohe_encoded_df = ohe_encoded_df[STFWI_encoded_cols]
STM_ohe_encoded_df = ohe_encoded_df[STM_encoded_cols]
FWI_ohe_encoded_df = ohe_encoded_df[FWI_encoded_cols]
M_ohe_encoded_df = ohe_encoded_df[M_encoded_cols]
# Create the feature tables for RF and DT
STFWI_ord_encoded_df = ord_encoded_df[STFWI_unencoded_cols]
STM_ord_encoded_df = ord_encoded_df[STM_unencoded_cols]
FWI_ord_encoded_df = ord_encoded_df[FWI_unencoded_cols]
M_ord_encoded_df = ord_encoded_df[M_unencoded_cols]
# Create the transformed area ln(area+1)
y_xformed = area_df["xfrmd_area"]
y_vanilla = area_df["area"]

In [21]:
print("SVM, MR, and NN Dataset columns:")
print("="*16)
print(f"STFWI Dataset columns:\n{STFWI_ohe_encoded_df.columns}\n")
print(f"STM Dataset columns:\n{STM_ohe_encoded_df.columns}\n")
print(f"FWI Dataset columns:\n{FWI_ohe_encoded_df.columns}\n")
print(f"M Dataset columns:\n{M_ohe_encoded_df.columns}\n")
print("="*16)
print("DT and RF Dataset columns:")
print("="*16)
print(f"STFWI Dataset columns:\n{STFWI_ord_encoded_df.columns}\n")
print(f"STM Dataset columns:\n{STM_ord_encoded_df.columns}\n")
print(f"FWI Dataset columns:\n{FWI_ord_encoded_df.columns}\n")
print(f"M Dataset columns:\n{M_ord_encoded_df.columns}\n")

SVM, MR, and NN Dataset columns:
STFWI Dataset columns:
Index(['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'apr', 'aug', 'dec', 'feb', 'jan',
       'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep', 'fri', 'mon', 'sat',
       'sun', 'thu', 'tue', 'wed'],
      dtype='object')

STM Dataset columns:
Index(['X', 'Y', 'temp', 'RH', 'wind', 'rain', 'apr', 'aug', 'dec', 'feb',
       'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep', 'fri', 'mon',
       'sat', 'sun', 'thu', 'tue', 'wed'],
      dtype='object')

FWI Dataset columns:
Index(['FFMC', 'DMC', 'DC', 'ISI'], dtype='object')

M Dataset columns:
Index(['temp', 'RH', 'wind', 'rain'], dtype='object')

DT and RF Dataset columns:
STFWI Dataset columns:
Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI'], dtype='object')

STM Dataset columns:
Index(['X', 'Y', 'month', 'day', 'temp', 'RH', 'wind', 'rain'], dtype='object')

FWI Dataset columns:
Index(['FFMC', 'DMC', 'DC', 'ISI'], dtype='object')

M Dataset columns:
Index(['temp', 'R

## Decision Tree Dataset Preparation

In [22]:
DT_STFWI_df = STFWI_ord_encoded_df
DT_STM_df = STM_ord_encoded_df
DT_FWI_df = FWI_ord_encoded_df
DT_M_df = M_ord_encoded_df

save_dataset( "DT_STFWI_df", DT_STFWI_df )
save_dataset( "DT_STM_df", DT_STM_df )
save_dataset( "DT_FWI_df", DT_FWI_df )
save_dataset( "DT_M_df", DT_M_df )



## Random Forrest Dataset Preparation

In [23]:
RF_STFWI_df = STFWI_ord_encoded_df
RF_STM_df = STM_ord_encoded_df
RF_FWI_df = FWI_ord_encoded_df
RF_M_df = M_ord_encoded_df

save_dataset( "RF_STFWI_df", RF_STFWI_df )
save_dataset( "RF_STM_df", RF_STM_df )
save_dataset( "RF_FWI_df", RF_FWI_df )
save_dataset( "RF_M_df", RF_M_df )

## SVM Data Set Preparation

In [24]:
# Standardise all attributes to zero mean and SD 1
SVM_STFWI_df = STFWI_ohe_encoded_df
SVM_STM_df = STM_ohe_encoded_df
SVM_FWI_df = FWI_ohe_encoded_df
SVM_M_df = M_ohe_encoded_df

SVM_STFWI_df = standardise_dataset("SVM_STFWI_df", SVM_STFWI_df, STFWI_encoded_cols)
SVM_STM_df = standardise_dataset("SVM_STM_df", SVM_STM_df, STM_encoded_cols)
SVM_FWI_df = standardise_dataset("SVM_FWI_df", SVM_FWI_df, FWI_encoded_cols)
SVM_M_df = standardise_dataset("SVM_M_df", SVM_M_df, M_encoded_cols)

## Multiple Rgression Data Set Preparation

In [25]:
MR_STFWI_df = STFWI_ohe_encoded_df
MR_STM_df = STM_ohe_encoded_df
MR_FWI_df = FWI_ohe_encoded_df
MR_M_df = M_ohe_encoded_df

MR_STFWI_df = standardise_dataset("MR_STFWI_df", MR_STFWI_df, STFWI_encoded_cols)
MR_STM_df = standardise_dataset("MR_STM_df", MR_STM_df, STM_encoded_cols)
MR_FWI_df = standardise_dataset("MR_FWI_df", MR_FWI_df, FWI_encoded_cols)
MR_M_df = standardise_dataset("MR_M_df", MR_M_df, M_encoded_cols)

## Neural Network Dataset Preparation



In [26]:
NN_STFWI_df = STFWI_ohe_encoded_df
NN_STM_df = STM_ohe_encoded_df
NN_FWI_df = FWI_ohe_encoded_df
NN_M_df = M_ohe_encoded_df

NN_STFWI_df = standardise_dataset("NN_STFWI_df", NN_STFWI_df, STFWI_encoded_cols)
NN_STM_df = standardise_dataset("NN_STM_df", NN_STM_df, STM_encoded_cols)
NN_FWI_df = standardise_dataset("NN_FWI_df", NN_FWI_df, FWI_encoded_cols)
NN_M_df = standardise_dataset("NN_M_df", NN_M_df, M_encoded_cols)

# Define All Hyperparameters
Define all the hyperparameters from the paper that will be used in the models.

There hyperparameters are declared in a single location to aid in marking.

In [27]:
# Multiple Regression
fit_intercept = True
copy_X = True
n_jobs = None
positive = False

# Create the Results DataFrame

In [28]:
results_df = pd.DataFrame(columns=['Model','STFWI (RMSE)','STM (RMSE)','FWI (RMSE)','M (RMSE)'])

# Calculate the Naive results

In [29]:
# Generate an array with the predicted values being the mean of the area
y = area_df["area"]
y_pred = np.mean(y)
y_naive_pred = np.repeat(y_pred, len(y))

# Calculate the naive RMSE
naive_rmse = mean_squared_error(y, y_naive_pred, squared=False)
print(naive_rmse)

# Add the naive results to the Results Table
rmse_df = pd.DataFrame([['Naive', naive_rmse, naive_rmse, naive_rmse, naive_rmse]], columns=['Model','STFWI (RMSE)','STM (RMSE)','FWI (RMSE)','M (RMSE)'])
results_df.append(rmse_df)



63.59422598281833


Unnamed: 0,Model,STFWI (RMSE),STM (RMSE),FWI (RMSE),M (RMSE)
0,Naive,63.594226,63.594226,63.594226,63.594226


# Decision Tree

Scoring details:

RMSE: https://scikit-learn.org/stable/modules/model_evaluation.html

https://stackoverflow.com/questions/62514395/score-obtained-from-cross-val-score-is-rmse-or-mse


In [30]:
from sklearn.model_selection import train_test_split

print("DECISION TREE")
print("*"*30)

# Get the list of training / validating datasets
train_set = { "STFWI" : DT_STFWI_df, "STM" : DT_STM_df, "FWI" : DT_FWI_df, "M" : DT_M_df}

# Iterate over each dataset, fit the mode and evaluate the performance with RMSE
for key in train_set:
    print("="*10)
    print(key)

    # Split the dataset for train/test
    X_train, X_test, y_train, y_test = train_test_split( train_set[key], y_xformed, test_size=0.30)

    # Set hyperparameters according to the research paper
    # min_samples_split = 2
    model = DecisionTreeRegressor(min_samples_split = 2)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    rmse = calc_rmse( y_pred, y_test )
    
    # Take the inverse of ln(area+1) to calculate the rmse
    #y_norm = np.exp(y_test)-1
    #y_hat = np.exp(y_pred)-1
    #y_norm[y_norm < 0] = 0
    #y_hat[y_hat < 0] = 0
    
    #y_norm = inverse_log_transform(y_test)
    #y_hat = inverse_log_transform(y_pred)
          
    #rmse = mean_squared_error(y_norm, y_hat, squared=False)
    print(f"RMSE: {rmse}")
    
del train_set

DECISION TREE
******************************
STFWI
RMSE: 95.1968323437938
STM
RMSE: 75.33652932561533
FWI
RMSE: 30.443733592456027
M
RMSE: 110.1539982063748


In [31]:
X_train, X_test, y_train, y_test = train_test_split( y_xformed, y_xformed, test_size=0.30)

y = []
for i in y_train:
    y.append(i+0)

print(calc_rmse( X_train, y))

0.0


# Random Forrest

In [32]:
### TODO - Validate is this shuld be a regressor

In [38]:
from sklearn.ensemble import RandomForestRegressor

print("RANDOM FORREST")
print("*"*30)


# Get the list of training / validating datasets
train_set = { "STFWI" : RF_STFWI_df, "STM" : RF_STM_df, "FWI" : RF_FWI_df, "M" : RF_M_df}

for key in train_set:
    print(key)
    print("="*10)
    
    # Split the dataset for train/test
    X_train, X_test, y_train, y_test = train_test_split( train_set[key], y_xformed, test_size=0.30)

    # Set hyperparameters according to research paper
    # Number of trees: 500
    model = RandomForestRegressor(n_estimators = 500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    rmse = calc_rmse( y_pred, y_test )
    
    print(f"RMSE: {rmse}")
    

del train_set     # Delete the variables you do not want to be picked up accidentally elsewhere.

RANDOM FORREST
******************************
STFWI
RMSE: 87.46664496978836
STM
RMSE: 91.82345550606192
FWI
RMSE: 109.1893603683164
M
RMSE: 35.257379263276825


# Multiple Regression

In [39]:
from sklearn.linear_model import LinearRegression

print("LINEAR REGRESSION")
print("*"*30)

# Get the list of training / validating datasets
train_set = { "STFWI" : MR_STFWI_df, "STM" : MR_STM_df, "FWI" : MR_FWI_df, "M" : MR_M_df}

for key in train_set:
    print(key)
    print("="*10)
    
    # Split the dataset for train/test
    X_train, X_test, y_train, y_test = train_test_split( train_set[key], y_xformed, test_size=0.30)

    # Set hyperparameters according to research paper.
    # Use Least squares method.  Nothing to do. From scikit documentation: 
    # From the implementation point of view, this is just plain Ordinary Least Squares (scipy.linalg.lstsq)"
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    rmse = calc_rmse( y_pred, y_test )
    
    print(f"RMSE: {rmse}")
    
del train_set     # Delete the variables you do not want to be picked up accidentally elsewhere.

LINEAR REGRESSION
******************************
STFWI
RMSE: 67.84119505705644
STM


  yy = np.exp(y)-1


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').