Make sure to install the requirements before beginning.

In [None]:
!pip install -r requirements.txt

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Uncomment to restore previous session
"""
import dill
dill.load_session('svm_final.db')
"""

import pandas as pd
import numpy as np

from svm_estimator import SVM
from sklearn import svm

from scipy import stats
from sklearn import metrics
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV

import matplotlib.pyplot as plt

# Support Vector Machine

## Preprocessing the Data

In [3]:
# Basic pre-processing that the team decided is sufficient for initial consumption. (Remove age outliers > 180, and drop PetID and RescuerID, and Name)
%env TRAIN_PATH=./data/train/train.csv

%run preprocess.ipynb

processed_df = df.reset_index()
processed_df = processed_df.drop(["index"], axis=1)

# SVM prefers scaled data, so we must scale all columns
# Type: Ordinal rank between 0-1
processed_df["Type"] = processed_df["Type"].replace({1: 0, 2: 1})
# Age: Scale from 0-1 using Min-Max Scaling
min = processed_df["Age"].min()
max = processed_df["Age"].max()
processed_df["Age"] = (processed_df["Age"] - min) / (max - min)
# Breed: Get the Frequency of the Breed in Each Column and ordinally rank based on frequency. Finally, min max scale down.
breed1_count = processed_df["Breed1"].value_counts()
breed2_count = processed_df["Breed2"].value_counts()
processed_df["Breed1"] = processed_df["Breed1"].map(breed1_count)
processed_df["Breed2"] = processed_df["Breed2"].map(breed2_count)
min = processed_df["Breed1"].min()
max = processed_df["Breed1"].max()
processed_df["Breed1"] = (processed_df["Breed1"] - min) / (max - min)
min = processed_df["Breed2"].min()
max = processed_df["Breed2"].max()
processed_df["Breed2"] = (processed_df["Breed2"] - min) / (max - min)
# Gender: One-hot encode into male, female, or both
gender = []
for i in range(0, len(processed_df)):
    if processed_df["Gender"][i] == 1: # Male
        gender.append([1, 0])
    elif processed_df["Gender"][i] == 2: # Female
        gender.append([0, 1])
    else:
        gender.append([1, 1])
gender = pd.DataFrame(gender, columns=["Gender_Male", "Gender_Female"])
processed_df = processed_df.drop("Gender", axis=1)
processed_df = pd.concat([processed_df, gender], axis=1)
# Color: One-hot encode into black, brown, golden, yellow, cream, gray, white (+7 features)
color = pd.get_dummies(processed_df.loc[:,"Color1"].values)
color = color.rename(columns={
    1: "Color1_Black",
    2: "Color1_Brown",
    3: "Color1_Golden",
    4: "Color1_Yellow",
    5: "Color1_Cream",
    6: "Color1_Gray",
    7: "Color1_White"
})
processed_df = processed_df.drop("Color1", axis=1)
processed_df = pd.concat([processed_df, color], axis=1)

color = pd.get_dummies(processed_df.loc[:,"Color2"].values)
color = color.rename(columns={
    0: "Color2_None",
    1: "Color2_Black",
    2: "Color2_Brown",
    3: "Color2_Golden",
    4: "Color2_Yellow",
    5: "Color2_Cream",
    6: "Color2_Gray",
    7: "Color2_White"
})
processed_df = processed_df.drop("Color2", axis=1)
processed_df = pd.concat([processed_df, color], axis=1)

color = pd.get_dummies(processed_df.loc[:,"Color3"].values)
color = color.rename(columns={
    0: "Color3_None",
    1: "Color3_Black",
    2: "Color3_Brown",
    3: "Color3_Golden",
    4: "Color3_Yellow",
    5: "Color3_Cream",
    6: "Color3_Gray",
    7: "Color3_White"
})
processed_df = processed_df.drop("Color3", axis=1)
processed_df = pd.concat([processed_df, color], axis=1)
# Maturity: One-hot encode into 5 categories (+5 features)
maturity = pd.get_dummies(processed_df.loc[:,"MaturitySize"].values)
maturity = maturity.rename(columns={0: "MaturitySize_NotSpecified", 1: "MaturitySize_Small", 2: "MaturitySize_Medium", 3: "MaturitySize_Large", 4: "MaturitySize_ExtraLarge"})
processed_df = processed_df.drop("MaturitySize", axis=1)
processed_df = pd.concat([processed_df, maturity], axis=1)
# Fur Length: One-hot encode into 4 categories (+4 features)
fur = pd.get_dummies(processed_df.loc[:,"FurLength"].values)
fur = fur.rename(columns={0: "FurLength_NotSpecified", 1: "FurLength_Short", 2: "FurLength_Medium", 3: "FurLength_Long"})
processed_df = processed_df.drop("FurLength", axis=1)
processed_df = pd.concat([processed_df, fur], axis=1)
# Vaccinated: Re-rank from Unsure to Yes 0, 0.5, 1
processed_df["Vaccinated"] = processed_df["Vaccinated"].replace({1: 1, 2: 0.5, 3: 0})
# Dewormed: Re-rank from Unsure to Yes 0, 0.5, 1
processed_df["Dewormed"] = processed_df["Dewormed"].replace({1: 1, 2: 0.5, 3: 0})
# Sterilized: Re-rank from Unsure to Yes 0, 0.5, 1
processed_df["Sterilized"] = processed_df["Sterilized"].replace({1: 1, 2: 0.5, 3: 0})
# Health: Re-rank from 0, 1/3, 2/3, 1 for Not Specified, Serious Injury, Minor Injury, Healthy
processed_df["Health"] = processed_df["Health"].replace({0: 0, 1: 1, 2: 2/3, 3: 1/3})
# Fee: Scale from 0-1
min = processed_df["Fee"].min()
max = processed_df["Fee"].max()
processed_df["Fee"] = (processed_df["Fee"] - min) / (max - min)
# Quantity: Scale from 0-1
min = processed_df["Quantity"].min()
max = processed_df["Quantity"].max()
processed_df["Quantity"] = (processed_df["Quantity"] - min) / (max - min)
# State: One-hot encode the location (+15 features)
state = pd.get_dummies(processed_df.loc[:,"State"].values)
state = state.rename(columns={
    41336: "State_Johor",
    41325: "State_Kedah",
    41367: "State_Kelantan",
    41401: "State_KualaLampur",
    41415: "State_Labuan",
    41324: "State_Melaka",
    41332: "State_NegeriSembilan",
    41335: "State_Pahang",
    41330: "State_Perak",
    41380: "State_Perlis",
    41327: "State_PulauPinang",
    41345: "State_Sabah",
    41342: "State_Sarawak",
    41326: "State_Selangor",
    41361: "State_Terengganu"
})
processed_df = processed_df.drop("State", axis=1)
processed_df = pd.concat([processed_df, state], axis=1)
# VideoAmt: Scale from 0-1
min = processed_df["VideoAmt"].min()
max = processed_df["VideoAmt"].max()
processed_df["VideoAmt"] = (processed_df["VideoAmt"] - min) / (max - min)
# PhotoAmt: Scale from 0-1
min = processed_df["PhotoAmt"].min()
max = processed_df["PhotoAmt"].max()
processed_df["PhotoAmt"] = (processed_df["PhotoAmt"] - min) / (max - min)

display(df)
display(processed_df)

env: TRAIN_PATH=./data/train/train.csv


Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,AdoptionSpeed
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,0,1.0,2
1,2,1,265,0,1,1,2,0,2,2,3,3,3,1,1,0,41401,0,2.0,0
2,1,1,307,0,1,2,7,0,2,2,1,1,2,1,1,0,41326,0,7.0,3
3,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,0,8.0,2
4,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,0,3.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,2,2,266,0,3,1,0,0,2,2,2,2,2,1,4,0,41326,0,3.0,2
14989,2,60,265,264,3,1,4,7,2,2,1,1,1,1,2,0,41326,0,3.0,4
14990,2,2,265,266,3,5,6,7,3,2,2,1,3,1,5,30,41326,0,5.0,3
14991,2,9,266,0,2,4,7,0,1,1,1,1,1,1,1,0,41336,0,3.0,4


Unnamed: 0,Type,Age,Breed1,Breed2,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,...,State_Perak,State_NegeriSembilan,State_Pahang,State_Johor,State_Sarawak,State_Sabah,State_Terengganu,State_Kelantan,State_KualaLampur,State_Labuan
0,1,0.03,0.057777,1.000000,0.5,0.5,0.5,1.0,0.000000,0.250,...,0,0,0,0,0,0,0,0,0,0
1,1,0.01,0.212809,1.000000,0.0,0.0,0.0,1.0,0.000000,0.000,...,0,0,0,0,0,0,0,0,1,0
2,0,0.01,1.000000,1.000000,1.0,1.0,0.5,1.0,0.000000,0.000,...,0,0,0,0,0,0,0,0,0,0
3,0,0.04,1.000000,1.000000,1.0,1.0,0.5,1.0,0.000000,0.375,...,0,0,0,0,0,0,0,0,1,0
4,0,0.01,1.000000,1.000000,0.5,0.5,0.5,1.0,0.000000,0.000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14791,1,0.02,0.613690,1.000000,0.5,0.5,0.5,1.0,0.157895,0.000,...,0,0,0,0,0,0,0,0,0,0
14792,1,0.60,0.212809,0.010837,1.0,1.0,1.0,1.0,0.052632,0.000,...,0,0,0,0,0,0,0,0,0,0
14793,1,0.02,0.212809,0.056163,0.5,1.0,0.0,1.0,0.210526,0.075,...,0,0,0,0,0,0,0,0,0,0
14794,1,0.09,0.613690,1.000000,1.0,1.0,1.0,1.0,0.000000,0.000,...,0,0,0,1,0,0,0,0,0,0


In [4]:
# Partition the data into training, holdout, and test sets.
# 14796 total data points
# 12576 as training - ~85%
# 1479 for validation - ~10%
# 741 for training - ~5%

size = 14796
train_index = 12576
holdout_index = train_index + 1479
test_index = size - 741

train = processed_df[:train_index]
train_adoptionspeed = train["AdoptionSpeed"].to_numpy()
train = train.drop(["AdoptionSpeed"], axis=1)
train = train.to_numpy()

holdout = processed_df[train_index:holdout_index]
holdout_adoptionspeed = holdout["AdoptionSpeed"].to_numpy()
holdout = holdout.drop(["AdoptionSpeed"], axis=1)
holdout = holdout.to_numpy()

test = processed_df[test_index:]
test_adoptionspeed = test["AdoptionSpeed"].to_numpy()
test = test.drop(["AdoptionSpeed"], axis=1)
test = test.to_numpy()

## Predict for Each Implementation

In [6]:
clf_sklearn = svm.SVC(kernel="linear", C=1.0)
clf_ours = SVM(C=1.0)

In [7]:
clf_sklearn.fit(train, train_adoptionspeed)
clf_ours.fit(train, train_adoptionspeed)
print(train.shape)

(12576, 55)


In [8]:
prd_sklearn = clf_sklearn.predict(train)
prd_ours = clf_ours.predict(train)

## Initial Correctness Statistics

In [9]:
statistics = [
    [metrics.accuracy_score(train_adoptionspeed,prd_sklearn), metrics.accuracy_score(train_adoptionspeed,prd_ours)], # Accuracy
    [metrics.precision_score(train_adoptionspeed,prd_sklearn, average='macro', zero_division=np.nan), metrics.precision_score(train_adoptionspeed,prd_ours, average='macro', zero_division=np.nan)], # Precision
    [metrics.recall_score(train_adoptionspeed,prd_sklearn, average='macro', zero_division=np.nan), metrics.recall_score(train_adoptionspeed,prd_ours, average='macro', zero_division=np.nan)], # Recall
    [metrics.f1_score(train_adoptionspeed,prd_sklearn, average='macro', zero_division=np.nan), metrics.f1_score(train_adoptionspeed,prd_ours, average='macro', zero_division=np.nan)] # F1 Score
]
statistics = pd.DataFrame(statistics)
statistics = statistics.set_axis(['Accuracy', 'Precision', 'Recall', 'F1 Score'], axis='index')
statistics = statistics.set_axis(['SKLearn', 'From Scratch'], axis='columns')
display(statistics)

Unnamed: 0,SKLearn,From Scratch
Accuracy,0.356632,0.367048
Precision,0.355964,0.315046
Recall,0.271524,0.295385
F1 Score,0.304238,0.2851


## Hyperparameter Tuning

### Randomized Search (SKLearn)

In [12]:
svc = svm.SVC()
param_grid = {
    'C': stats.loguniform(1e-1, 1e3),
    'kernel': ['linear']
}
clf = RandomizedSearchCV(
    svm.SVC(kernel="linear"), param_grid, n_iter=10, n_jobs=-1
)
clf = clf.fit(train, train_adoptionspeed)
print(clf.best_estimator_)

SVC(C=98.05701671998274, kernel='linear')


Let's confirm that C=98.05701671998274 is a good hyperparameter for SKLearn

In [6]:
clf_98 =svm.SVC(C=98.05701671998274, kernel='linear')
clf_1 = svm.SVC(C=1, kernel='linear')

In [7]:
clf_1.fit(train, train_adoptionspeed)

In [8]:
clf_98.fit(train, train_adoptionspeed)

In [11]:
prd98 = clf_98.predict(train)
prd1 = clf_1.predict(train)
print(metrics.f1_score(train_adoptionspeed,prd98, average='macro', zero_division=np.nan))
print(metrics.f1_score(train_adoptionspeed,prd1, average='macro', zero_division=np.nan))
prd98 = clf_98.predict(holdout)
prd1 = clf_1.predict(holdout)
print(metrics.f1_score(holdout_adoptionspeed,prd98, average='macro', zero_division=np.nan))
print(metrics.f1_score(holdout_adoptionspeed,prd1, average='macro', zero_division=np.nan))
prd98 = clf_98.predict(test)
prd1 = clf_1.predict(test)
print(metrics.f1_score(test_adoptionspeed,prd98, average='macro', zero_division=np.nan))
print(metrics.f1_score(test_adoptionspeed,prd1, average='macro', zero_division=np.nan))

0.31208681041683656
0.3042380683282776
0.2952587445773459
0.2884279280276669
0.29714193789794124
0.29329407689126796


Performs better than our previous decision to use the default hyperparameter (C = 1). Unfortunately, grid search takes too long based on the sheer scale of our data set.

### Randomized Search (Scratch)

In [8]:
param_grid = {
    'C': stats.loguniform(1e-1, 1e3)
}
clf = RandomizedSearchCV(
    SVM(), param_grid, n_iter=100, n_jobs=-1
)
clf = clf.fit(train, train_adoptionspeed)
print(clf.best_estimator_)

SVM(C=1.0986214041924145)


### Grid Search (Scratch)

In [14]:
param_grid = [{
    'C': [0.01, 1, 100, 1000]
}]
clf = GridSearchCV(
    SVM(), param_grid, n_jobs=-1
)
clf = clf.fit(train, train_adoptionspeed)
print(clf.best_estimator_)

SVM(C=1)


Based on these results, SKLearn's best C value is 98.05701671998274. The scratch implementation has a best C value of C=1.0986214041924145. Thus, we will use _____ as our C parameter.

## Final Correctness Statistics

In [5]:
final_c_sklearn = 98.05701671998274
final_c_scratch = 1.0986214041924145

clf_sklearn = svm.SVC(kernel="linear", C=final_c_sklearn)
clf_ours = SVM(C=final_c_scratch)

In [6]:
clf_sklearn.fit(train, train_adoptionspeed)
clf_ours.fit(train, train_adoptionspeed)

In [22]:
train_prd_sklearn = clf_sklearn.predict(train)
train_prd_ours = clf_ours.predict(train)

holdout_prd_sklearn = clf_sklearn.predict(holdout)
holdout_prd_ours = clf_ours.predict(holdout)

test_prd_sklearn = clf_sklearn.predict(test)
test_prd_ours = clf_sklearn.predict(test)

In [27]:
%store train_prd_sklearn holdout_prd_sklearn test_prd_sklearn
%store train_prd_ours holdout_prd_ours test_prd_ours

Stored 'train_prd_sklearn' (ndarray)
Stored 'holdout_prd_sklearn' (ndarray)
Stored 'test_prd_sklearn' (ndarray)
Stored 'train_prd_ours' (ndarray)
Stored 'holdout_prd_ours' (ndarray)
Stored 'test_prd_ours' (ndarray)


### APR & F-Score Statistics

In [21]:
train_statistics = [
    [metrics.accuracy_score(train_adoptionspeed, train_prd_sklearn), 
     metrics.accuracy_score(train_adoptionspeed, train_prd_ours)], # Accuracy
    [metrics.precision_score(train_adoptionspeed, train_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.precision_score(train_adoptionspeed, train_prd_ours, average='macro', zero_division=np.nan)], # Precision
    [metrics.recall_score(train_adoptionspeed, train_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.recall_score(train_adoptionspeed, train_prd_ours, average='macro', zero_division=np.nan)], # Recall
    [metrics.f1_score(train_adoptionspeed, train_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.f1_score(train_adoptionspeed, train_prd_ours, average='macro', zero_division=np.nan)] # F1 Score
]
train_statistics = pd.DataFrame(train_statistics)
train_statistics = train_statistics.set_axis(['Accuracy', 'Precision', 'Recall', 'F1 Score'], axis='index')
train_statistics = train_statistics.set_axis(['SKLearn', 'From Scratch'], axis='columns')
train_statistics = train_statistics.style.set_caption("Training Statistics")
display(train_statistics)

Unnamed: 0,SKLearn,From Scratch
Accuracy,0.359017,0.369354
Precision,0.35887,0.362382
Recall,0.274916,0.290884
F1 Score,0.312087,0.279433


In [22]:
holdout_statistics = [
    [metrics.accuracy_score(holdout_adoptionspeed, holdout_prd_sklearn), 
     metrics.accuracy_score(holdout_adoptionspeed, holdout_prd_ours)], # Accuracy
    [metrics.precision_score(holdout_adoptionspeed, holdout_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.precision_score(holdout_adoptionspeed, holdout_prd_ours, average='macro', zero_division=np.nan)], # Precision
    [metrics.recall_score(holdout_adoptionspeed, holdout_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.recall_score(holdout_adoptionspeed, holdout_prd_ours, average='macro', zero_division=np.nan)], # Recall
    [metrics.f1_score(holdout_adoptionspeed, holdout_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.f1_score(holdout_adoptionspeed, holdout_prd_ours, average='macro', zero_division=np.nan)] # F1 Score
]
holdout_statistics = pd.DataFrame(holdout_statistics)
holdout_statistics = holdout_statistics.set_axis(['Accuracy', 'Precision', 'Recall', 'F1 Score'], axis='index')
holdout_statistics = holdout_statistics.set_axis(['SKLearn', 'From Scratch'], axis='columns')
holdout_statistics = holdout_statistics.style.set_caption("Holdout Statistics")
display(holdout_statistics)

Unnamed: 0,SKLearn,From Scratch
Accuracy,0.33739,0.362407
Precision,0.345074,0.363275
Recall,0.25913,0.284282
F1 Score,0.295259,0.338949


In [24]:
test_statistics = [
    [metrics.accuracy_score(test_adoptionspeed, test_prd_sklearn), 
     metrics.accuracy_score(test_adoptionspeed, test_prd_ours)], # Accuracy
    [metrics.precision_score(test_adoptionspeed, test_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.precision_score(test_adoptionspeed, test_prd_ours, average='macro', zero_division=np.nan)], # Precision
    [metrics.recall_score(test_adoptionspeed, test_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.recall_score(test_adoptionspeed, test_prd_ours, average='macro', zero_division=np.nan)], # Recall
    [metrics.f1_score(test_adoptionspeed, test_prd_sklearn, average='macro', zero_division=np.nan), 
     metrics.f1_score(test_adoptionspeed, test_prd_ours, average='macro', zero_division=np.nan)] # F1 Score
]
test_statistics = pd.DataFrame(test_statistics)
test_statistics = test_statistics.set_axis(['Accuracy', 'Precision', 'Recall', 'F1 Score'], axis='index')
test_statistics = test_statistics.set_axis(['SKLearn', 'From Scratch'], axis='columns')
test_statistics = test_statistics.style.set_caption("Test Statistics")
display(test_statistics)

Unnamed: 0,SKLearn,From Scratch
Accuracy,0.346829,0.346829
Precision,0.323856,0.323856
Recall,0.257896,0.257896
F1 Score,0.297142,0.297142


### 10-Fold Cross Validation

In [11]:
scoring_metrics = {
    'accuracy': metrics.make_scorer(metrics.accuracy_score),
    'precision': metrics.make_scorer(
        metrics.precision_score, average='weighted', zero_division=0
    ),
    'recall': metrics.make_scorer(
        metrics.recall_score, average='weighted', zero_division=0
    )
}
# Training Cross Validation
train_scores_sklearn = cross_validate(clf_sklearn, train, train_adoptionspeed, cv=10, scoring=scoring_metrics, n_jobs=-1)
train_scores_ours = cross_validate(clf_ours, train, train_adoptionspeed, cv=10, scoring=scoring_metrics, n_jobs=-1)
# Holdout Cross Validation
holdout_scores_sklearn = cross_validate(clf_sklearn, holdout, holdout_adoptionspeed, cv=10, scoring=scoring_metrics, n_jobs=-1)
holdout_scores_ours = cross_validate(clf_ours, holdout, holdout_adoptionspeed, cv=10, scoring=scoring_metrics, n_jobs=-1)
# Test Cross Validation
test_scores_sklearn = cross_validate(clf_sklearn, test, test_adoptionspeed, cv=10, scoring=scoring_metrics, n_jobs=-1)
test_scores_ours = cross_validate(clf_ours, test, test_adoptionspeed, cv=10, scoring=scoring_metrics, n_jobs=-1)

display(train_scores_sklearn)
"""
cv_stats = pd.DataFrame([
    [],
    [],
    [],
])
cv_stats = cv_stats.set_axis(['Train', 'Validation', 'Test'], axis='index')
cv_stats = cv_stats.set_axis(['SKLearn', 'From Scratch'], axis='columns')
cv_stats = cv_stats.style.set_caption("10-Fold Cross Validation Averages")
"""

{'fit_time': array([753.35637641, 736.06748462, 722.89005733, 735.43659306,
        739.91702461, 728.32660747, 698.40035939, 705.68589687,
        722.15376472, 704.65435576]),
 'score_time': array([0.15527201, 0.15953159, 0.16170716, 0.15614939, 0.15505934,
        0.15706134, 0.20863843, 0.15973568, 0.17726898, 0.17410755]),
 'test_accuracy': array([0.33942766, 0.3672496 , 0.33386328, 0.34578696, 0.32352941,
        0.35135135, 0.35322196, 0.33890215, 0.36515513, 0.35719968]),
 'test_precision': array([0.33208893, 0.36728758, 0.32017292, 0.33154896, 0.29107365,
        0.32790363, 0.34302788, 0.31845195, 0.35079827, 0.34860243]),
 'test_recall': array([0.33942766, 0.3672496 , 0.33386328, 0.34578696, 0.32352941,
        0.35135135, 0.35322196, 0.33890215, 0.36515513, 0.35719968])}

'\ncv_stats = pd.DataFrame([\n    [],\n    [],\n    [],\n])\ncv_stats = cv_stats.set_axis([\'Train\', \'Validation\', \'Test\'], axis=\'index\')\ncv_stats = cv_stats.set_axis([\'SKLearn\', \'From Scratch\'], axis=\'columns\')\ncv_stats = cv_stats.style.set_caption("10-Fold Cross Validation Averages")\n'

In [11]:
cv_stats = pd.DataFrame([
    [np.mean(train_scores_sklearn['test_accuracy']), np.mean(holdout_scores_sklearn['test_accuracy']), np.mean(test_scores_sklearn['test_accuracy'])], # Accuracy
    [np.mean(train_scores_sklearn['test_precision']), np.mean(holdout_scores_sklearn['test_precision']), np.mean(test_scores_sklearn['test_precision'])], # Precision
    [np.mean(train_scores_sklearn['test_recall']), np.mean(holdout_scores_sklearn['test_recall']), np.mean(test_scores_sklearn['test_recall'])], # Recall
])
cv_stats = cv_stats.set_axis(['Train', 'Validation', 'Test'], axis='columns')
cv_stats = cv_stats.set_axis(['Accuracy', 'Precision', 'Recall'], axis='index')
cv_stats = cv_stats.style.set_caption("10-Fold Cross Validation Averages - SKLearn")
display(cv_stats)

cv_stats = pd.DataFrame([
    [np.mean(train_scores_ours['test_accuracy']), np.mean(holdout_scores_ours['test_accuracy']), np.mean(test_scores_ours['test_accuracy'])], # Accuracy
    [np.mean(train_scores_ours['test_precision']), np.mean(holdout_scores_ours['test_precision']), np.mean(test_scores_ours['test_precision'])], # Precision
    [np.mean(train_scores_ours['test_recall']), np.mean(holdout_scores_ours['test_recall']), np.mean(test_scores_ours['test_recall'])], # Recall
])
cv_stats = cv_stats.set_axis(['Train', 'Validation', 'Test'], axis='columns')
cv_stats = cv_stats.set_axis(['Accuracy', 'Precision', 'Recall'], axis='index')
cv_stats = cv_stats.style.set_caption("10-Fold Cross Validation Averages - Ours")
display(cv_stats)

Unnamed: 0,Train,Validation,Test
Accuracy,0.347569,0.327234,0.325135
Precision,0.333096,0.308028,0.296455
Recall,0.347569,0.327234,0.325135


Unnamed: 0,Train,Validation,Test
Accuracy,0.354646,0.3225,0.330649
Precision,0.353695,0.31566,0.270938
Recall,0.354646,0.3225,0.330649


### T-Test Significance Analysis and P-Value

In [12]:
# T-Test statistics adapted from Benjamin Denzler
ttest_stats = []
pval_stats = []

In [13]:
ttest_accuracy, p_val_accuracy = stats.ttest_rel(train_scores_sklearn["test_accuracy"], train_scores_ours["test_accuracy"])
ttest_precision, p_val_precision = stats.ttest_rel(train_scores_sklearn["test_precision"], train_scores_ours["test_precision"])
ttest_recall, p_val_recall = stats.ttest_rel(train_scores_sklearn["test_recall"], train_scores_ours["test_recall"])
ttest_stats.append([ttest_accuracy, ttest_precision, ttest_recall])
pval_stats.append([p_val_accuracy, p_val_precision, p_val_recall])

In [14]:
ttest_accuracy, p_val_accuracy = stats.ttest_rel(holdout_scores_sklearn["test_accuracy"], holdout_scores_ours["test_accuracy"])
ttest_precision, p_val_precision = stats.ttest_rel(holdout_scores_sklearn["test_precision"], holdout_scores_ours["test_precision"])
ttest_recall, p_val_recall = stats.ttest_rel(holdout_scores_sklearn["test_recall"], holdout_scores_ours["test_recall"])
ttest_stats.append([ttest_accuracy, ttest_precision, ttest_recall])
pval_stats.append([p_val_accuracy, p_val_precision, p_val_recall])

In [15]:
ttest_accuracy, p_val_accuracy = stats.ttest_rel(test_scores_sklearn["test_accuracy"], test_scores_ours["test_accuracy"])
ttest_precision, p_val_precision = stats.ttest_rel(test_scores_sklearn["test_precision"], test_scores_ours["test_precision"])
ttest_recall, p_val_recall = stats.ttest_rel(test_scores_sklearn["test_recall"], test_scores_ours["test_recall"])
ttest_stats.append([ttest_accuracy, ttest_precision, ttest_recall])
pval_stats.append([p_val_accuracy, p_val_precision, p_val_recall])

In [17]:
%store ttest_stats
%store pval_stats

Stored 'ttest_stats' (list)
Stored 'pval_stats' (list)


In [20]:
display(ttest_stats)
display(pval_stats)

[[-2.073458481200585, -4.4567541885701365, -2.073458481200585],
 [0.5499331609275936, -0.7725471732966078, 0.5499331609275936],
 [-0.40940860530315815, 1.7301995246137032, -0.40940860530315815]]

[[0.06798205603147088, 0.0015846629689682111, 0.06798205603147088],
 [0.5957468181365316, 0.45959647814900906, 0.5957468181365316],
 [0.6918119904595366, 0.11764820038903988, 0.6918119904595366]]

### Mean Statistics

In [25]:
mae_sk = [
    metrics.mean_absolute_error(train_adoptionspeed, train_prd_sklearn),
    metrics.mean_absolute_error(holdout_adoptionspeed, holdout_prd_sklearn),
    metrics.mean_absolute_error(test_adoptionspeed, test_prd_sklearn)
]
mae_ours = [
    metrics.mean_absolute_error(train_adoptionspeed, train_prd_ours),
    metrics.mean_absolute_error(holdout_adoptionspeed, holdout_prd_ours),
    metrics.mean_absolute_error(test_adoptionspeed, test_prd_ours)
]
mse_sk = [
    metrics.mean_squared_error(train_adoptionspeed, train_prd_sklearn),
    metrics.mean_squared_error(holdout_adoptionspeed, holdout_prd_sklearn),
    metrics.mean_squared_error(test_adoptionspeed, test_prd_sklearn)
]
mse_ours = [
    metrics.mean_squared_error(train_adoptionspeed, train_prd_ours),
    metrics.mean_squared_error(holdout_adoptionspeed, holdout_prd_ours),
    metrics.mean_squared_error(test_adoptionspeed, test_prd_ours)
]
rmse_sk = [
    metrics.mean_squared_error(train_adoptionspeed, train_prd_sklearn, squared=False),
    metrics.mean_squared_error(holdout_adoptionspeed, holdout_prd_sklearn, squared=False),
    metrics.mean_squared_error(test_adoptionspeed, test_prd_sklearn, squared=False)    
]
rmse_ours = [
    metrics.mean_squared_error(train_adoptionspeed, train_prd_ours, squared=False),
    metrics.mean_squared_error(holdout_adoptionspeed, holdout_prd_ours, squared=False),
    metrics.mean_squared_error(test_adoptionspeed, test_prd_ours, squared=False)    
]
msle_sk = [
    metrics.mean_squared_log_error(train_adoptionspeed, train_prd_sklearn),
    metrics.mean_squared_log_error(holdout_adoptionspeed, holdout_prd_sklearn),
    metrics.mean_squared_log_error(test_adoptionspeed, test_prd_sklearn)
]
msle_ours = [
    metrics.mean_squared_log_error(train_adoptionspeed, train_prd_ours),
    metrics.mean_squared_log_error(holdout_adoptionspeed, holdout_prd_ours),
    metrics.mean_squared_log_error(test_adoptionspeed, test_prd_ours)
]
display(mae_sk)

[1.0319656488549618, 1.0919540229885059, 1.098515519568151]

In [29]:
stats_sklearn = pd.DataFrame([mae_sk, mse_sk, rmse_sk, msle_sk])
stats_sklearn = stats_sklearn.set_axis(['Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error', 'Mean Squared Log Error'], axis='index')
stats_sklearn = stats_sklearn.set_axis(['Training', 'Validation', 'Test'], axis='columns')
stats_sklearn = stats_sklearn.style.set_caption("SKLearn Implementation Statistics")
stats_ours = pd.DataFrame([mae_ours, mse_ours, rmse_ours, msle_ours])
stats_ours = stats_ours.set_axis(['Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error', 'Mean Squared Log Error'], axis='index')
stats_ours = stats_ours.set_axis(['Training', 'Validation', 'Test'], axis='columns')
stats_ours = stats_ours.style.set_caption("From Scratch Implementation Statistics")
display(stats_sklearn)
display(stats_ours)

Unnamed: 0,Training,Validation,Test
Mean Absolute Error,1.031966,1.091954,1.098516
Mean Squared Error,2.015426,2.179175,2.218623
Root Mean Squared Error,1.419657,1.476203,1.489504
Mean Squared Log Error,0.196629,0.213084,0.208667


Unnamed: 0,Training,Validation,Test
Mean Absolute Error,1.024571,1.079108,1.098516
Mean Squared Error,2.024889,2.221771,2.218623
Root Mean Squared Error,1.422986,1.490561,1.489504
Mean Squared Log Error,0.200016,0.219671,0.208667
