# Support Vector Machines
This notebook goes through the process of using Support Vector Machines to create a decision boundary for adoption speed prediction. The notebook provides a holistic view of the experiments and observations I made along the way. This includes the initial SVM implementation, an attempt to build a five-way classifier out of one-vs-rest binary SVMs, and hyperparameter tuning evaluated with k-fold cross-validation and a paired t-test.

- SVM code adapted from [SciKit Learn documentation](https://scikit-learn.org/stable/modules/svm.html)
- K Fold Cross Validation + TTest adapted from Ashley Pang and Benjamin Denzler

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the training CSV
# read later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 1.3.1 adds some features to SVM that are required
!pip install scikit-learn==1.3.1



In [None]:
# Import crucial modules
import pandas as pd
import numpy as np
from sklearn import svm
from scipy import stats
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

## Data Pre-processing
First, do some initial preprocessing on the data. This includes removing outliers and dropping extraneous features such as the pet name, the free-text description, Pet ID, and Rescuer ID.
### Pre-processing for SVM
SVM will require some further preprocessing, as categorical data is not the best type of data to use for the model.

In [None]:
TRAIN_PATH = '/content/drive/MyDrive/CS235Project/train.csv'

# Basic cleaning the team agreed is sufficient for initial consumption:
# load the CSV, discard the identifier / free-text columns, then discard
# every row that still contains a missing value.
df = (
    pd.read_csv(TRAIN_PATH)
    .drop(columns=['Name', 'RescuerID', 'PetID', 'Description'])
    .dropna()
)

# Absolute z-scores of the two numeric columns that are screened for outliers.
z_scores_fee = np.abs(stats.zscore(df['Fee']))
z_scores_age = np.abs(stats.zscore(df['Age']))

# A row is an outlier when its Fee or its Age lies more than 5 standard
# deviations away from the column mean.
outlier_rows_fee = z_scores_fee > 5
outlier_rows_age = z_scores_age > 5
combined_outlier_rows = np.logical_or(outlier_rows_fee, outlier_rows_age)

# Keep only the rows that were not flagged.
df = df.loc[~combined_outlier_rows]

In [None]:
def _min_max_scale(column):
    """Linearly rescale ``column`` to the range [0, 1].

    A constant column (max == min) is mapped to all zeros instead of the NaNs
    that a 0/0 division would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        return column * 0.0
    return (column - lowest) / span


def _frequency_encode(column):
    """Replace every category id by the number of rows that share that id."""
    return column.map(column.value_counts())


def _append_one_hot(frame, column, labels):
    """Swap ``column`` for 0/1 indicator columns appended on the right.

    ``labels`` maps the raw category codes to the new column names. The dtype is
    pinned to int so the result does not depend on the pandas version (the
    default indicator dtype became bool in pandas 2.0).
    """
    indicators = pd.get_dummies(frame[column].to_numpy(), dtype=int).rename(columns=labels)
    return pd.concat([frame.drop(columns=column), indicators], axis=1)


# Work on a copy with a fresh 0..n-1 index so the positional concatenations
# below line up row by row.
processed_df = df.reset_index(drop=True)

# SVM prefers scaled data, so we must scale all columns
# Type: Ordinal rank between 0-1
processed_df["Type"] = processed_df["Type"].replace({1: 0, 2: 1})

# Age, Fee, Quantity, VideoAmt, PhotoAmt: Scale from 0-1 using Min-Max Scaling.
# (A helper is used instead of variables named `min` / `max`, which would
# shadow the Python builtins for the rest of the notebook.)
for numeric_column in ("Age", "Fee", "Quantity", "VideoAmt", "PhotoAmt"):
    processed_df[numeric_column] = _min_max_scale(processed_df[numeric_column])

# Breed: Get the frequency of each breed id, use that frequency as an ordinal
# rank, and finally min-max scale it down.
for breed_column in ("Breed1", "Breed2"):
    processed_df[breed_column] = _min_max_scale(_frequency_encode(processed_df[breed_column]))

# Vaccinated / Dewormed / Sterilized: Re-rank from Unsure to Yes 0, 0.5, 1
for status_column in ("Vaccinated", "Dewormed", "Sterilized"):
    processed_df[status_column] = processed_df[status_column].replace({1: 1, 2: 0.5, 3: 0})

# Health: Re-rank from 0, 1/3, 2/3, 1 for Not Specified, Serious Injury, Minor Injury, Healthy
processed_df["Health"] = processed_df["Health"].replace({0: 0, 1: 1, 2: 2/3, 3: 1/3})

# Gender: Two indicator columns. Code 1 (male) -> [1, 0], code 2 (female) -> [0, 1],
# any other code (mixed group) -> [1, 1].
gender_codes = processed_df["Gender"]
gender = pd.DataFrame({
    "Gender_Male": (gender_codes != 2).astype(int),
    "Gender_Female": (gender_codes != 1).astype(int),
})
processed_df = pd.concat([processed_df.drop(columns="Gender"), gender], axis=1)

# Color: One-hot encode into black, brown, golden, yellow, cream, gray, white.
# Code 0 (no colour given) becomes a "<column>_None" indicator when it occurs.
color_names = {
    0: "None",
    1: "Black",
    2: "Brown",
    3: "Golden",
    4: "Yellow",
    5: "Cream",
    6: "Gray",
    7: "White",
}
for color_column in ("Color1", "Color2", "Color3"):
    color_labels = {code: color_column + "_" + name for code, name in color_names.items()}
    processed_df = _append_one_hot(processed_df, color_column, color_labels)

# Maturity: One-hot encode into 5 categories (+5 features)
processed_df = _append_one_hot(processed_df, "MaturitySize", {
    0: "MaturitySize_NotSpecified",
    1: "MaturitySize_Small",
    2: "MaturitySize_Medium",
    3: "MaturitySize_Large",
    4: "MaturitySize_ExtraLarge",
})

# Fur Length: One-hot encode into 4 categories (+4 features)
processed_df = _append_one_hot(processed_df, "FurLength", {
    0: "FurLength_NotSpecified",
    1: "FurLength_Short",
    2: "FurLength_Medium",
    3: "FurLength_Long",
})

# State: One-hot encode the location (+15 features)
processed_df = _append_one_hot(processed_df, "State", {
    41336: "State_Johor",
    41325: "State_Kedah",
    41367: "State_Kelantan",
    41401: "State_KualaLampur",
    41415: "State_Labuan",
    41324: "State_Melaka",
    41332: "State_NegeriSembilan",
    41335: "State_Pahang",
    41330: "State_Perak",
    41380: "State_Perlis",
    41327: "State_PulauPinang",
    41345: "State_Sabah",
    41342: "State_Sarawak",
    41326: "State_Selangor",
    41361: "State_Terengganu",
})

display(df)
display(processed_df)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,AdoptionSpeed
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,0,1.0,2
1,2,1,265,0,1,1,2,0,2,2,3,3,3,1,1,0,41401,0,2.0,0
2,1,1,307,0,1,2,7,0,2,2,1,1,2,1,1,0,41326,0,7.0,3
3,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,0,8.0,2
4,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,0,3.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,2,2,266,0,3,1,0,0,2,2,2,2,2,1,4,0,41326,0,3.0,2
14989,2,60,265,264,3,1,4,7,2,2,1,1,1,1,2,0,41326,0,3.0,4
14990,2,2,265,266,3,5,6,7,3,2,2,1,3,1,5,30,41326,0,5.0,3
14991,2,9,266,0,2,4,7,0,1,1,1,1,1,1,1,0,41336,0,3.0,4


Unnamed: 0,Type,Age,Breed1,Breed2,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,...,State_Perak,State_NegeriSembilan,State_Pahang,State_Johor,State_Sarawak,State_Sabah,State_Terengganu,State_Kelantan,State_KualaLampur,State_Labuan
0,1,0.03,0.057777,1.000000,0.5,0.5,0.5,1.0,0.000000,0.250,...,0,0,0,0,0,0,0,0,0,0
1,1,0.01,0.212809,1.000000,0.0,0.0,0.0,1.0,0.000000,0.000,...,0,0,0,0,0,0,0,0,1,0
2,0,0.01,1.000000,1.000000,1.0,1.0,0.5,1.0,0.000000,0.000,...,0,0,0,0,0,0,0,0,0,0
3,0,0.04,1.000000,1.000000,1.0,1.0,0.5,1.0,0.000000,0.375,...,0,0,0,0,0,0,0,0,1,0
4,0,0.01,1.000000,1.000000,0.5,0.5,0.5,1.0,0.000000,0.000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14791,1,0.02,0.613690,1.000000,0.5,0.5,0.5,1.0,0.157895,0.000,...,0,0,0,0,0,0,0,0,0,0
14792,1,0.60,0.212809,0.010837,1.0,1.0,1.0,1.0,0.052632,0.000,...,0,0,0,0,0,0,0,0,0,0
14793,1,0.02,0.212809,0.056163,0.5,1.0,0.0,1.0,0.210526,0.075,...,0,0,0,0,0,0,0,0,0,0
14794,1,0.09,0.613690,1.000000,1.0,1.0,1.0,1.0,0.000000,0.000,...,0,0,0,1,0,0,0,0,0,0


In [None]:
# Partition the data into training, holdout (validation), and test sets.
# The split is sequential (no shuffling) over 14796 total data points:
#   rows [0, 12576)        -> training   (~85%)
#   rows [12576, 14055)    -> validation (~10%, 1479 rows)
#   rows from 14055 onward -> test       (~5%, 741 rows)
size = 14796
train_index = 12576
holdout_index = train_index + 1479
test_index = size - 741


def _features_and_labels(rows):
    """Split a slice of processed_df into (feature matrix, AdoptionSpeed vector), both NumPy arrays."""
    labels = rows["AdoptionSpeed"].to_numpy()
    features = rows.drop(columns="AdoptionSpeed").to_numpy()
    return features, labels


train, train_adoptionspeed = _features_and_labels(processed_df[:train_index])
holdout, holdout_adoptionspeed = _features_and_labels(processed_df[train_index:holdout_index])
test, test_adoptionspeed = _features_and_labels(processed_df[test_index:])


## Initial SVM Implementation

In [None]:
C = 1

# The four baseline models, in the column order of the results table.
baseline_models = [
    # Vanilla SVM (C = 1, kernel = 'rbf', gamma = 'scale')
    svm.SVC(C=C),
    # Sigmoid SVM (C = 1, kernel = 'sigmoid', gamma = 'scale')
    svm.SVC(kernel='sigmoid', C=C, decision_function_shape="ovo"),
    # Polynomial SVM (C = 1, kernel = 'poly', gamma = 'scale')
    svm.SVC(kernel='poly', C=C, decision_function_shape="ovo"),
    # Linear SVM
    svm.LinearSVC(dual="auto", C=C),
]

# One result row per data split: training, holdout, test.
evaluation_splits = (
    (train, train_adoptionspeed),
    (holdout, holdout_adoptionspeed),
    (test, test_adoptionspeed),
)

accuracy_df = [[], [], []]
for clf in baseline_models:
    # Every model is fitted on the training split only, then scored on all three splits.
    clf.fit(train, train_adoptionspeed)
    for split_row, (features, labels) in zip(accuracy_df, evaluation_splits):
        split_row.append(clf.score(features, labels))

accuracy_df = pd.DataFrame(accuracy_df).rename(
    columns={0: "SVM OvO", 1: "SVM Sigmoid", 2: "SVM Poly", 3: "Linear SVM"}
)
display(accuracy_df)

Unnamed: 0,SVM OvO,SVM Sigmoid,SVM Poly,Linear SVM
0,0.433524,0.256918,0.458015,0.364901
1,0.369844,0.27451,0.371197,0.354293
2,0.363023,0.240216,0.365722,0.340081


The accuracy was not very good, so let us try to split the SVM into binary classifiers.

In [None]:
# Find out what the most frequent class is for the dataset, we will build a classifier for that class.
mode_classes = processed_df["AdoptionSpeed"].to_numpy()
values, counts = np.unique(mode_classes, return_counts=True)
print(dict(zip(values, counts)))


def _binary_target(adoption_class):
    """Return a 0/1 vector over all rows: 1 where AdoptionSpeed equals ``adoption_class``, else 0."""
    speeds = processed_df["AdoptionSpeed"].to_numpy()
    return np.where(speeds == adoption_class, 1, 0)


# One one-vs-rest target vector per AdoptionSpeed class 0..4.
data0, data1, data2, data3, data4 = (_binary_target(speed) for speed in range(5))

# Show each target vector followed by its number of positive rows.
for binary_target in (data0, data1, data2, data3, data4):
    display(binary_target)
    print(np.count_nonzero(binary_target))

{0: 403, 1: 3050, 2: 3984, 3: 3223, 4: 4136}


array([0, 1, 0, ..., 0, 0, 0])

403


array([0, 0, 0, ..., 0, 0, 0])

3050


array([1, 0, 0, ..., 0, 0, 0])

3984


array([0, 0, 1, ..., 1, 0, 1])

3223


array([0, 0, 0, ..., 0, 1, 0])

4136


## Five Way Binary SVM Classification

In [None]:
# Mode is 4, so we will create a classifier for AdoptionSpeed of 4.
# Binary targets for class 4, cut at the same row boundaries as the feature matrices.
train4_adoptionspeed, holdout4_adoptionspeed, test4_adoptionspeed = (
    data4[:train_index],
    data4[train_index:holdout_index],
    data4[test_index:],
)

clf4 = svm.SVC(C=0.5, kernel="poly", class_weight="balanced", probability=True)
clf4.fit(train, train4_adoptionspeed)

# NOTE: despite its name, this holds predictions for the *training* features,
# so the F1 / recall / precision printed below are training-set metrics.
test_predictions = clf4.predict(train)

# Accuracy on training, holdout and test, in that order.
for features, labels in (
    (train, train4_adoptionspeed),
    (holdout, holdout4_adoptionspeed),
    (test, test4_adoptionspeed),
):
    print(clf4.score(features, labels))

# F1, recall, precision of the positive class on the training split.
for metric in (f1_score, recall_score, precision_score):
    print(metric(train4_adoptionspeed, test_predictions))

0.7258269720101781
0.691683569979716
0.6558704453441295
0.5706102117061022
0.653824200913242
0.506186478126381


In [None]:
# Binary targets for class 3, cut at the same row boundaries as the feature matrices.
train3_adoptionspeed, holdout3_adoptionspeed, test3_adoptionspeed = (
    data3[:train_index],
    data3[train_index:holdout_index],
    data3[test_index:],
)

clf3 = svm.SVC(kernel="poly", class_weight="balanced", probability=True)
clf3.fit(train, train3_adoptionspeed)

# NOTE: despite its name, this holds predictions for the *training* features,
# so the F1 / recall / precision printed below are training-set metrics.
test_predictions = clf3.predict(train)

# Accuracy on training, holdout and test, in that order.
for features, labels in (
    (train, train3_adoptionspeed),
    (holdout, holdout3_adoptionspeed),
    (test, test3_adoptionspeed),
):
    print(clf3.score(features, labels))

# F1, recall, precision of the positive class on the training split.
for metric in (f1_score, recall_score, precision_score):
    print(metric(train3_adoptionspeed, test_predictions))

0.6551367684478372
0.5821501014198783
0.5681511470985156
0.4647661359990127
0.6810126582278481
0.35275384038965907


In [None]:
# Binary targets for class 2, cut at the same row boundaries as the feature matrices.
train2_adoptionspeed, holdout2_adoptionspeed, test2_adoptionspeed = (
    data2[:train_index],
    data2[train_index:holdout_index],
    data2[test_index:],
)

clf2 = svm.SVC(class_weight="balanced", probability=True)
clf2.fit(train, train2_adoptionspeed)

# NOTE: despite its name, this holds predictions for the *training* features,
# so the F1 / recall / precision printed below are training-set metrics.
test_predictions = clf2.predict(train)

# Accuracy on training, holdout and test, in that order.
for features, labels in (
    (train, train2_adoptionspeed),
    (holdout, holdout2_adoptionspeed),
    (test, test2_adoptionspeed),
):
    print(clf2.score(features, labels))

# F1, recall, precision of the positive class on the training split.
for metric in (f1_score, recall_score, precision_score):
    print(metric(train2_adoptionspeed, test_predictions))

0.5853212468193384
0.5152129817444219
0.5182186234817814
0.49265492752213247
0.7473435655253837
0.3674357858075751


In [None]:
# Binary targets for class 1, cut at the same row boundaries as the feature matrices.
train1_adoptionspeed, holdout1_adoptionspeed, test1_adoptionspeed = (
    data1[:train_index],
    data1[train_index:holdout_index],
    data1[test_index:],
)

clf1 = svm.SVC(class_weight="balanced", probability=True)
clf1.fit(train, train1_adoptionspeed)

# NOTE: despite its name, this holds predictions for the *training* features,
# so the F1 / recall / precision printed below are training-set metrics.
test_predictions = clf1.predict(train)

# Accuracy on training, holdout and test, in that order.
for features, labels in (
    (train, train1_adoptionspeed),
    (holdout, holdout1_adoptionspeed),
    (test, test1_adoptionspeed),
):
    print(clf1.score(features, labels))

# F1, recall, precision of the positive class on the training split.
for metric in (f1_score, recall_score, precision_score):
    print(metric(train1_adoptionspeed, test_predictions))

0.6406647582697201
0.6152805949966194
0.5951417004048583
0.46016007645442597
0.7479611650485437
0.33229813664596275


In [None]:
# Binary targets for class 0, cut at the same row boundaries as the feature matrices.
train0_adoptionspeed, holdout0_adoptionspeed, test0_adoptionspeed = (
    data0[:train_index],
    data0[train_index:holdout_index],
    data0[test_index:],
)

clf0 = svm.SVC(kernel="poly", degree=10, class_weight="balanced", probability=True)
clf0.fit(train, train0_adoptionspeed)

# NOTE: despite its name, this holds predictions for the *training* features,
# so the F1 / recall / precision printed below are training-set metrics.
test_predictions = clf0.predict(train)

# Accuracy on training, holdout and test, in that order.
for features, labels in (
    (train, train0_adoptionspeed),
    (holdout, holdout0_adoptionspeed),
    (test, test0_adoptionspeed),
):
    print(clf0.score(features, labels))

# F1, recall, precision of the positive class on the training split.
for metric in (f1_score, recall_score, precision_score):
    print(metric(train0_adoptionspeed, test_predictions))

0.9708969465648855
0.9290060851926978
0.9230769230769231
0.6527514231499052
1.0
0.48450704225352115


In [None]:
def _one_vs_rest_accuracy(features, true_labels):
    """Accuracy of the five-way classifier assembled from clf0..clf4.

    Each binary classifier contributes its probability for the positive label
    (column 1 of predict_proba). A row is assigned the AdoptionSpeed class whose
    classifier reports the highest such probability; ties go to the lowest class
    index. Returns the fraction of rows whose assigned class equals ``true_labels``.
    """
    positive_probabilities = np.column_stack([
        binary_clf.predict_proba(features)[:, 1]
        for binary_clf in (clf0, clf1, clf2, clf3, clf4)
    ])
    # argmax returns the first maximum, i.e. the lowest class index on ties.
    predicted_classes = np.argmax(positive_probabilities, axis=1)
    return np.count_nonzero(predicted_classes == true_labels) / len(true_labels)


# Combined accuracy on the training, holdout and test splits, in that order.
for features, true_labels in (
    (train, train_adoptionspeed),
    (holdout, holdout_adoptionspeed),
    (test, test_adoptionspeed),
):
    print(_one_vs_rest_accuracy(features, true_labels))

0.46087786259541985
0.37254901960784315
0.3630229419703104


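Unfortunately, the accuracy of the combined binary classifiers is similar to that of scikit-learn's built-in multiclass SVM. Since the extra complexity brings no gain, the rest of the notebook tunes the built-in multiclass implementation instead.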

## Hyperparameter Tuning

In [None]:
# Multiclass polynomial-kernel SVM whose statistics are reported below.
clf = svm.SVC(kernel="poly", C=1, break_ties=True)
clf.fit(train, train_adoptionspeed)

train_predictions = clf.predict(train)
holdout_predictions = clf.predict(holdout)
test_predictions = clf.predict(test)

# (row label, per-class caption, true labels, predicted labels) for every split.
evaluated_splits = (
    ("Training", "Training Statistics Per Class", train_adoptionspeed, train_predictions),
    ("Validation", "Validation Statistics Per Class", holdout_adoptionspeed, holdout_predictions),
    ("Test", "Test Statistics Per Class", test_adoptionspeed, test_predictions),
)
# Metric order shared by the overall table and the per-class tables.
per_class_metrics = (precision_score, recall_score, f1_score)

# Overall table: accuracy plus support-weighted precision / recall / F-score per split.
stats_df = []
for row_label, _, y_true, y_pred in evaluated_splits:
    overall_row = [row_label, accuracy_score(y_true, y_pred)]
    overall_row.extend(
        metric(y_true, y_pred, average='weighted', zero_division=0)
        for metric in per_class_metrics
    )
    stats_df.append(overall_row)

stats_df = pd.DataFrame(stats_df, columns=["", "Accuracy", "Precision", "Recall", "F-Score"])
display(stats_df.style.hide(axis="index").set_caption("Overall Model Statistics"))

# Per-class tables: one row per AdoptionSpeed class, one column per metric.
for _, caption, y_true, y_pred in evaluated_splits:
    class_df = pd.DataFrame([
        metric(y_true, y_pred, average=None, zero_division=0)
        for metric in per_class_metrics
    ]).T
    class_df = class_df.rename(columns={0: "Precision", 1: "Recall", 2: "F-Score"})
    display(class_df.style.set_caption(caption))


Unnamed: 0,Accuracy,Precision,Recall,F-Score
Training,0.459526,0.453993,0.459526,0.443523
Validation,0.370521,0.359769,0.370521,0.35749
Test,0.364372,0.34672,0.364372,0.346972


Unnamed: 0,Precision,Recall,F-Score
0,0.0,0.0,0.0
1,0.451629,0.360777,0.401123
2,0.395012,0.579693,0.469856
3,0.482736,0.267993,0.344651
4,0.534646,0.612158,0.570782


Unnamed: 0,Precision,Recall,F-Score
0,0.0,0.0,0.0
1,0.376866,0.315625,0.343537
2,0.3,0.416667,0.348837
3,0.326203,0.192429,0.242063
4,0.466245,0.544335,0.502273


Unnamed: 0,Precision,Recall,F-Score
0,0.0,0.0,0.0
1,0.362903,0.290323,0.322581
2,0.325806,0.505,0.396078
3,0.205128,0.113475,0.146119
4,0.471616,0.477876,0.474725


In [None]:
# K Fold Cross Validation
# Paired comparison of the C=1 polynomial SVC (`clf`, created in the tuning cell
# above) against a C=0.8 variant. cross_validate fits a fresh clone of the
# estimator on every fold, so neither model has to be fitted beforehand; the
# original extra `clf.fit(...)` call here was redundant and has been dropped.
clf_08 = svm.SVC(kernel="poly", C=0.8, break_ties=True)

k = 10
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(
        precision_score, average='weighted', zero_division=0
    ),
    'recall': make_scorer(
        recall_score, average='weighted', zero_division=0
    )
}

# Both models are evaluated with the same integer cv=k, so fold i of one run is
# paired with fold i of the other in the t-tests below.
scores = cross_validate(
    clf, train, train_adoptionspeed,
    cv=k, scoring=scoring_metrics
)
scores_modified = cross_validate(
    clf_08, train, train_adoptionspeed,
    cv=k, scoring=scoring_metrics
)

# Fold means for the C=1 model.
mean_accuracy = np.mean(scores["test_accuracy"])
mean_precision = np.mean(scores["test_precision"])
mean_recall = np.mean(scores["test_recall"])
print(str(k)+"-fold cross validation accuracy mean: "+ str(mean_accuracy))
print(str(k)+"-fold cross validation precision mean: "+ str(mean_precision))
print(str(k)+"-fold cross validation recall mean: "+ str(mean_recall))
print("\n")

# Fold means for the C=0.8 model.
mean_accuracy_modified = np.mean(scores_modified["test_accuracy"])
mean_precision_modified = np.mean(scores_modified["test_precision"])
mean_recall_modified = np.mean(scores_modified["test_recall"])
print(str(k)+"-fold cross validation accuracy mean: "+ str(mean_accuracy_modified))
print(str(k)+"-fold cross validation precision mean: "+ str(mean_precision_modified))
print(str(k)+"-fold cross validation recall mean: "+ str(mean_recall_modified))

# Paired (related-samples) t-tests on the per-fold scores of the two models.
ttest_accuracy, p_val_accuracy = stats.ttest_rel(scores["test_accuracy"], scores_modified["test_accuracy"])
ttest_precision, p_val_precision = stats.ttest_rel(scores["test_precision"], scores_modified["test_precision"])
ttest_recall, p_val_recall = stats.ttest_rel(scores["test_recall"], scores_modified["test_recall"])

# Each line names the metric its p-value belongs to (the precision and recall
# lines were previously mislabelled "Accuracy p-val").
print(f'\nAccuracy T-test statistic: {ttest_accuracy}, Accuracy p-val: {p_val_accuracy}')
print(f'Precision T-test statistic: {ttest_precision}, Precision p-val: {p_val_precision}')
print(f'Recall T-test statistic: {ttest_recall}, Recall p-val: {p_val_recall}')

10-fold cross validation accuracy mean: 0.36243807333937894
10-fold cross validation precision mean: 0.35065419828457906
10-fold cross validation recall mean: 0.36243807333937894


10-fold cross validation accuracy mean: 0.3607679980977749
10-fold cross validation precision mean: 0.3480907830547285
10-fold cross validation recall mean: 0.3607679980977749

Accuracy T-test statistic: 1.506453590838381, Accuracy p-val: 0.166216538154221
Precision T-test statistic: 2.1823150880944624, Accuracy p-val: 0.05695920998877176
Recall T-test statistic: 1.506453590838381, Accuracy p-val: 0.166216538154221
