In [1]:
### IMPORTS AND READ CSV
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Initialise dataframe with headers
col_names = ["elevation", "aspect", "slope", "horizontaltohydro", "verticaltohydro", "horizontaltoroadway", "hillshade9am", "hillshadenoon", "hillshade3pm", "horizontaltofirepoints", "wildernessareaRawah", "wildernessareaNeota", "wildernessareaComanche", "wildernessareaCache", "soil1", "soi2", "soil3", "soil4", "soil5", "soil6", "soil7", "soil8", "soil9", "soil10", "soil11", "soil12", "soil13", "soil14", "soil15", "soil16", "soil17", "soil18", "soil19", "soil20", "soil21", "soil22", "soil23", "soil24", "soil25", "soil26", "soil27", "soil28", "soil29", "soil30", "soil31", "soil32", "soil33", "soil34", "soil35", "soil36", "soil37", "soil38", "soil39", "soil40", "target"]
cover_data = pd.read_csv('covtype.data', sep=',', names=col_names)
# Output dataframe contains 8124 entries
cover_data

Unnamed: 0,elevation,aspect,slope,horizontaltohydro,verticaltohydro,horizontaltoroadway,hillshade9am,hillshadenoon,hillshade3pm,horizontaltofirepoints,...,soil32,soil33,soil34,soil35,soil36,soil37,soil38,soil39,soil40,target
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [2]:
### DATA CLEANING
# Remove rows with ANY null values
X1 = cover_data.dropna()
# doesn't actually do anything because all "values" are present in the dataset
# Output dataframe contains 581012 entries
X1

Unnamed: 0,elevation,aspect,slope,horizontaltohydro,verticaltohydro,horizontaltoroadway,hillshade9am,hillshadenoon,hillshade3pm,horizontaltofirepoints,...,soil32,soil33,soil34,soil35,soil36,soil37,soil38,soil39,soil40,target
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [3]:
### Cleaned training and target data, ready to encode, split, and train

# Training data without the target column (22 Categorical Attributes)
X = X1.drop(columns=["target"])

# Initialise target column (Edible or Poisonous) in new dataframe
Y = X1["target"]

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [5]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [21]:
%%time
### Linear SVC

clfSVC = make_pipeline(StandardScaler(), LinearSVC(loss='hinge', random_state=0, tol=1e-5))
clfSVC.fit(X_train, Y_train)

CPU times: total: 10min 59s
Wall time: 11min 1s




In [22]:
predict = clfSVC.predict(X_test)

accuracy = accuracy_score(Y_test, predict)
accuracy

0.7111434300319269

In [8]:
### SVC with SGD
from sklearn.linear_model import SGDClassifier

In [18]:
%%time
clfSGD = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge'))

clfSGD.fit(X_train, Y_train)

CPU times: total: 9.89 s
Wall time: 9.9 s


In [19]:
### Before Cross Validation, SGD Support Vector Machine accuracy when 80-20 split
predict = clfSGD.predict(X_test)

accuracy = accuracy_score(Y_test, predict)
accuracy

0.7136218514151959

In [28]:
%%time
clfSGDLR = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss'))
clfSGDLR.fit(X_train, Y_train)

CPU times: total: 10.1 s
Wall time: 10.2 s


In [29]:
### Before Cross Validation, SGD Logistic Regression accuracy when 80-20 split
predict = clfSGD.predict(X_test)

accuracy = accuracy_score(Y_test, predict)
accuracy

0.7103947402390643

In [30]:
%%time
### 10-fold Cross-validation loss hinge (linear SVC)
from sklearn.model_selection import KFold
accuracy_array_10 = []
k = 10
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGD.fit(X_train, Y_train)
    predict = clfSGD.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_10.append(accuracy)

average_accuracy_10 = sum(accuracy_array_10)/k
print("avg:" + str(average_accuracy_10))


0.6621630924925132
0.8382499741833328
0.7202457789022564
0.6925354124713861
0.5641038880570042
0.6219170065919691
0.6781122528011566
0.7478528768868006
0.5623827472849005
0.5957728782637132
avg:0.6683335907935033
CPU times: total: 2min 4s
Wall time: 1min 52s


In [33]:
%%time
### 5-fold Cross-validation loss hinge (linear SVC)
accuracy_array_5 = []
k = 5
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGD.fit(X_train, Y_train)
    predict = clfSGD.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_5.append(accuracy)

average_accuracy_5 = sum(accuracy_array_5)/k
print("avg:" + str(average_accuracy_5))

0.741865528428698
0.7140521329053466
0.5528992616306088
0.6876387669747509
0.5875372196691967
avg:0.6567985819217202
CPU times: total: 56.4 s
Wall time: 50.4 s


In [25]:
%%time
### 10-fold Cross-validation logistic regression
accuracy_array_10 = []
k = 10
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGDLR.fit(X_train, Y_train)
    predict = clfSGDLR.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_10.append(accuracy)

average_accuracy_10 = sum(accuracy_array_10)/k
print("avg:" + str(average_accuracy_10))

0.6831434374031875
0.8273725517193901
0.7202285674945353
0.7023287034646564
0.5517288859055782
0.6286122441954527
0.6877162183094956
0.7284556203851913
0.5710228739608613
0.6256346706597132
avg:0.6726243773498062
CPU times: total: 2min 10s
Wall time: 1min 57s


In [31]:
%%time
### 5-fold Cross-validation logistic regression
accuracy_array_5 = []
k = 5
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGDLR.fit(X_train, Y_train)
    predict = clfSGDLR.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_5.append(accuracy)

average_accuracy_5 = sum(accuracy_array_5)/k
print("avg:" + str(average_accuracy_5))

0.7481992719637187
0.7040523910742408
0.5630798092976025
0.6998158379373849
0.609886232594964
avg:0.6650067085735822
CPU times: total: 58.6 s
Wall time: 52.5 s
