In [1]:
### IMPORTS AND READ CSV
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Initialise dataframe with headers
col_names = ["elevation", "aspect", "slope", "horizontaltohydro", "verticaltohydro", "horizontaltoroadway", "hillshade9am", "hillshadenoon", "hillshade3pm", "horizontaltofirepoints", "wildernessareaRawah", "wildernessareaNeota", "wildernessareaComanche", "wildernessareaCache", "soil1", "soi2", "soil3", "soil4", "soil5", "soil6", "soil7", "soil8", "soil9", "soil10", "soil11", "soil12", "soil13", "soil14", "soil15", "soil16", "soil17", "soil18", "soil19", "soil20", "soil21", "soil22", "soil23", "soil24", "soil25", "soil26", "soil27", "soil28", "soil29", "soil30", "soil31", "soil32", "soil33", "soil34", "soil35", "soil36", "soil37", "soil38", "soil39", "soil40", "target"]
cover_data = pd.read_csv('covtype.data', sep=',', names=col_names)
# Output dataframe contains 8124 entries

In [2]:
### DATA CLEANING
# Remove rows with ANY null values
X1 = cover_data.dropna()
# Output dataframe contains 581012 entries

In [3]:
### Cleaned training and target data, ready to encode, split, and train

# Training data without the target column (22 Categorical Attributes)
X = X1.drop(columns=["target"])

# Initialise target column (Edible or Poisonous) in new dataframe
Y = X1["target"]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier

In [5]:
# split data for no cross validation training
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [8]:
%%time
# SGD Linear Support Vector Classifier - No Cross Validation -  max_iter = 1000
clfSGD = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge'))

clfSGD.fit(X_train, Y_train)

predict = clfSGD.predict(X_test)

accuracy = accuracy_score(Y_test, predict)
svc_NoCV_1k = accuracy
svc_NoCV_1k

CPU times: total: 10.2 s
Wall time: 10.3 s


In [10]:
%%time
# SGD Logistic Regression Classifier - No Cross Validation - max_iter = 1000
clfSGDLR = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss'))
clfSGDLR.fit(X_train, Y_train)

predict = clfSGDLR.predict(X_test)

accuracy = accuracy_score(Y_test, predict)
lr_NoCV_1k = accuracy
lr_NoCV_1k

CPU times: total: 11.9 s
Wall time: 10.7 s


In [12]:
%%time
### 10-fold Cross-validation loss hinge (linear SVC) - max_iter = 1000
accuracy_array_10 = []
k = 10
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGD.fit(X_train, Y_train)
    predict = clfSGD.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_10.append(accuracy)

average_accuracy_10 = sum(accuracy_array_10)/k
svc_10fold_1k = average_accuracy_10
print("avg:" + str(svc_10fold_1k))


0.6724897593886613
0.83689029637534
0.720056453417325
0.6858057520524604
0.6048949243558631
0.6430009810502401
0.6772688938228257
0.7232577752534379
0.5634154317481627
0.6042064680470216
avg:0.6731286735511338
CPU times: total: 2min 1s
Wall time: 1min 48s


In [13]:
%%time
### 5-fold Cross-validation loss hinge (linear SVC) - max iter = 1000
accuracy_array_5 = []
k = 5
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGD.fit(X_train, Y_train)
    predict = clfSGD.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_5.append(accuracy)

average_accuracy_5 = sum(accuracy_array_5)/k
svc_5fold_1k = average_accuracy_5
print("avg:" + str(svc_5fold_1k))

0.741323373751108
0.7024947720798946
0.6068914476515034
0.6782413383590644
0.5904201304624705
avg:0.6638742124608081
CPU times: total: 56.4 s
Wall time: 49.3 s


In [14]:
%%time
### 10-fold Cross-validation logistic regression - max iter = 1000
accuracy_array_10 = []
k = 10
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGDLR.fit(X_train, Y_train)
    predict = clfSGDLR.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_10.append(accuracy)

average_accuracy_10 = sum(accuracy_array_10)/k
lr_10fold_1k = average_accuracy_10
print("avg:" + str(lr_10fold_1k))

0.709321537984923
0.831761385150253
0.7233782551074852
0.70305158258894
0.5467375776664773
0.6191976041720453
0.7040842670522022
0.7291612881017538
0.5793876181132855
0.6249117915354296
avg:0.6770992907472795
CPU times: total: 2min 10s
Wall time: 1min 56s


In [15]:
%%time
### 5-fold Cross-validation logistic regression - max iter = 1000
accuracy_array_5 = []
k = 5
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGDLR.fit(X_train, Y_train)
    predict = clfSGDLR.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_5.append(accuracy)

average_accuracy_5 = sum(accuracy_array_5)/k
lr_5fold_1k = average_accuracy_5
print("avg:" + str(lr_5fold_1k))

0.7722778241525606
0.7058853902222835
0.5573398048226365
0.6887144799573157
0.5902049878659575
avg:0.6628844974041507
CPU times: total: 59.6 s
Wall time: 52.6 s


In [16]:
# SGD Linear Support Vector Classifier - No Cross Validation - max_iter = 500
clfSGD500 = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge',max_iter=500))

In [17]:
# SGD Logistic Regression Classifier - No Cross Validation - max_iter = 500
clfSGDLR500 = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss',max_iter=500))

In [18]:
%%time
### 10-fold Cross-validation loss hinge (linear SVC)
accuracy_array_10 = []
k = 10
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGD500.fit(X_train, Y_train)
    predict = clfSGD500.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_10.append(accuracy)

average_accuracy_10 = sum(accuracy_array_10)/k
print("avg:" + str(average_accuracy_10))

0.6750542150012048
0.8348938074420846
0.7184041582761054
0.6825872188086264
0.6054801122183784
0.6362024750004303
0.6679747336534655
0.7032064852584293
0.5859279530472797
0.5995077537391783
avg:0.6709238912445183
CPU times: total: 2min 4s
Wall time: 1min 51s


In [19]:
%%time
### 5-fold Cross-validation loss hinge (linear SVC)
accuracy_array_5 = []
k = 5
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGD500.fit(X_train, Y_train)
    predict = clfSGD500.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_5.append(accuracy)

average_accuracy_5 = sum(accuracy_array_5)/k
print("avg:" + str(average_accuracy_5))

0.7410824161166235
0.7014793077631386
0.5691382248154077
0.6903323522830932
0.5931481385862549
avg:0.6590360879129035
CPU times: total: 58.1 s
Wall time: 51.1 s


In [20]:
%%time
### 10-fold Cross-validation logistic regression
accuracy_array_10 = []
k = 10
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGDLR500.fit(X_train, Y_train)
    predict = clfSGDLR500.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_10.append(accuracy)

average_accuracy_10 = sum(accuracy_array_10)/k
print("avg:" + str(average_accuracy_10))

0.6771883928264087
0.8295583628790747
0.7193679971084835
0.7025524517650299
0.5373057262353488
0.6233799762482574
0.6866663224385122
0.7323798213455879
0.569077984888384
0.6236209359563518
avg:0.6701097971691439
CPU times: total: 2min 8s
Wall time: 1min 55s


In [21]:
%%time
### 5-fold Cross-validation logistic regression
accuracy_array_5 = []
k = 5
kf = KFold(n_splits=k, random_state=None)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    
    clfSGDLR500.fit(X_train, Y_train)
    predict = clfSGDLR500.predict(X_test)
    
    accuracy = accuracy_score(Y_test, predict)
    print(accuracy)
    accuracy_array_5.append(accuracy)

average_accuracy_5 = sum(accuracy_array_5)/k
print("avg:" + str(average_accuracy_5))

0.7551009870657384
0.7098697968210803
0.5739746303850192
0.6991015645169618
0.5876663052271045
avg:0.6651426568031809
CPU times: total: 1min 1s
Wall time: 54.2 s
