In [2]:
# for google colab
from google.colab import drive
# mount your Google Drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# for google colab
# copy all files from "project1" directory in Google drive to current directory
!cp -r ./gdrive/MyDrive/project1/* .

In [16]:
import os
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

# Load Data

In [17]:
# load data from .csv file
train_df = pd.read_csv('train_data.csv')
train_df.head()

# collect features without price range
features = list(train_df.columns)[0:-1]
X, y = train_df[features], train_df['price_range']
# put data into numpy array
X, y = X.values, y.values

# used when testing the performance of smaller training dataset
# X, X_re, y, y_re = train_test_split(X, y, test_size=0.2, random_state=0)

# PCA
Doing PCA on dataset to reduce the dimension

In [6]:
pca = PCA(n_components=0.85)
pca.fit(X)
X = pca.transform(X)

# Cross-validation

In [19]:
def cross_validation(x_train, y_train, k=5):
    returnList = list()
    folds = list()
    # generate numpy array filled with value from 0 to length of x_train for latter use as index
    random_idx = np.arange(len(x_train))
    seed = 120
    np.random.seed(seed)
    # shuffle the index
    np.random.shuffle(random_idx)
    # get the size of each fold
    n_split = len(x_train) // k

    # separate the index into training part and testing part
    keep = 0
    for i in range(k):
        if i < len(x_train) % k: # used when mode != 0
            folds.append(random_idx[keep : keep + n_split + 1])
            keep += (n_split + 1)
        else:
            folds.append(random_idx[keep : keep + n_split])
            keep += n_split

    for i in range(k):
        returnList.append([np.setdiff1d(random_idx, folds[i]), folds[i]])
       
    return returnList

In [20]:
folds_data = cross_validation(X, y, k=5)

# kNN

In [21]:
# loop from 1 to 10 to get the best k
for k in range(1, 11):
  model = KNeighborsClassifier(n_neighbors=k)
  metric1, metric2, metric3, metric4, metric5 = 0, 0, 0, 0, 0

  # loop through different specified training and testing data
  for i in range(5):
    # train the data with kNN classifier
    model.fit(X[folds_data[i][0]], y[folds_data[i][0]])
    # test data
    y_pred = model.predict(X[folds_data[i][1]])

    # sum up all folds' metric value
    metric1 += accuracy_score(y[folds_data[i][1]], y_pred)
    metric2 += precision_score(y[folds_data[i][1]], y_pred, average='weighted')
    metric3 += recall_score(y[folds_data[i][1]], y_pred, average='weighted')
    metric4 += f1_score(y[folds_data[i][1]], y_pred, average='weighted')
    metric5 += roc_auc_score(y[folds_data[i][1]], model.predict_proba(X[folds_data[i][1]]), multi_class='ovr')

  # evaluate the model with accuracy, precision, recall, f1-score, aucroc 
  print("k=%d, accuracy=%.2f%%" % (k, metric1 * 20)) # (100% / 5 folds = 20)
  print("k=%d, precision=%.2f%%" % (k, metric2 * 20))
  print("k=%d, recall=%.2f%%" % (k, metric3 * 20))
  print("k=%d, f1_score=%.2f%%" % (k, metric4 * 20))
  print("k=%d, AUROC=%.2f%%" % (k, metric5 * 20))
  print('\n')

k=1, accuracy=86.18%
k=1, precision=86.36%
k=1, recall=86.18%
k=1, f1_score=86.18%
k=1, AUROC=90.84%


k=2, accuracy=85.83%
k=2, precision=86.29%
k=2, recall=85.83%
k=2, f1_score=85.80%
k=2, AUROC=95.51%


k=3, accuracy=88.61%
k=3, precision=88.71%
k=3, recall=88.61%
k=3, f1_score=88.62%
k=3, AUROC=96.96%


k=4, accuracy=87.92%
k=4, precision=88.20%
k=4, recall=87.92%
k=4, f1_score=87.89%
k=4, AUROC=97.81%


k=5, accuracy=89.38%
k=5, precision=89.52%
k=5, recall=89.38%
k=5, f1_score=89.39%
k=5, AUROC=98.22%


k=6, accuracy=88.75%
k=6, precision=89.12%
k=6, recall=88.75%
k=6, f1_score=88.76%
k=6, AUROC=98.36%


k=7, accuracy=89.51%
k=7, precision=89.73%
k=7, recall=89.51%
k=7, f1_score=89.55%
k=7, AUROC=98.42%


k=8, accuracy=89.24%
k=8, precision=89.56%
k=8, recall=89.24%
k=8, f1_score=89.26%
k=8, AUROC=98.48%


k=9, accuracy=89.44%
k=9, precision=89.60%
k=9, recall=89.44%
k=9, f1_score=89.47%
k=9, AUROC=98.47%


k=10, accuracy=89.65%
k=10, precision=89.79%
k=10, recall=89.65%
k=10, f1

# SVM

In [23]:
# using SVM with different kernel (Gaussian Kernel, Linear Kernel, Polynomial Kernel) to predict Fashion-MNIST
# Gaussian Kernel
model = SVC(kernel='rbf', decision_function_shape='ovr', probability=True)
# Linear Kernel
# model = SVC(kernel='linear', decision_function_shape='ovr', probability=True)
# Polynomial Kernel
# model = SVC(C=10, kernel='poly', gamma="auto", probability=True)
metrics = np.zeros(5)

# loop through different specified training and testing data
for i in range(5):
  # train data
  model.fit(X[folds_data[i][0]], y[folds_data[i][0]])
  # test data
  y_pred = model.predict(X[folds_data[i][1]])

  # sum up all folds' metric value
  metrics[0] += accuracy_score(y[folds_data[i][1]], y_pred)
  metrics[1] += precision_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[2] += recall_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[3] += f1_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[4] += roc_auc_score(y[folds_data[i][1]], model.predict_proba(X[folds_data[i][1]]), multi_class='ovr')
metrics /= 5

# evaluate the model with accuracy, precision, recall, f1-score, aucroc 
print("accuracy=%.2f%%" % (metrics[0] * 100))
print("precision=%.2f%%" % (metrics[1] * 100))
print("recall=%.2f%%" % (metrics[2] * 100))
print("f1_score=%.2f%%" % (metrics[3] * 100))
print("AUROC=%.2f%%" % (metrics[4] * 100))
print('\n')

accuracy=97.01%
precision=97.04%
recall=97.01%
f1_score=97.01%
AUROC=99.89%




# Random Forest

In [24]:
# using random forest with different n_estimators to predict data
n_estimator = [100, 300, 1000]

for n in n_estimator:
  model = RandomForestClassifier(n_estimators=n, max_depth=3, random_state=42)
  metrics = np.zeros(5)

  # loop through different specified training and testing data
  for i in range(5):
    # training the random forest classifier
    model.fit(X[folds_data[i][0]], y[folds_data[i][0]])
    # test data
    y_pred = model.predict(X[folds_data[i][1]])

    # sum up all folds' metric value
    metrics[0] += accuracy_score(y[folds_data[i][1]], y_pred)
    metrics[1] += precision_score(y[folds_data[i][1]], y_pred, average='weighted')
    metrics[2] += recall_score(y[folds_data[i][1]], y_pred, average='weighted')
    metrics[3] += f1_score(y[folds_data[i][1]], y_pred, average='weighted')
    metrics[4] += roc_auc_score(y[folds_data[i][1]], model.predict_proba(X[folds_data[i][1]]), multi_class='ovr')
  metrics /= 5

# evaluate the model with accuracy, precision, recall, f1-score, aucroc
  print("n_estimator: %d" % (n))
  print("accuracy=%.2f%%" % (metrics[0] * 100))
  print("precision=%.2f%%" % (metrics[1] * 100))
  print("recall=%.2f%%" % (metrics[2] * 100))
  print("f1_score=%.2f%%" % (metrics[3] * 100))
  print("AUROC=%.2f%%" % (metrics[4] * 100))
  print('\n')

n_estimator: 100
accuracy=80.35%
precision=80.65%
recall=80.35%
f1_score=79.17%
AUROC=93.58%


n_estimator: 300
accuracy=81.32%
precision=81.77%
recall=81.32%
f1_score=80.21%
AUROC=94.02%


n_estimator: 1000
accuracy=80.76%
precision=81.18%
recall=80.76%
f1_score=79.61%
AUROC=93.91%




# Linear Discriminant Analysis

In [25]:
metrics = np.zeros(5)
# loop through different specified training and testing data
for i in range(5):
  # training the linear discriminant analysis classifier
  model.fit(X[folds_data[i][0]], y[folds_data[i][0]])
  # test data
  y_pred = model.predict(X[folds_data[i][1]])

  # sum up all folds' metric value
  metrics[0] += accuracy_score(y[folds_data[i][1]], y_pred)
  metrics[1] += precision_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[2] += recall_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[3] += f1_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[4] += roc_auc_score(y[folds_data[i][1]], model.predict_proba(X[folds_data[i][1]]), multi_class='ovr')
metrics /= 5

# evaluate the model with accuracy, precision, recall, f1-score, aucroc
print("accuracy=%.2f%%" % (metrics[0] * 100))
print("precision=%.2f%%" % (metrics[1] * 100))
print("recall=%.2f%%" % (metrics[2] * 100))
print("f1_score=%.2f%%" % (metrics[3] * 100))
print("AUROC=%.2f%%" % (metrics[4] * 100))
print('\n')

accuracy=80.76%
precision=81.18%
recall=80.76%
f1_score=79.61%
AUROC=93.91%


