In [1]:
# for google colab
from google.colab import drive
# mount your Google Drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# for google colab
# copy all files from "project1" directory in Google drive to current directory
!cp -r ./gdrive/MyDrive/project1/* .

In [50]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, ParameterGrid
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [8]:
colabPath = '/content/gdrive/MyDrive/project1'
# unzip BBCNews.zip
zipPath = os.path.join(colabPath, 'BBCNews.zip')
!unzip $zipPath 

Archive:  /content/gdrive/MyDrive/project1/BBCNews.zip
  inflating: business/0.txt          
  inflating: business/1.txt          
  inflating: business/10.txt         
  inflating: business/11.txt         
  inflating: business/12.txt         
  inflating: business/13.txt         
  inflating: business/14.txt         
  inflating: business/15.txt         
  inflating: business/16.txt         
  inflating: business/17.txt         
  inflating: business/18.txt         
  inflating: business/19.txt         
  inflating: business/2.txt          
  inflating: business/20.txt         
  inflating: business/21.txt         
  inflating: business/22.txt         
  inflating: business/23.txt         
  inflating: business/24.txt         
  inflating: business/25.txt         
  inflating: business/26.txt         
  inflating: business/27.txt         
  inflating: business/28.txt         
  inflating: business/29.txt         
  inflating: business/3.txt          
  inflating: business/30.txt     

# Load Data

In [62]:
# different categories collected from BBC News
cats = ['business', 'technology', 'science-environment', 'entertainment-arts']
X = []
y = []

# Convert a collection of raw documents to a matrix of TF-IDF features.
vectorizer = TfidfVectorizer(stop_words='english')
# load data from files
for idx, cat in enumerate(cats):
    cat_dir = f'{ cat }'
    for file_name in os.listdir(cat_dir):
        with open(f'{ cat_dir }/{ file_name }') as file:
            article = file.read()
            X.append(article)
            y.append(idx)

X = vectorizer.fit_transform(X).toarray()
y = np.array(y)
# used when testing the performance of smaller training dataset
X, X_re, y, y_re = train_test_split(X, y, test_size=0.2, random_state=0)

# Cross-validation

In [19]:
def cross_validation(x_train, y_train, k=5):
    returnList = list()
    folds = list()
    # generate numpy array filled with value from 0 to length of x_train for latter use as index
    random_idx = np.arange(len(x_train))
    seed = 120
    np.random.seed(seed)
    # shuffle the index
    np.random.shuffle(random_idx)
    # get the size of each fold
    n_split = len(x_train) // k

    # separate the index into training part and testing part
    keep = 0
    for i in range(k):
        if i < len(x_train) % k: # used when mode != 0
            folds.append(random_idx[keep : keep + n_split + 1])
            keep += (n_split + 1)
        else:
            folds.append(random_idx[keep : keep + n_split])
            keep += n_split

    for i in range(k):
        returnList.append([np.setdiff1d(random_idx, folds[i]), folds[i]])
       
    return returnList

In [63]:
folds_data = cross_validation(X, y, k=5)

# kNN

In [64]:
# loop from 1 to 10 to get the best k
for k in range(1, 11):
  model = KNeighborsClassifier(n_neighbors=k)
  metric1, metric2, metric3, metric4, metric5 = 0, 0, 0, 0, 0

  # loop through different specified training and testing data
  for i in range(5):
    # train the data with kNN classifier
    model.fit(X[folds_data[i][0]], y[folds_data[i][0]])
    # test data
    y_pred = model.predict(X[folds_data[i][1]])

    # sum up all folds' metric value
    metric1 += accuracy_score(y[folds_data[i][1]], y_pred)
    metric2 += precision_score(y[folds_data[i][1]], y_pred, average='weighted')
    metric3 += recall_score(y[folds_data[i][1]], y_pred, average='weighted')
    metric4 += f1_score(y[folds_data[i][1]], y_pred, average='weighted')
    metric5 += roc_auc_score(y[folds_data[i][1]], model.predict_proba(X[folds_data[i][1]]), multi_class='ovr')

  # evaluate the model with accuracy, precision, recall, f1-score, aucroc 
  print("k=%d, accuracy=%.2f%%" % (k, metric1 * 20)) # (100% / 5 folds = 20)
  print("k=%d, precision=%.2f%%" % (k, metric2 * 20))
  print("k=%d, recall=%.2f%%" % (k, metric3 * 20))
  print("k=%d, f1_score=%.2f%%" % (k, metric4 * 20))
  print("k=%d, AUROC=%.2f%%" % (k, metric5 * 20))
  print('\n')

k=1, accuracy=90.62%
k=1, precision=91.69%
k=1, recall=90.62%
k=1, f1_score=90.58%
k=1, AUROC=94.07%


k=2, accuracy=83.12%
k=2, precision=85.16%
k=2, recall=83.12%
k=2, f1_score=82.92%
k=2, AUROC=94.76%


k=3, accuracy=85.62%
k=3, precision=87.02%
k=3, recall=85.62%
k=3, f1_score=85.87%
k=3, AUROC=95.71%


k=4, accuracy=85.62%
k=4, precision=87.32%
k=4, recall=85.62%
k=4, f1_score=85.66%
k=4, AUROC=95.87%


k=5, accuracy=85.62%
k=5, precision=87.47%
k=5, recall=85.62%
k=5, f1_score=85.80%
k=5, AUROC=96.74%


k=6, accuracy=83.75%
k=6, precision=85.83%
k=6, recall=83.75%
k=6, f1_score=83.90%
k=6, AUROC=96.18%


k=7, accuracy=84.38%
k=7, precision=86.68%
k=7, recall=84.38%
k=7, f1_score=84.93%
k=7, AUROC=95.91%


k=8, accuracy=83.75%
k=8, precision=87.06%
k=8, recall=83.75%
k=8, f1_score=84.37%
k=8, AUROC=96.49%


k=9, accuracy=84.38%
k=9, precision=87.37%
k=9, recall=84.38%
k=9, f1_score=84.98%
k=9, AUROC=96.25%


k=10, accuracy=83.75%
k=10, precision=87.16%
k=10, recall=83.75%
k=10, f1

# SVM

In [65]:
# using SVM with different kernel (Gaussian Kernel, Linear Kernel, Polynomial Kernel) to predict Fashion-MNIST
# Gaussian Kernel
model = SVC(kernel='rbf', decision_function_shape='ovr', probability=True)
# Linear Kernel
# model = SVC(kernel='linear', decision_function_shape='ovr', probability=True)
# Polynomial Kernel
# model = SVC(C=10, kernel='poly', gamma="auto", probability=True)
metrics = np.zeros(5)

# loop through different specified training and testing data
for i in range(5):
  # train data
  model.fit(X[folds_data[i][0]], y[folds_data[i][0]])
  # test data
  y_pred = model.predict(X[folds_data[i][1]])

  # sum up all folds' metric value
  metrics[0] += accuracy_score(y[folds_data[i][1]], y_pred)
  metrics[1] += precision_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[2] += recall_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[3] += f1_score(y[folds_data[i][1]], y_pred, average='weighted')
  metrics[4] += roc_auc_score(y[folds_data[i][1]], model.predict_proba(X[folds_data[i][1]]), multi_class='ovr')
metrics /= 5

# evaluate the model with accuracy, precision, recall, f1-score, aucroc 
print("accuracy=%.2f%%" % (metrics[0] * 100))
print("precision=%.2f%%" % (metrics[1] * 100))
print("recall=%.2f%%" % (metrics[2] * 100))
print("f1_score=%.2f%%" % (metrics[3] * 100))
print("AUROC=%.2f%%" % (metrics[4] * 100))
print('\n')

accuracy=89.38%
precision=90.85%
recall=89.38%
f1_score=89.44%
AUROC=97.77%




# Random forest

In [66]:
# using random forest with different n_estimators to predict data
n_estimator = [100, 300, 1000]

for n in n_estimator:
  model = RandomForestClassifier(n_estimators=n, max_depth=3, random_state=42)
  metrics = np.zeros(5)

  # loop through different specified training and testing data
  for i in range(5):
    # training the random forest classifier
    model.fit(X[folds_data[i][0]], y[folds_data[i][0]])
    # test data
    y_pred = model.predict(X[folds_data[i][1]])

    # sum up all folds' metric value
    metrics[0] += accuracy_score(y[folds_data[i][1]], y_pred)
    metrics[1] += precision_score(y[folds_data[i][1]], y_pred, average='weighted')
    metrics[2] += recall_score(y[folds_data[i][1]], y_pred, average='weighted')
    metrics[3] += f1_score(y[folds_data[i][1]], y_pred, average='weighted')
    metrics[4] += roc_auc_score(y[folds_data[i][1]], model.predict_proba(X[folds_data[i][1]]), multi_class='ovr')
  metrics /= 5

# evaluate the model with accuracy, precision, recall, f1-score, aucroc
  print("n_estimator: %d" % (n))
  print("accuracy=%.2f%%" % (metrics[0] * 100))
  print("precision=%.2f%%" % (metrics[1] * 100))
  print("recall=%.2f%%" % (metrics[2] * 100))
  print("f1_score=%.2f%%" % (metrics[3] * 100))
  print("AUROC=%.2f%%" % (metrics[4] * 100))
  print('\n')

n_estimator: 100
accuracy=71.25%
precision=83.76%
recall=71.25%
f1_score=71.99%
AUROC=95.75%


n_estimator: 300
accuracy=75.62%
precision=85.14%
recall=75.62%
f1_score=76.41%
AUROC=96.82%


n_estimator: 1000
accuracy=76.88%
precision=85.89%
recall=76.88%
f1_score=77.66%
AUROC=96.94%


