In [16]:
from google.colab import drive
drive.mount('/content/gdrive')
import torch
import torchvision
import torchvision.transforms as transforms
from matplotlib import  pyplot as plt
import _pickle as cPickle
import pickle
import os.path
import warnings
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
# Absolute paths anchored at the Drive mount point ('/content/gdrive'), so
# the dataset and model locations do not depend on the kernel's current
# working directory.
projPath = '/content/gdrive/MyDrive/CSE498-homework/hw1'
dataDir = f'{projPath}/db/'      # root directory handed to the torchvision datasets
modelDir = f'{projPath}/model/'  # directory the pickled models are written to

Mounted at /content/gdrive


# MNIST Dataset



## Prepare Dataset

In [None]:
# Load both MNIST splits from dataDir (nothing is downloaded) without any
# per-sample transform; the pixel data is read straight from `.data`.
transform = None
trainset = torchvision.datasets.MNIST(
  dataDir, train=True, download=False, transform=transform
)
testset = torchvision.datasets.MNIST(
  dataDir, train=False, download=False, transform=transform
)

# `.data` is a 3-D tensor: samples first, then the two image dimensions.
nTrainSamples, width, height = trainset.data.shape
nTestSamples, width, height = testset.data.shape
print(f'# train samples: {nTrainSamples} | # test samples:{nTestSamples}')
print(f'per image size: {width}*{height}')

# Flatten every image into one feature row; labels become 1-D vectors.
Xtrain = trainset.data.view(nTrainSamples, width * height)
Ytrain = trainset.targets.view(nTrainSamples)
Xtest = testset.data.view(nTestSamples, width * height)
Ytest = testset.targets.view(nTestSamples)

# train samples: 60000 | # test samples:10000
per image size: 28*28


In [None]:
# Per-class sample counts for the train labels, then the test labels.
# The classes are almost evenly distributed.
print(
  torch.bincount(Ytrain.view(-1)),
  torch.bincount(Ytest.view(-1)),
  sep='\n',
)

tensor([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949])
tensor([ 980, 1135, 1032, 1010,  982,  892,  958, 1028,  974, 1009])


In [None]:
# generic training + parameter tuning
def train_and_tune(X, y, model, parameters, scoring='f1_macro', kfold=5, verbose=0, n_jobs=-1):
  """
    Grid-search `parameters` for `model` with cross-validation and return
    the winning estimator.

    @X: array-like of shape (n_samples, n_features)
    @y: array-like of shape (n_samples,)
    @model: (object) an sklearn estimator instance (not the class itself)
    @parameters: (dict) maps each parameter name to the list of values to try
    @scoring: (str) the metric used to evaluate the quality of each candidate
    @kfold: (int) forwarded to GridSearchCV as `cv`
    @verbose: (int) forwarded to GridSearchCV as `verbose`
    @n_jobs: (int) forwarded to GridSearchCV as `n_jobs`; the default -1
             keeps the previous behaviour, pass a smaller value to limit
             the number of parallel workers

    return: the `best_estimator_` of the fitted GridSearchCV object
  """
  cvSearchObj = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    n_jobs=n_jobs,
    cv=kfold,
    verbose=verbose,
  )
  cvSearchObj.fit(X, y)
  return cvSearchObj.best_estimator_

## Decision Tree Classifier

In [None]:
# train: grid-search the minimum leaf size of a decision tree (5-fold CV)
parameters = dict(min_samples_leaf=[1, 2, 3])
dtree = DecisionTreeClassifier(random_state=498)
dtreeBest = train_and_tune(
  Xtrain, Ytrain, dtree, parameters,
  scoring='f1_macro',
  kfold=5,
)

In [None]:
# test
# Score the tuned decision tree on the test split.
# sklearn metrics take their arguments as (y_true, y_pred), in that order.
Ypred = dtreeBest.predict(Xtest)
print(f'Classification accuracy:{accuracy_score(Ytest, Ypred):6.5f} | Macro-F1 score:{f1_score(Ytest, Ypred, average="macro"):6.5f}')

Classification accuracy:0.88120 | Macro-F1 score:0.87962


In [None]:
# save
# Pickle the tuned decision tree under modelDir. An existing file is never
# overwritten: the cell only warns, so a re-run cannot clobber a saved model.
modelPath = modelDir+ '{}.pkl'.format('decisionTree')
if os.path.exists(modelPath):
  warnings.warn(f"Model exists at {modelPath}")
else:
  # open() does not create missing parent directories.
  os.makedirs(modelDir, exist_ok=True)
  with open(modelPath,"wb") as f:
    pickle.dump(dtreeBest, f)
  print(f"Model is saved at {modelPath}")

Model is saved at ./gdrive/MyDrive/CSE498-homework/hw1/model/decisionTree.pkl


## GradientBoosting Classifier

In [None]:
# train: grid-search learning rate and ensemble size for gradient boosting
# (3-fold CV, progress logging enabled)
parameters = dict(learning_rate=[0.05, 0.1], n_estimators=[50, 100])
gb = GradientBoostingClassifier(random_state=498)
gbBest = train_and_tune(
  Xtrain, Ytrain, gb, parameters,
  scoring='f1_macro',
  kfold=3,
  verbose=10,
)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed: 87.6min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 179.1min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 273.6min finished


In [None]:
# test
# Score the tuned gradient-boosting model on the test split.
# sklearn metrics take their arguments as (y_true, y_pred), in that order.
Ypred = gbBest.predict(Xtest)
print(f'Classification accuracy:{accuracy_score(Ytest, Ypred):6.5f} | Macro-F1 score:{f1_score(Ytest, Ypred, average="macro"):6.5f}')

Classification accuracy:0.94580 | Macro-F1 score:0.94528


In [None]:
# save
# Pickle the tuned gradient-boosting model under modelDir. An existing file
# is never overwritten: the cell only warns in that case.
modelPath = modelDir+ '{}.pkl'.format('gradientBoosting')
if os.path.exists(modelPath):
  warnings.warn(f"Model exists at {modelPath}")
else:
  # open() does not create missing parent directories.
  os.makedirs(modelDir, exist_ok=True)
  with open(modelPath,"wb") as f:
    pickle.dump(gbBest, f)
  print(f"Model is saved at {modelPath}")

Model is saved at ./gdrive/MyDrive/CSE498-homework/hw1/model/gradientBoosting.pkl


## Naive Bayes

In [None]:
# Gaussian naive Bayes: each feature is assumed to be Gaussian given the class.
# X_i: i-th feature; mu_c: class mean; sigma_c^2: class variance
#   X_i ~ N(mu_c, sigma_c^2)

# train: tune the variance-smoothing term with 5-fold CV
parameters = dict(var_smoothing=[1, 1e-1, 1e-3, 1e-9])
nb = GaussianNB()
nbBest = train_and_tune(
  Xtrain, Ytrain, nb, parameters,
  scoring='f1_macro',
  kfold=5,
  verbose=10,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   19.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   19.8s finished


In [None]:
# test
# Score the tuned Gaussian naive Bayes model on the test split.
# sklearn metrics take their arguments as (y_true, y_pred), in that order.
Ypred = nbBest.predict(Xtest)
print(f'Classification accuracy:{accuracy_score(Ytest, Ypred):6.5f} | Macro-F1 score:{f1_score(Ytest, Ypred, average="macro"):6.5f}')

In [None]:
# save
# Pickle the tuned Gaussian naive Bayes model under modelDir. An existing
# file is never overwritten: the cell only warns in that case.
modelPath = modelDir+ '{}.pkl'.format('GaussianNB')
if os.path.exists(modelPath):
  warnings.warn(f"Model exists at {modelPath}")
else:
  # open() does not create missing parent directories.
  os.makedirs(modelDir, exist_ok=True)
  with open(modelPath,"wb") as f:
    pickle.dump(nbBest, f)
  print(f"Model is saved at {modelPath}")

Model is saved at ./gdrive/MyDrive/CSE498-homework/hw1/model/GaussianNB.pkl


# CIFAR-10

## Prepare datasets

In [None]:
# Load both CIFAR-10 splits from dataDir (nothing is downloaded) without any
# per-sample transform; the pixel data is read straight from `.data`.
transform = None
trainset = torchvision.datasets.CIFAR10(
  dataDir, train=True, download=False, transform=transform
)
testset = torchvision.datasets.CIFAR10(
  dataDir, train=False, download=False, transform=transform
)

# `.data` is 4-D: samples first, then two image dimensions and the channels.
nTrainSamples, width, height, channel = trainset.data.shape
nTestSamples, width, height, channel = testset.data.shape
print(f'# train samples: {nTrainSamples} | # test samples:{nTestSamples}')
print(f'per image size: {width}*{height} | per image channel:{channel}')

# Flatten every image into one feature row; the label lists become tensors.
Xtrain = trainset.data.reshape(nTrainSamples, width * height * channel)
Ytrain = torch.tensor(trainset.targets)
Xtest = testset.data.reshape(nTestSamples, width * height * channel)
Ytest = torch.tensor(testset.targets)
print(f'Xtrain  shape: {Xtrain.shape} | Ytrain shape: {Ytrain.shape}')
print(f'Xtest  shape: {Xtest.shape} | Ytest shape: {Ytest.shape}')

# train samples: 50000 | # test samples:10000
per image size: 32*32 | per image channel:3
Xtrain  shape: (50000, 3072) | Ytrain shape: torch.Size([50000])
Xtest  shape: (10000, 3072) | Ytest shape: torch.Size([10000])


## Naive Bayes - Gaussian

In [None]:
# train: tune the variance-smoothing term of Gaussian naive Bayes (5-fold CV)
parameters = dict(var_smoothing=[1, 1e-1, 1e-3, 1e-9])
nb = GaussianNB()
nbBest = train_and_tune(
  Xtrain, Ytrain, nb, parameters,
  scoring='f1_macro',
  kfold=5,
  verbose=10,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.1min finished


In [None]:
# test
# Score the tuned Gaussian naive Bayes model on the test split.
# sklearn metrics take their arguments as (y_true, y_pred), in that order.
Ypred = nbBest.predict(Xtest)
print(f'Classification accuracy:{accuracy_score(Ytest, Ypred):6.5f} | Macro-F1 score:{f1_score(Ytest, Ypred, average="macro"):6.5f}')

Classification accuracy:0.29760 | Macro-F1 score:0.27546


In [None]:
# save
# Pickle the tuned Gaussian naive Bayes model under modelDir. An existing
# file is never overwritten: the cell only warns in that case.
modelPath = modelDir+ '{}.pkl'.format('cifa10-GaussianNB')
if os.path.exists(modelPath):
  warnings.warn(f"Model exists at {modelPath}")
else:
  # open() does not create missing parent directories.
  os.makedirs(modelDir, exist_ok=True)
  with open(modelPath,"wb") as f:
    pickle.dump(nbBest, f)
  print(f"Model is saved at {modelPath}")

Model is saved at ./gdrive/MyDrive/CSE498-homework/hw1/model/cifa10-GaussianNB.pkl


## Decision Tree

In [None]:
# train: grid-search the minimum leaf size of a decision tree (3-fold CV)
parameters = dict(min_samples_leaf=[1, 2, 3])
dtree = DecisionTreeClassifier(random_state=498)
dtreeBest = train_and_tune(
  Xtrain, Ytrain, dtree, parameters,
  scoring='f1_macro',
  kfold=3,
)

In [None]:
# test
# Score the tuned decision tree on the test split.
# sklearn metrics take their arguments as (y_true, y_pred), in that order.
Ypred = dtreeBest.predict(Xtest)
print(f'Classification accuracy:{accuracy_score(Ytest, Ypred):6.5f} | Macro-F1 score:{f1_score(Ytest, Ypred, average="macro"):6.5f}')

Classification accuracy:0.26970 | Macro-F1 score:0.26961


In [None]:
# save
# Pickle the tuned decision tree under modelDir. An existing file is never
# overwritten: the cell only warns in that case.
modelPath = modelDir+ '{}.pkl'.format('cifa10-decisionTree')
if os.path.exists(modelPath):
  warnings.warn(f"Model exists at {modelPath}")
else:
  # open() does not create missing parent directories.
  os.makedirs(modelDir, exist_ok=True)
  with open(modelPath,"wb") as f:
    # Public pickle module, as in the other save cells (not the private _pickle).
    pickle.dump(dtreeBest, f)
  print(f"Model is saved at {modelPath}")

Model is saved at ./gdrive/MyDrive/CSE498-homework/hw1/model/cifa10-decisionTree.pkl


## GradientBoosting

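The grid search below takes too long to run on Colab (a single fit took more than four hours), so a single model with fixed hyperparameters is trained further down instead.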

In [None]:
# train: grid-search learning rate and ensemble size for gradient boosting
# (3-fold CV, progress logging enabled)
parameters = dict(learning_rate=[0.05, 0.1], n_estimators=[50, 100])
gb = GradientBoostingClassifier(random_state=498)
gbBest = train_and_tune(
  Xtrain, Ytrain, gb, parameters,
  scoring='f1_macro',
  kfold=3,
  verbose=10,
)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 261.9min


In [None]:
# test
# Score the grid-searched gradient-boosting model on the test split.
# NOTE: `gbBest` is also assigned in the MNIST section; it only refers to the
# CIFAR-10 model if the grid-search cell above ran to completion.
# sklearn metrics take their arguments as (y_true, y_pred), in that order.
Ypred = gbBest.predict(Xtest)
print(f'Classification accuracy:{accuracy_score(Ytest, Ypred):6.5f} | Macro-F1 score:{f1_score(Ytest, Ypred, average="macro"):6.5f}')

In [None]:
# save
# Pickle the grid-searched gradient-boosting model under modelDir. An
# existing file is never overwritten: the cell only warns in that case.
# NOTE: `gbBest` is also assigned in the MNIST section; make sure the
# CIFAR-10 grid search finished before running this cell.
modelPath = modelDir+ '{}.pkl'.format('cifa10-gradientBoosting')
if os.path.exists(modelPath):
  warnings.warn(f"Model exists at {modelPath}")
else:
  # open() does not create missing parent directories.
  os.makedirs(modelDir, exist_ok=True)
  with open(modelPath,"wb") as f:
    # Public pickle module, as in the other save cells (not the private _pickle).
    pickle.dump(gbBest, f)
  print(f"Model is saved at {modelPath}")

In [10]:
# Fit one gradient-boosting model with fixed hyperparameters (no grid search).
gb = GradientBoostingClassifier(
  learning_rate=0.1,
  n_estimators=100,
  random_state=498,
)
gb.fit(Xtrain, Ytrain)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=498, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [12]:
# test
# Score the fixed-hyperparameter gradient-boosting model on the test split.
# sklearn metrics take their arguments as (y_true, y_pred), in that order.
Ypred = gb.predict(Xtest)
print(f'Classification accuracy:{accuracy_score(Ytest, Ypred):6.5f} | Macro-F1 score:{f1_score(Ytest, Ypred, average="macro"):6.5f}')

Classification accuracy:0.48250 | Macro-F1 score:0.47972


In [18]:
# save
# Pickle the fixed-hyperparameter gradient-boosting model under modelDir.
# An existing file is never overwritten: the cell only warns in that case.
# NOTE: this is the same path the grid-search save cell uses, so if that cell
# already wrote a file, this model is not saved.
modelPath = modelDir+ '{}.pkl'.format('cifa10-gradientBoosting')
if os.path.exists(modelPath):
  warnings.warn(f"Model exists at {modelPath}")
else:
  # open() does not create missing parent directories.
  os.makedirs(modelDir, exist_ok=True)
  with open(modelPath,"wb") as f:
    # Public pickle module, as in the other save cells (not the private _pickle).
    pickle.dump(gb, f)
  print(f"Model is saved at {modelPath}")

Model is saved at ./gdrive/MyDrive/CSE498-homework/hw1/model/cifa10-gradientBoosting.pkl
