# COMP5318 Assignment 1: Classification

### Group number: SID1: 530335425, SID2: 530039187

In [2]:
# Import all libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load the raw dataset from a CSV file in the working directory.
# An alternative input file that can be swapped in here is "test-before.csv".
DATA_PATH = "breast-cancer-wisconsin.csv"
df = pd.read_csv(DATA_PATH)

1. Filling in the missing values

In [4]:
# Pre-process dataset

# Keep every column except the last one, which holds the class label.
# Selecting by position avoids comparing column names against each other.
# Each feature column is then coerced to a numeric dtype; entries that cannot
# be parsed as numbers become NaN so that the imputer below can fill them.
# The conversion runs column by column (the default axis of DataFrame.apply)
# instead of row by row, which is the idiomatic and much cheaper direction.
df_without_class = df.iloc[:, :-1].apply(
    lambda column: pd.to_numeric(column, errors='coerce'))

# Replace every NaN with the mean of its column, then wrap the resulting
# numpy array back into a DataFrame that carries the original feature names.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
array_imputed = imp_mean.fit_transform(df_without_class)
df_imputed = pd.DataFrame(array_imputed, columns=df_without_class.columns)

2. Normalising the data

In [5]:
# Fit a MinMaxScaler on the imputed features, transform them with it, and
# rebuild a DataFrame that keeps the feature column names.
scaler = MinMaxScaler().fit(df_imputed)
array_normalized = scaler.transform(df_imputed)
df_normalized = pd.DataFrame(array_normalized, columns=df_imputed.columns)

3. Changing the class

In [6]:
# Encode the label stored in the last column: 'class1' becomes 0 and every
# other value (i.e. 'class2') becomes 1.
df_class = (df.iloc[:, -1] != 'class1').astype('int64')

4. Print the first 10 rows of the pre-processed dataset

In [7]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

def print_data(X, y, n_rows=10):
    """Print the first rows of a dataset as comma-separated lines.

    Every printed line lists the features of one example formatted to 4
    decimal places, each followed by a comma, and ends with the example's
    target value. The last example of the whole dataset is printed without a
    trailing newline; every other example ends with one.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: maximum number of rows to print (default 10); when the
            dataset has fewer examples, all of them are printed
    """
    n_examples = len(X)
    # Clamp the row count so that a dataset shorter than n_rows does not
    # raise an IndexError.
    for example_num in range(min(n_rows, n_examples)):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        # Suppress the newline only after the final example of the dataset.
        is_last_example = example_num == n_examples - 1
        print(y[example_num], end="" if is_last_example else "\n")
            


In [8]:
# Show the first ten pre-processed examples (print_data's default row count):
# normalised features followed by the encoded class label.
print_data(df_normalized.to_numpy(), df_class.to_numpy())

0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0


### Part 1: Cross-validation without parameter tuning

In [9]:
## Setting the 10 fold stratified cross-validation

# One shared splitter for every experiment below: 10 stratified folds,
# shuffled with a fixed seed so that the folds are the same on every run.
cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# The stratified folds from cvKFold should be provided to the classifiers


In [10]:
def logregClassifier(X, y, cvKFold):
    """Cross-validate a logistic regression model and return its mean score.

    Arguments:
        X: feature matrix
        y: target labels
        cvKFold: cross-validation splitter passed to cross_val_score as `cv`

    Returns:
        The mean of the per-fold scores returned by cross_val_score
        (accuracy, the default score of scikit-learn classifiers).
    """
    # The estimator is deliberately left unfitted: cross_val_score fits its
    # own clone on every training fold, so fitting on the full data first
    # would only waste time and never influence the returned score.
    clf = LogisticRegression(random_state=0)
    scores = cross_val_score(clf, X, y, cv=cvKFold)
    return scores.mean()

In [11]:
#Naïve Bayes
def nbClassifier(X, y, cvKFold):
    """Cross-validate a Gaussian Naive Bayes model and return its mean score.

    Arguments:
        X: feature matrix
        y: target labels
        cvKFold: cross-validation splitter passed to cross_val_score as `cv`

    Returns:
        The mean of the per-fold scores returned by cross_val_score.
    """
    fold_scores = cross_val_score(GaussianNB(), X, y, cv=cvKFold)
    return fold_scores.mean()

In [12]:
# Decision Tree
def dtClassifier(X, y, cvKFold):
    """Cross-validate an entropy-based decision tree and return its mean score.

    Arguments:
        X: feature matrix
        y: target labels
        cvKFold: cross-validation splitter passed to cross_val_score as `cv`

    Returns:
        The mean of the per-fold scores returned by cross_val_score.
    """
    tree = DecisionTreeClassifier(random_state=0, criterion='entropy')
    return cross_val_score(tree, X, y, cv=cvKFold).mean()

In [13]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth, cvKFold):
    """Cross-validate a bagging ensemble of decision trees.

    Arguments:
        X: feature matrix
        y: target labels
        n_estimators: number of trees in the ensemble
        max_samples: number of samples drawn to train each tree
        max_depth: maximum depth of each (entropy-based) tree
        cvKFold: cross-validation splitter passed to cross_val_score as `cv`

    Returns:
        The mean of the per-fold scores returned by cross_val_score.
    """
    base_tree = DecisionTreeClassifier(
        criterion='entropy', max_depth=max_depth, random_state=0)
    ensemble = BaggingClassifier(
        base_tree,
        n_estimators=n_estimators,
        max_samples=max_samples,
        bootstrap=True,
        random_state=0,
    )
    fold_scores = cross_val_score(ensemble, X, y, cv=cvKFold)
    return fold_scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth, cvKFold):
    """Cross-validate an AdaBoost ensemble of decision trees.

    Arguments:
        X: feature matrix
        y: target labels
        n_estimators: number of boosting rounds
        learning_rate: learning rate handed to AdaBoostClassifier
        max_depth: maximum depth of each (entropy-based) tree
        cvKFold: cross-validation splitter passed to cross_val_score as `cv`

    Returns:
        The mean of the per-fold scores returned by cross_val_score.
    """
    base_tree = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    booster = AdaBoostClassifier(
        base_tree,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    fold_scores = cross_val_score(booster, X, y, cv=cvKFold)
    return fold_scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate, cvKFold):
    """Cross-validate a gradient boosting classifier.

    Arguments:
        X: feature matrix
        y: target labels
        n_estimators: number of boosting stages
        learning_rate: learning rate handed to GradientBoostingClassifier
        cvKFold: cross-validation splitter passed to cross_val_score as `cv`

    Returns:
        The mean of the per-fold scores returned by cross_val_score.
    """
    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    return cross_val_score(booster, X, y, cv=cvKFold).mean()

### Part 1 Results

In [15]:
# Parameters for Part 1:

# Bagging
bag_n_estimators, bag_max_samples, bag_max_depth = 60, 100, 6

# AdaBoost
ada_n_estimators, ada_learning_rate, ada_bag_max_depth = 60, 0.5, 6

# Gradient Boosting
gb_n_estimators, gb_learning_rate = 60, 0.5

# Features and labels as numpy arrays
X, y = df_normalized.to_numpy(), df_class.to_numpy()

# Evaluate every Part 1 classifier on the shared stratified folds
LogR_score = logregClassifier(X, y, cvKFold)
NB_score = nbClassifier(X, y, cvKFold)
dt_score = dtClassifier(X, y, cvKFold)
bagDT_score = bagDTClassifier(
    X, y, bag_n_estimators, bag_max_samples, bag_max_depth, cvKFold)
AdaBoost_score = adaDTClassifier(
    X, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth, cvKFold)
GB_score = gbClassifier(X, y, gb_n_estimators, gb_learning_rate, cvKFold)

# Print the result of each classifier to 4 decimal places
for message, score in (
    ("LogR average cross-validation accuracy: {:.4f}", LogR_score),
    ("NB average cross-validation accuracy: {:.4f}", NB_score),
    ("DT average cross-validation accuracy: {:.4f}", dt_score),
    ("Bagging average cross-validation accuracy: {:.4f}", bagDT_score),
    ("AdaBoost average cross-validation accuracy: {:.4f}", AdaBoost_score),
    ("GB average cross-validation accuracy: {:.4f}", GB_score),
):
    print(message.format(score))

LogR average cross-validation accuracy: 0.9642
NB average cross-validation accuracy: 0.9585
DT average cross-validation accuracy: 0.9385
Bagging average cross-validation accuracy: 0.9571
AdaBoost average cross-validation accuracy: 0.9570
GB average cross-validation accuracy: 0.9613


### Part 2: Cross-validation with parameter tuning

In [16]:
# KNN
# Candidate hyper-parameter values explored by the grid search
k = [1, 3, 5, 7, 9]
p = [1, 2]


def bestKNNClassifier(X, y, cvKFold, k, p):
    """Tune a k-nearest-neighbours classifier with a cross-validated grid search.

    The data is split into a stratified train/test partition
    (random_state=0); the grid search runs on the training part only and the
    refitted best model is then scored on the held-out test part.

    Arguments:
        X: feature matrix
        y: target labels
        cvKFold: cross-validation splitter passed to GridSearchCV as `cv`
        k: candidate values for `n_neighbors`
        p: candidate values for `p`

    Returns:
        A tuple (grid_search, test_score): the fitted GridSearchCV object and
        its score on the test set.
    """
    # Hold out a stratified test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    # Search every (n_neighbors, p) combination on the training data
    search_space = {'n_neighbors': k, 'p': p}
    grid_search = GridSearchCV(
        KNeighborsClassifier(),
        search_space,
        cv=cvKFold,
        return_train_score=True,
    )
    grid_search.fit(X_train, y_train)

    # Score the best model on the held-out test set
    return grid_search, grid_search.score(X_test, y_test)


In [17]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
# Candidate hyper-parameter values explored by the grid search
C = [0.01, 0.1, 1, 5, 15]
gamma = [0.01, 0.1, 1, 10, 50]

def bestSVMClassifier(X, y, cvKFold, C, gamma):
    """Tune an RBF-kernel SVM with a cross-validated grid search.

    The data is split into a stratified train/test partition
    (random_state=0); the grid search runs on the training part only and the
    refitted best model is then scored on the held-out test part.

    Arguments:
        X: feature matrix
        y: target labels
        cvKFold: cross-validation splitter passed to GridSearchCV as `cv`
        C: candidate values for `C`
        gamma: candidate values for `gamma`

    Returns:
        A tuple (grid_search, test_score): the fitted GridSearchCV object and
        its score on the test set.
    """
    # Hold out a stratified test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    # Search every (C, gamma) combination on the training data
    search_space = {'C': C, 'gamma': gamma}
    grid_search = GridSearchCV(
        SVC(kernel='rbf'),
        search_space,
        cv=cvKFold,
        return_train_score=True,
    )
    grid_search.fit(X_train, y_train)

    # Score the best model on the held-out test set
    return grid_search, grid_search.score(X_test, y_test)

In [18]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to 'sqrt'.
n_estimators = [10, 30, 60, 100, 150]
max_leaf_nodes = [6, 12, 18]

def bestRFClassifier(X, y, cvKFold, n_estimators, max_leaf_nodes):
    """Tune a random forest with a cross-validated grid search.

    The data is split into a stratified train/test partition
    (random_state=0); the grid search runs on the training part only. The
    refitted best model is then evaluated on the held-out test part with its
    default score and with macro- and weighted-average F1.

    Arguments:
        X: feature matrix
        y: target labels
        cvKFold: cross-validation splitter passed to GridSearchCV as `cv`
        n_estimators: candidate values for `n_estimators`
        max_leaf_nodes: candidate values for `max_leaf_nodes`

    Returns:
        A tuple (grid_search, test_score, macro_f1, weighted_f1): the fitted
        GridSearchCV object, its score on the test set, and the macro- and
        weighted-average F1 of its test-set predictions.
    """
    # create param grid
    param_grid = {'n_estimators': n_estimators, 'max_leaf_nodes': max_leaf_nodes}

    # hold out a stratified test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    # create grid and fit data (return grid_search)
    rnf_clf = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=0)
    grid_search = GridSearchCV(rnf_clf, param_grid, cv=cvKFold, return_train_score=True)
    grid_search.fit(X_train, y_train)

    # test set accuracy (return test_score)
    test_score = grid_search.score(X_test, y_test)

    # GridSearchCV refits the best parameter combination on the whole
    # training set by default, so predict with that model directly instead of
    # rebuilding and retraining an identical forest from best_params_.
    y_pred = grid_search.predict(X_test)

    # macro_f1, weighted_f1 of the best model (return macro_f1, weighted_f1)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    return grid_search, test_score, macro_f1, weighted_f1



### Part 2: Results

In [19]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn).
# The stratified folds from cvKFold should be provided to GridSearchCV.

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", "n_estimators" and "max_leaf_nodes" which should be printed as integers.

# Features and labels as numpy arrays
X, y = df_normalized.to_numpy(), df_class.to_numpy()

# KNN model
KNN_grid_search, KNN_test_score = bestKNNClassifier(X, y, cvKFold, k, p)
KNN_best_params = KNN_grid_search.best_params_
for template, value in (
    ("KNN best k: {}", KNN_best_params['n_neighbors']),
    ("KNN best p: {}", KNN_best_params['p']),
    ("KNN cross-validation accuracy: {:.4f}", KNN_grid_search.best_score_),
    ("KNN test set accuracy: {:.4f}", KNN_test_score),
):
    print(template.format(value))

print()
# SVM model
SVM_grid_search, SVM_test_score = bestSVMClassifier(X, y, cvKFold, C, gamma)
SVM_best_params = SVM_grid_search.best_params_
for template, value in (
    ("SVM best C: {:.4f}", SVM_best_params['C']),
    ("SVM best gamma: {:.4f}", SVM_best_params['gamma']),
    ("SVM cross-validation accuracy: {:.4f}", SVM_grid_search.best_score_),
    ("SVM test set accuracy: {:.4f}", SVM_test_score),
):
    print(template.format(value))

print()
# Random forest model
rnf_grid_search, rnf_test_score, macro_f1, weighted_f1 = bestRFClassifier(
    X, y, cvKFold, n_estimators, max_leaf_nodes)
rnf_best_params = rnf_grid_search.best_params_
for template, value in (
    ("RF best n_estimators: {}", rnf_best_params['n_estimators']),
    ("RF best max_leaf_nodes: {}", rnf_best_params['max_leaf_nodes']),
    ("RF cross-validation accuracy: {:.4f}", rnf_grid_search.best_score_),
    ("RF test set accuracy: {:.4f}", rnf_test_score),
    ("RF test set macro average F1: {:.4f}", macro_f1),
    ("RF test set weighted average F1: {:.4f}", weighted_f1),
):
    print(template.format(value))

KNN best k: 3
KNN best p: 1
KNN cross-validation accuracy: 0.9695
KNN test set accuracy: 0.9543

SVM best C: 5.0000
SVM best gamma: 0.1000
SVM cross-validation accuracy: 0.9676
SVM test set accuracy: 0.9714

RF best n_estimators: 150
RF best max_leaf_nodes: 6
RF cross-validation accuracy: 0.9675
RF test set accuracy: 0.9657
RF test set macro average F1: 0.9628
RF test set weighted average F1: 0.9661
