# Notebook Configuration

## Google drive

In [None]:
import sys
from google.colab import drive

# Project folder on Google Drive
# The trailing slash is part of the value: later cells append '/result/...' to it
abspath_curr = '/content/drive/My Drive/Monster_Classification/'

# The shallow utilities and the shallow models are stored in the same Drive folder
abspath_util_shallow = abspath_model_shallow = '/content/drive/My Drive/ColabNotebooks'

# Attach Google Drive to the runtime under /content/drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Warning

In [None]:
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

## Matplotlib

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 

# Set matplotlib sizes
plt.rc('font', size=20)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=20)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('legend', fontsize=20)
plt.rc('figure', titlesize=20)

## TensorFlow

In [None]:
# The line magic below asks the runtime for TensorFlow 2.x and has to run before the import
# NOTE(review): %tensorflow_version looks like a Colab-specific magic, and tf.random.set_seed
# is the only TensorFlow call visible in this notebook - confirm the dependency is still needed
%tensorflow_version 2.x 
import tensorflow as tf
from tensorflow import keras

## Random seed

In [None]:
import numpy as np

# One seed shared by every stochastic step of the notebook
random_seed = 42

# Seed the global random number generators of numpy and tensorflow
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

# Data Preprocessing

In [None]:
# Change working directory to the absolute path of the shallow utilities folder,
# so that the relative notebook name passed to %run below can be resolved
%cd $abspath_util_shallow

# Import the shallow utilities by running the utilities notebook in this kernel
# NOTE(review): the helpers called later (common_var_checker, id_checker, datetime_transformer,
# nan_checker, cat_var_checker, get_train_val_ps) are not defined in this notebook, so they
# presumably come from this %run - confirm
%run pmlm_utilities_shallow.ipynb

/content/drive/My Drive/ColabNotebooks


In [None]:
import pandas as pd

# Name of the target column
target = 'type'

# Read the raw training and test data (the first row of each file is the header)
df_raw_train, df_raw_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('/content/drive/My Drive/ColabNotebooks/Monster_train.csv',
                     '/content/drive/My Drive/ColabNotebooks/Monster_test.csv')
)

# Work on deep copies so that the raw frames are never modified
df_train = df_raw_train.copy(deep=True)
df_test = df_raw_test.copy(deep=True)

In [None]:
# Show the number of rows and columns of df_train as a one-row table
pd.DataFrame({'# rows': [df_train.shape[0]], '# columns': [df_train.shape[1]]})

Unnamed: 0,# rows,# columns
0,371,7


In [None]:
# Show the number of rows and columns of df_test as a one-row table
pd.DataFrame({'# rows': [df_test.shape[0]], '# columns': [df_test.shape[1]]})

Unnamed: 0,# rows,# columns
0,529,6


In [None]:
# Preview df_train: its first 5 rows
df_train.head(n=5)

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [None]:
# Preview df_test: its first 5 rows
df_test.head(n=5)

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color
0,3,0.471774,0.387937,0.706087,0.698537,black
1,6,0.427332,0.645024,0.565558,0.451462,white
2,9,0.549602,0.491931,0.660387,0.449809,black
3,10,0.638095,0.682867,0.471409,0.356924,white
4,13,0.361762,0.583997,0.377256,0.276364,black


Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Divide the raw training data into training (75%) and validation (25%)
# The split starts from df_raw_train rather than from df_train itself, so re-running
# this cell does not split an already reduced training set a second time
df_train, df_val = train_test_split(df_raw_train,
                                    train_size=0.75,
                                    random_state=random_seed)

# Reset the index so that both frames are indexed from 0 again
df_train, df_val = df_train.reset_index(drop=True), df_val.reset_index(drop=True)

In [None]:
# Show the number of rows and columns of df_train as a one-row table
pd.DataFrame({'# rows': [df_train.shape[0]], '# columns': [df_train.shape[1]]})

Unnamed: 0,# rows,# columns
0,278,7


In [None]:
# Show the number of rows and columns of df_val as a one-row table
pd.DataFrame({'# rows': [df_val.shape[0]], '# columns': [df_val.shape[1]]})

Unnamed: 0,# rows,# columns
0,93,7


Handling uncommon features

In [None]:
# Call common_var_checker on the training, validation and test data and the target name
# The result is displayed below as a table with a 'common var' column
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
df_common_var = common_var_checker(df_train, df_val, df_test, target)

# Print df_common_var
df_common_var

Unnamed: 0,common var
0,bone_length
1,color
2,hair_length
3,has_soul
4,id
5,rotting_flesh
6,type


In [None]:
# Columns of the training data that are not listed among the common variables
uncommon_feature_train_not_val_test = np.setdiff1d(df_train.columns,
                                                   df_common_var['common var'])

# Show them as a one-column table
pd.DataFrame({'uncommon feature': uncommon_feature_train_not_val_test})

Unnamed: 0,uncommon feature


In [None]:
# Columns of the validation data that are not listed among the common variables
uncommon_feature_val_not_train_test = np.setdiff1d(df_val.columns,
                                                   df_common_var['common var'])

# Show them as a one-column table
pd.DataFrame({'uncommon feature': uncommon_feature_val_not_train_test})

Unnamed: 0,uncommon feature


In [None]:
# Columns of the test data that are not listed among the common variables
uncommon_feature_test_not_train_val = np.setdiff1d(df_test.columns,
                                                   df_common_var['common var'])

# Show them as a one-column table
pd.DataFrame({'uncommon feature': uncommon_feature_test_not_train_val})

Unnamed: 0,uncommon feature


Removing uncommon features

In [None]:
# Keep only the common variables in the training data
df_train = df_train.drop(labels=uncommon_feature_train_not_val_test, axis='columns')

# Preview df_train: its first 5 rows
df_train.head(n=5)

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,756,0.187781,0.736491,0.344348,0.346306,clear,Ghost
1,553,0.250293,0.408878,0.456618,0.466024,white,Goblin
2,561,0.524138,0.578163,0.621787,0.689326,blue,Goblin
3,333,0.384235,0.393451,0.353746,0.490884,white,Goblin
4,31,0.585559,0.585939,1.0,0.708692,black,Ghoul


In [None]:
# Keep only the common variables in the validation data
df_val = df_val.drop(labels=uncommon_feature_val_not_train_test, axis='columns')

# Preview df_val: its first 5 rows
df_val.head(n=5)

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,779,0.516004,0.527508,0.354857,0.760432,white,Ghoul
1,72,0.523729,0.318483,0.330146,0.427402,green,Goblin
2,29,0.500197,0.438418,0.53253,0.665522,clear,Ghoul
3,745,0.4173,0.377595,0.541834,0.349087,clear,Goblin
4,119,0.515275,0.582627,0.568721,0.534079,clear,Goblin


In [None]:
# Keep only the common variables in the test data
df_test = df_test.drop(labels=uncommon_feature_test_not_train_val, axis='columns')

# Preview df_test: its first 5 rows
df_test.head(n=5)

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color
0,3,0.471774,0.387937,0.706087,0.698537,black
1,6,0.427332,0.645024,0.565558,0.451462,white
2,9,0.549602,0.491931,0.660387,0.449809,black
3,10,0.638095,0.682867,0.471409,0.356924,white
4,13,0.361762,0.583997,0.377256,0.276364,black


Handling identifiers

In [None]:
# Stack the training, validation and test rows (in that order) into one frame
df = pd.concat(objs=[df_train, df_val, df_test], axis=0, sort=False)

In [None]:
# Call id_checker on df
# The columns of the returned frame are used in the next cell as the identifier columns to drop
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
df_id = id_checker(df)

# Print the first 5 rows of df_id
df_id.head()

Unnamed: 0,id
0,756
1,553
2,561
3,333
4,31


In [None]:
import numpy as np

# Drop from each frame the identifier columns it actually contains
# Each frame is rebound to the reduced result instead of being modified in place
df_train = df_train.drop(columns=np.intersect1d(df_id.columns, df_train.columns))
df_val = df_val.drop(columns=np.intersect1d(df_id.columns, df_val.columns))
df_test = df_test.drop(columns=np.intersect1d(df_id.columns, df_test.columns))

In [None]:
# Preview df_train: its first 5 rows
df_train.head(n=5)

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0.187781,0.736491,0.344348,0.346306,clear,Ghost
1,0.250293,0.408878,0.456618,0.466024,white,Goblin
2,0.524138,0.578163,0.621787,0.689326,blue,Goblin
3,0.384235,0.393451,0.353746,0.490884,white,Goblin
4,0.585559,0.585939,1.0,0.708692,black,Ghoul


In [None]:
# Preview df_val: its first 5 rows
df_val.head(n=5)

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0.516004,0.527508,0.354857,0.760432,white,Ghoul
1,0.523729,0.318483,0.330146,0.427402,green,Goblin
2,0.500197,0.438418,0.53253,0.665522,clear,Ghoul
3,0.4173,0.377595,0.541834,0.349087,clear,Goblin
4,0.515275,0.582627,0.568721,0.534079,clear,Goblin


In [None]:
# Preview df_test: its first 5 rows
df_test.head(n=5)


Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color
0,0.471774,0.387937,0.706087,0.698537,black
1,0.427332,0.645024,0.565558,0.451462,white
2,0.549602,0.491931,0.660387,0.449809,black
3,0.638095,0.682867,0.471409,0.356924,white
4,0.361762,0.583997,0.377256,0.276364,black


Handling date time variables

In [None]:
# Names of the date time variables: none are declared for this data
datetime_vars = list()

In [None]:
# Call datetime_transformer on df_train with the variables listed in datetime_vars
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
df_train = datetime_transformer(df_train, datetime_vars)

# Print the first 5 rows of df_train
df_train.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0.187781,0.736491,0.344348,0.346306,clear,Ghost
1,0.250293,0.408878,0.456618,0.466024,white,Goblin
2,0.524138,0.578163,0.621787,0.689326,blue,Goblin
3,0.384235,0.393451,0.353746,0.490884,white,Goblin
4,0.585559,0.585939,1.0,0.708692,black,Ghoul


In [None]:
# Call datetime_transformer on df_val with the variables listed in datetime_vars
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
df_val = datetime_transformer(df_val, datetime_vars)

# Print the first 5 rows of df_val
df_val.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0.516004,0.527508,0.354857,0.760432,white,Ghoul
1,0.523729,0.318483,0.330146,0.427402,green,Goblin
2,0.500197,0.438418,0.53253,0.665522,clear,Ghoul
3,0.4173,0.377595,0.541834,0.349087,clear,Goblin
4,0.515275,0.582627,0.568721,0.534079,clear,Goblin


In [None]:
# Call datetime_transformer on df_test with the variables listed in datetime_vars
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
df_test = datetime_transformer(df_test, datetime_vars)

# Print the first 5 rows of df_test
df_test.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color
0,0.471774,0.387937,0.706087,0.698537,black
1,0.427332,0.645024,0.565558,0.451462,white
2,0.549602,0.491931,0.660387,0.449809,black
3,0.638095,0.682867,0.471409,0.356924,white
4,0.361762,0.583997,0.377256,0.276364,black


Handling missing data

In [None]:
# Stack the training, validation and test rows (in that order) into one frame
df = pd.concat(objs=[df_train, df_val, df_test], axis=0, sort=False)

In [None]:
# Call nan_checker on df
# The result is displayed below with the columns 'var', 'proportion' and 'dtype'
# The test rows carry no target value in the combined frame, which is why the target shows up here
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
df_nan = nan_checker(df)

# Print df_nan
df_nan

Unnamed: 0,var,proportion,dtype
0,type,0.587778,object


In [None]:
# Show the distinct data types among the variables that contain NaN
pd.DataFrame({'dtype': df_nan['dtype'].unique()})

Unnamed: 0,dtype
0,object


In [None]:
# Keep only the float64 variables with NaN (name, proportion of NaN and data type)
# Variables of any other data type are left out of df_miss and are therefore not imputed later
is_float64 = df_nan['dtype'] == 'float64'
df_miss = df_nan[is_float64].reset_index(drop=True)

# Show df_miss
df_miss

Unnamed: 0,var,proportion,dtype


In [None]:
# Row counts of the training data and of the training + validation data
# They are taken before the frames are rebound below
n_train = df_train.shape[0]
n_train_val = n_train + df_val.shape[0]

# Separate the training, validation and test rows of the combined data by position
# Each part is an independent copy: the imputation cell assigns columns into these frames,
# and doing that on a slice of df raises SettingWithCopyWarning (silenced in this notebook)
df_train = df.iloc[:n_train, :].copy()
df_val = df.iloc[n_train:n_train_val, :].copy()
df_test = df.iloc[n_train_val:, :].copy()

In [None]:
# Show the number of rows and columns of df_train as a one-row table
pd.DataFrame({'# rows': [df_train.shape[0]], '# columns': [df_train.shape[1]]})

Unnamed: 0,# rows,# columns
0,278,6


In [None]:
# Show the number of rows and columns of df_val as a one-row table
pd.DataFrame({'# rows': [df_val.shape[0]], '# columns': [df_val.shape[1]]})

Unnamed: 0,# rows,# columns
0,93,6


In [None]:
# Show the number of rows and columns of df_test as a one-row table
pd.DataFrame({'# rows': [df_test.shape[0]], '# columns': [df_test.shape[1]]})

Unnamed: 0,# rows,# columns
0,529,6


In [None]:
from sklearn.impute import SimpleImputer

# Names of the variables to impute, as listed in df_miss
miss_vars = df_miss['var']

# Impute only when at least one variable is listed
if len(miss_vars) > 0:
    # Replace NaN with the most frequent value of each variable
    si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

    # Learn the replacement values from the training data only
    si.fit(df_train[miss_vars])

    # Apply the same replacement values to df_train, df_val and df_test 
    df_train[miss_vars] = si.transform(df_train[miss_vars])
    df_val[miss_vars] = si.transform(df_val[miss_vars])
    df_test[miss_vars] = si.transform(df_test[miss_vars])

Encoding the data

In [None]:
# Stack the training, validation and test rows (in that order) into one frame
df = pd.concat(objs=[df_train, df_val, df_test], axis=0, sort=False)

# Show the distinct data types among the variables of df
pd.DataFrame({'dtype': df.dtypes.unique()})

Unnamed: 0,dtype
0,float64
1,object


In [None]:
# Call cat_var_checker on df
# The result is displayed below with the columns 'var' and 'nunique'
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
df_cat = cat_var_checker(df)

# Print the dataframe
df_cat

Unnamed: 0,var,nunique
0,color,6
1,type,4


In [None]:
# Categorical variables of df, without the target (the target is label-encoded separately)
cat_vars_in_df = np.intersect1d(df.columns, df_cat['var'])
cat_features = np.setdiff1d(cat_vars_in_df, [target])

# Replace each categorical feature of the combined data by its one-hot columns
df = pd.get_dummies(df, columns=cat_features)

# Preview df: its first 5 rows
df.head(n=5)

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,type,color_black,color_blood,color_blue,color_clear,color_green,color_white
0,0.187781,0.736491,0.344348,0.346306,Ghost,0,0,0,1,0,0
1,0.250293,0.408878,0.456618,0.466024,Goblin,0,0,0,0,0,1
2,0.524138,0.578163,0.621787,0.689326,Goblin,0,0,1,0,0,0
3,0.384235,0.393451,0.353746,0.490884,Goblin,0,0,0,0,0,1
4,0.585559,0.585939,1.0,0.708692,Ghoul,1,0,0,0,0,0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Target values of the combined data as strings
# astype(str) also turns a missing target into the string 'nan', so the encoder
# learns one extra class for the rows that have no target value
target_as_str = df[target].astype(str)

# Map every distinct target string to an integer code
le = LabelEncoder()
df[target] = le.fit_transform(target_as_str)

# Preview df: its first 5 rows
df.head(n=5)

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,type,color_black,color_blood,color_blue,color_clear,color_green,color_white
0,0.187781,0.736491,0.344348,0.346306,0,0,0,0,1,0,0
1,0.250293,0.408878,0.456618,0.466024,2,0,0,0,0,0,1
2,0.524138,0.578163,0.621787,0.689326,2,0,0,1,0,0,0
3,0.384235,0.393451,0.353746,0.490884,2,0,0,0,0,0,1
4,0.585559,0.585939,1.0,0.708692,1,1,0,0,0,0,0


In [None]:
# Row counts of the training data and of the training + validation data
# They are taken before the frames are rebound below
n_train = len(df_train)
n_train_val = n_train + len(df_val)

# Cut the combined data back into its training, validation and test rows by position
df_train = df.iloc[:n_train, :]
df_val = df.iloc[n_train:n_train_val, :]
df_test = df.iloc[n_train_val:, :]

In [None]:
# Show the number of rows and columns of df_train as a one-row table
pd.DataFrame({'# rows': [df_train.shape[0]], '# columns': [df_train.shape[1]]})

Unnamed: 0,# rows,# columns
0,278,11


In [None]:
# Show the number of rows and columns of df_val as a one-row table
pd.DataFrame({'# rows': [df_val.shape[0]], '# columns': [df_val.shape[1]]})

Unnamed: 0,# rows,# columns
0,93,11


In [None]:
# Show the number of rows and columns of df_test as a one-row table
pd.DataFrame({'# rows': [df_test.shape[0]], '# columns': [df_test.shape[1]]})

Unnamed: 0,# rows,# columns
0,529,11


Splitting the feature and target

In [None]:
# The three data sets, in the order in which they are unpacked below
frames = (df_train, df_val, df_test)

# Feature matrices: every column except the target, in the sorted order given by np.setdiff1d
X_train, X_val, X_test = (
    frame[np.setdiff1d(frame.columns, [target])].values for frame in frames
)

# Target vectors: the encoded target column of each data set
y_train, y_val, y_test = (frame[target].values for frame in frames)

In [None]:
from sklearn.preprocessing import StandardScaler

# Scaler that centers each feature and scales it to unit variance (the default settings)
ss = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Learn the mean and standard deviation of each feature from the training data only
ss.fit(X_train)

# Standardize the training, validation and test data with those statistics
X_train, X_val, X_test = (ss.transform(features) for features in (X_train, X_val, X_test))


# Hyperparameter Tuning

Creating dictionary of models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Candidate classifiers keyed by acronym; each one is seeded with random_seed
models = dict(
    lr=LogisticRegression(class_weight='balanced', random_state=random_seed),
    mlpc=MLPClassifier(hidden_layer_sizes=50, early_stopping=True, random_state=random_seed),
    rfc=RandomForestClassifier(n_estimators=110, class_weight='balanced', random_state=random_seed),
    hgbc=HistGradientBoostingClassifier(random_state=random_seed),
)

Creating the dictionary of pipelines

In [None]:
from sklearn.pipeline import Pipeline

# One single-step pipeline per classifier, keyed by the same acronym
# The step is named 'model', which is the prefix used by the parameter grids ('model__...')
pipes = {acronym: Pipeline([('model', model)]) for acronym, model in models.items()}

Getting the predefined split cross-validator

In [None]:
# Get the:
# feature matrix in the combined training and validation data
# target vector in the combined training and validation data
# PredefinedSplit
# NOTE(review): the helper is presumably defined in pmlm_utilities_shallow.ipynb (the notebook
# loaded with %run earlier), not in pmlm_utilities.ipynb - confirm
X_train_val, y_train_val, ps = get_train_val_ps(X_train, y_train, X_val, y_val)

GridSearchCV

In [None]:
# Parameter grids keyed by model acronym; the following cells fill it in
param_grids = dict()

In [None]:
# Candidate stopping tolerances: 10^-8, 10^-4 and 10^-1
tol_grid = [10 ** exponent for exponent in (-8, -4, -1)]

# Candidate values of C
C_grid = [0.01, 0.1, 1, 10]

# Register the grid of the logistic regression pipeline
param_grids['lr'] = [dict(model__tol=tol_grid, model__C=C_grid)]

In [None]:
# Single candidate for hidden_layer_sizes (it replaces the value given when the model was created)
hidden_layer_sizes = [100]

# Candidate values of alpha: 10^-6, ..., 10^-3
alpha_grids = [10 ** exponent for exponent in range(-6, -2)]

# Candidate values of learning_rate_init: 10^-4, ..., 10^-2
learning_rate_init_grids = [10 ** exponent for exponent in range(-4, -1)]

# Register the grid of the MLP pipeline
param_grids['mlpc'] = [dict(model__hidden_layer_sizes=hidden_layer_sizes,
                            model__alpha=alpha_grids,
                            model__learning_rate_init=learning_rate_init_grids)]

In [None]:
# The grids for n_estimators: 50, 60, ..., 290
n_estimators = [10 * i for i in range(5, 30)]

# The grids for min_samples_split
# An int is an absolute number of samples and must be at least 2, while a float in (0.0, 1.0]
# is a fraction of the samples. The former candidate 1 is rejected by scikit-learn, so every
# combination using it failed to fit and was scored NaN; 2 (the library default) replaces it
min_samples_split_grids = (0.1, 2, 10)

# The grids for min_samples_leaf
min_samples_leaf_grids = (1, 10, 20)

# Update param_grids
param_grids['rfc'] = [{'model__n_estimators': n_estimators,
                       'model__min_samples_split': min_samples_split_grids,
                       'model__min_samples_leaf': min_samples_leaf_grids}]

In [None]:

# Candidate values of learning_rate: 10^-5, ..., 10^3
learning_rate_grids = [10 ** exponent for exponent in range(-5, 4)]

# Candidate values of min_samples_leaf
# (this rebinds the name used for the random forest grid, which has already been registered)
min_samples_leaf_grids = [1, 20, 100, 200]

# Register the grid of the histogram gradient boosting pipeline
param_grids['hgbc'] = [dict(model__learning_rate=learning_rate_grids,
                            model__min_samples_leaf=min_samples_leaf_grids)]

In [None]:
# os is imported here because no other cell of this notebook imports it
import os

# Folder that receives one cv_results csv file per model
directory = os.path.dirname(abspath_curr + '/result/hw3/cv_results/GridSearchCV/')

# Create the folder and its missing parents
# exist_ok=True makes this a no-op when the folder already exists and avoids the
# check-then-create race of testing os.path.exists first
os.makedirs(directory, exist_ok=True)

In [None]:
from sklearn.model_selection import GridSearchCV

# Columns of cv_results that are written first in every csv file
important_columns = ['rank_test_score',
                     'mean_test_score',
                     'std_test_score',
                     'mean_train_score',
                     'std_train_score',
                     'mean_fit_time',
                     'std_fit_time',
                     'mean_score_time',
                     'std_score_time']

# One [best_score_, best_params_, best_estimator_] entry per model
best_score_params_estimator_gs = []

# Tune every pipeline on the predefined train/validation split
for acronym, pipe in pipes.items():
    # Exhaustive search over the grid of this model, scored by macro-averaged F1
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grids[acronym],
                      scoring='f1_macro',
                      n_jobs=2,
                      cv=ps,
                      return_train_score=True)

    # Run the search (fit returns the search object itself)
    gs.fit(X_train_val, y_train_val)

    # Record the best result of this model
    best_score_params_estimator_gs.append([gs.best_score_, gs.best_params_, gs.best_estimator_])

    # Full search results, best rank first and ties ordered by 'std_test_score'
    cv_results = pd.DataFrame(gs.cv_results_)
    cv_results = cv_results.sort_values(by=['rank_test_score', 'std_test_score'])

    # Important columns first, all remaining columns after them in alphabetical order
    other_columns = sorted(set(cv_results.columns).difference(important_columns))
    cv_results = cv_results[important_columns + other_columns]

    # Save the results of this model under its acronym
    cv_results.to_csv(path_or_buf=abspath_curr + '/result/hw3/cv_results/GridSearchCV/' + acronym + '.csv', index=False)

# Order the entries from the highest best_score_ to the lowest
best_score_params_estimator_gs.sort(key=lambda entry: entry[0], reverse=True)

# Show the ranking as a table
pd.DataFrame(best_score_params_estimator_gs, columns=['best_score', 'best_param', 'best_estimator'])

Unnamed: 0,best_score,best_param,best_estimator
0,0.783838,"{'model__min_samples_leaf': 1, 'model__min_sam...","((DecisionTreeClassifier(ccp_alpha=0.0, class_..."
1,0.733118,"{'model__C': 1, 'model__tol': 1e-08}","(LogisticRegression(C=1, class_weight='balance..."
2,0.727983,"{'model__learning_rate': 1, 'model__min_sample...",(HistGradientBoostingClassifier(l2_regularizat...
3,0.705035,"{'model__alpha': 1e-06, 'model__hidden_layer_s...","(MLPClassifier(activation='relu', alpha=1e-06,..."


# Model Selection

In [None]:
# The list is sorted by descending best_score_, so its first entry belongs to the selected model
best_entry_gs = best_score_params_estimator_gs[0]

# Unpack the score, the parameters and the fitted estimator of that model
best_score_gs, best_params_gs, best_estimator_gs = best_entry_gs

# *References*
https://github.com/yuxiaohuang/teaching/tree/master/gwu/machine_learning_I/spring_2021

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html