# Kaggle Forest Challenge - Comparing models

This notebook aims at comparing different models with different sets of features.

In [1]:
# Import useful packages
import pandas as pd
import numpy as np
import os
import seaborn as sns
from matplotlib import pyplot as plt
import time

# Import ML packages 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier	
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier		

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

from xgboost import XGBClassifier

from tpot import TPOTClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import my functions 
from my_toolkit import *

# Create the dataframes
os.chdir('/Users/camilleepitalon/Documents/DSB/11_machine_learning_2/Project')
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test-full.csv')

  from pandas import MultiIndex, Int64Index


## Initial dataset vs. preprocessed dataset

Let's first test a series of models retained on the raw data, without preprocessing.

The list of models we are going to test is the following:
- LogisticRegression (from sklearn.linear_model)
- KNeighborsClassifier (from sklearn.neighbors )
- RandomForestClassifier (from sklearn.ensemble )
- HistGradientBoostingClassifier (from sklearn.ensemble )
- ExtraTreesClassifier (from sklearn.ensemble )
- LGBMClassifier (from lightgbm)
- CatBoostClassifier (from catboost)
- XGBClassifier (from xgboost )

### 1. Testing models on initial dataset

In [2]:
# Initialiazz class properties so that no treatment is performed (raw data)
df_train = df_train
df_test = df_test
add_eng_features = False
columns_to_convert_to_log = []
polynomial_degree = 0
columns_to_polynomial = []
add_climate = False
add_geographic = False
add_family = False
add_rocky = False
add_stony = False
keep_initial_rows = True
decomposition = None
columns_to_decomp = None
columns_to_drop = None
random_state = 2

In [11]:
models = [LogisticRegression(), KNeighborsClassifier(), RandomForestClassifier(), HistGradientBoostingClassifier(), ExtraTreesClassifier(), LGBMClassifier(), CatBoostClassifier(), XGBClassifier()]

for model in models:
    print('--------')
    cl = ClassifTools(df_train, df_test, model,
        add_eng_features,
        columns_to_convert_to_log,
        polynomial_degree, columns_to_polynomial,
        add_climate, add_geographic, add_family, add_rocky, add_stony,
        keep_initial_rows, columns_to_drop,
        None, None,
        random_state)
    
    pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
    export_metrics(acc, mat, pred_time, cl)
    print_results(acc, mat, pred_time, cl)

--------
enriching the data...
number of features: 55
   -- took 12.11 sec
training model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.23 sec
predicting test_set...
   -- took 0.15 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.92 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': False, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': False, 'add_geographic': False, 'add_family': False, 'add_rocky': False, 'add_stony': False, 'keep_initial_rows': True, 'columns_to_drop': None, 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 16.06
- Accuracy: 0.3747
- Confusion matrix:
--------
enriching the data...
number of features: 55
   -- took 12.19 sec
training model...
   -- took 0.01 sec
predicting test_set...
   -- took 156.98 sec
exporting file...
computing local metrics...
   -- took 1.02 sec
> SUCCESS
-------------------------
- Parameters:  {'model': KNeighborsClassifier(), 'add_eng_features': False, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': False, 'add_geographic': False, 'add_family': False, 'ad



   -- took 5.31 sec
predicting test_set...
   -- took 0.8 sec
exporting file...
computing local metrics...




   -- took 5.04 sec
> SUCCESS
-------------------------
- Parameters:  {'model': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None), 'add_eng_features': False, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': False, 'add_geographic': False, 'add_family': False, 'add_rocky': False, 'add_stony': False, 'keep_initial

The results can be found in:
- answers/metrics_history.txt (from 31/03/2022 10:15:47 to 31/03/2022 10:21:06)
- answers/full_submission112.csv to answers/full_submission119.csv

Now, let's do the same but on the best set of parameters for each model.

### 1. Testing models on initial dataset

In [12]:
# Initialiazz class properties so that no treatment is performed (raw data)
df_train = df_train
df_test = df_test
add_eng_features = True
columns_to_convert_to_log = []
polynomial_degree = 0
columns_to_polynomial = []
add_climate = True
add_geographic = False
add_family = True
add_rocky = True
add_stony = True
keep_initial_rows = False
decomposition = None
columns_to_decomp = None
columns_to_drop = ['Id', 'ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls'],
random_state = 2

In [13]:
models = [LogisticRegression(), KNeighborsClassifier(), RandomForestClassifier(), HistGradientBoostingClassifier(), ExtraTreesClassifier(), LGBMClassifier(), CatBoostClassifier(), XGBClassifier()]

for model in models:
    print('--------')
    cl2 = ClassifTools(df_train, df_test, model,
        add_eng_features,
        columns_to_convert_to_log,
        polynomial_degree, columns_to_polynomial,
        add_climate, add_geographic, add_family, add_rocky, add_stony,
        keep_initial_rows, columns_to_drop,
        None, None,
        random_state)
    
    pred, acc, mat, pred_time = cl2.test_predict(export_file=True, compute_local_metrics=True)
    export_metrics(acc, mat, pred_time, cl2)
    print_results(acc, mat, pred_time, cl2)

--------
enriching the data...
number of features: 57
   -- took 54.94 sec
training model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.6 sec
predicting test_set...
   -- took 0.15 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.09 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': False, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': (['Id', 'ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls'],), 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 58.49
- Accuracy: 0.3879
- Confusion matrix:
--------
enriching the data...
number of features: 57
   -- took 51.92 sec
training model...
   -- took 0.0 sec
predicting test_set...
   -- took 202.47 sec
exporting file...
computing local metrics...
   -- took 3.01 sec
> SUCCESS
-------------------------
- Parameters:  {'model': KNeighborsClas



   -- took 8.66 sec
predicting test_set...
   -- took 1.29 sec
exporting file...
computing local metrics...




   -- took 9.9 sec
> SUCCESS
-------------------------
- Parameters:  {'model': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': False, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows'

The results can be found in:
- answers/metrics_history.txt (31/03/2022 10:33:30 to 31/03/2022 10:47:31)
- answers/full_submission120.csv to answers/full_submission127.csv

In [14]:
# Initialize class properties so that no treatment is performed (raw data)
df_train = df_train
df_test = df_test
add_eng_features = True
columns_to_convert_to_log = []
polynomial_degree = 0
columns_to_polynomial = []
add_climate = True
add_geographic = True
add_family = True
add_rocky = True
add_stony = True
keep_initial_rows = False
decomposition = None
columns_to_decomp = None
columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls'],
random_state = 2

models = [LogisticRegression(), KNeighborsClassifier(), RandomForestClassifier(), HistGradientBoostingClassifier(), ExtraTreesClassifier(), LGBMClassifier(), CatBoostClassifier(), XGBClassifier()]

for model in models:
    print('--------')
    cl2 = ClassifTools(df_train, df_test, model,
        add_eng_features,
        columns_to_convert_to_log,
        polynomial_degree, columns_to_polynomial,
        add_climate, add_geographic, add_family, add_rocky, add_stony,
        keep_initial_rows, columns_to_drop,
        None, None,
        random_state)
    
    pred, acc, mat, pred_time = cl2.test_predict(export_file=True, compute_local_metrics=True)
    export_metrics(acc, mat, pred_time, cl2)
    print_results(acc, mat, pred_time, cl2)

--------
enriching the data...
number of features: 61
   -- took 77.81 sec
training model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.91 sec
predicting test_set...
   -- took 0.18 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 3.01 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': (['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls'],), 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 82.88
- Accuracy: 0.3912
- Confusion matrix:
--------
enriching the data...
number of features: 61
   -- took 76.91 sec
training model...
   -- took 0.0 sec
predicting test_set...
   -- took 222.0 sec
exporting file...
computing local metrics...
   -- took 2.98 sec
> SUCCESS
-------------------------
- Parameters:  {'model': KNeighborsClassifier()



   -- took 8.5 sec
predicting test_set...
   -- took 1.16 sec
exporting file...
computing local metrics...




   -- took 9.67 sec
> SUCCESS
-------------------------
- Parameters:  {'model': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows'

The results can be found in:
- answers/metrics_history.txt (31/03/2022 10:48:54 to 31/03/2022 11:03:44)
- answers/full_submission128.csv to answers/full_submission135.csv

In [15]:
# Initialiazz class properties so that no treatment is performed (raw data)
df_train = df_train
df_test = df_test
add_eng_features = True
columns_to_convert_to_log = []
polynomial_degree = 0
columns_to_polynomial = []
add_climate = True
add_geographic = True
add_family = True
add_rocky = True
add_stony = True
keep_initial_rows = False
decomposition = None
columns_to_decomp = None
columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls'],
random_state = 2

models = [HistGradientBoostingClassifier(loss='categorical_crossentropy', learning_rate=0.01, max_iter=1000, max_depth=None), ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2), LGBMClassifier(learning_rate=0.01, n_estimators=2048, num_leaves=1028)]

for model in models:
    print('--------')
    cl2 = ClassifTools(df_train, df_test, model,
        add_eng_features,
        columns_to_convert_to_log,
        polynomial_degree, columns_to_polynomial,
        add_climate, add_geographic, add_family, add_rocky, add_stony,
        keep_initial_rows, columns_to_drop,
        None, None,
        random_state)
    
    pred, acc, mat, pred_time = cl2.test_predict(export_file=True, compute_local_metrics=True)
    export_metrics(acc, mat, pred_time, cl2)
    print_results(acc, mat, pred_time, cl2)

--------
enriching the data...
number of features: 61
   -- took 54.37 sec
training model...
   -- took 69.14 sec
predicting test_set...
   -- took 23.78 sec
exporting file...
computing local metrics...
   -- took 62.9 sec
> SUCCESS
-------------------------
- Parameters:  {'model': HistGradientBoostingClassifier(learning_rate=0.01,
                               loss='categorical_crossentropy', max_iter=1000), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': (['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls'],), 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 210.91
- Accuracy: 0.8972
- Confusion