# Kaggle Forest Challenge - Modelling

In [1]:
# Import useful packages
import pandas as pd
import numpy as np
import os
import seaborn as sns
from matplotlib import pyplot as plt

import time

# Import ML packages 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier	
from sklearn.ensemble import HistGradientBoostingClassifier				

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

# Import my functions 
from my_functions import *
from my_dictionaries import *

In [3]:
os.chdir('/Users/camilleepitalon/Documents/DSB/11_machine_learning_2/Project')
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test-full.csv')

## Preprocessing

In [4]:
columns = list(df_test.columns)
num_columns = [col for col in columns if 'Soil_Type' not in col and 'Wilderness_Area' not in col and 'Id' not in col]
num_columns_enriched = num_columns + ['ClimZone', 'GeoZone']
dummy_cols = [col for col in columns if col not in num_columns and 'Id' not in col]

In [5]:
X_train, X_test, y_train, y_test = split_trainset(df_train, test_size=0.2)
X, y = split_trainset(df_train, test_size=0)

In [6]:
X_train

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
998,17888,2702,8,8,42,3,2072,211,223,150,...,0,0,0,0,0,0,0,0,0,0
11712,423652,2749,311,18,323,92,1869,170,225,195,...,0,0,0,0,0,0,0,0,0,0
11957,255002,2286,329,23,162,90,792,158,206,184,...,0,0,0,0,0,0,0,0,0,0
10210,20,2503,38,5,85,10,741,220,228,144,...,0,0,0,0,0,0,0,0,0,0
12698,251697,2694,24,6,0,0,1294,217,227,147,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11633,250671,2153,328,29,0,0,573,139,194,188,...,0,0,0,0,0,0,0,0,0,0
1344,463370,3063,63,10,30,4,743,229,220,123,...,0,0,0,0,0,0,0,0,0,0
12815,299330,2422,2,22,60,13,607,184,194,142,...,0,0,0,0,0,0,0,0,0,0
7293,269651,2140,264,22,60,8,997,160,244,220,...,0,0,0,0,0,0,0,0,0,0


## Modelling

In [6]:
preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns), ('pass', 'passthrough', dummy_cols)])
param_grid = {
    'max_depth': [3, 10, 20],
    'n_estimators': [100, 500],
    'verbose': [False]
}
model = CatBoostClassifier()
GSCV = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, cv=4)									

In [55]:
#GSCV.get_params().keys()

In [53]:
# Perform GridSearchCV
GSCV.fit(X_train, y_train)
print(GSCV.best_score_)
print(GSCV.best_params_)					
best_model = GSCV.best_estimator_
y_pred_best = best_model.predict(X_test)
print(accuracy_score(y_test, y_pred_best))
confusion_matrix(y_test, y_pred_best)

8 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/camilleepitalon/miniconda3/envs/DS/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/camilleepitalon/miniconda3/envs/DS/lib/python3.9/site-packages/catboost/core.py", line 4675, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/Users/camilleepitalon/miniconda3/envs/DS/lib/python3.9/site-packages/catboost/core.py", line 1981, in _fit
    train_params = s

0.8742559523809524
{'max_depth': 10, 'n_estimators': 500, 'verbose': False}
0.8779761904761905


array([[339,  60,   0,   0,   9,   2,  27],
       [ 61, 320,  14,   0,  36,   5,   2],
       [  0,   2, 361,  13,   9,  34,   0],
       [  0,   0,  13, 390,   0,   3,   0],
       [  1,  15,   7,   0, 417,   3,   0],
       [  0,   4,  27,  11,   3, 393,   0],
       [  8,   0,   0,   0,   0,   0, 435]])

In [6]:
# Test not-tuned LGBMClassifier on df_train

model = LGBMClassifier()

accuracy1, matrix1 = local_metrics(
    df_train, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy1)
matrix1

0.8878968253968254


Unnamed: 0,1,2,3,4,5,6,7
1,346,57,0,0,9,3,22
2,54,334,7,0,33,9,1
3,0,1,363,12,9,34,0
4,0,0,10,391,0,5,0
5,0,16,3,0,420,4,0
6,0,4,26,8,2,398,0
7,10,0,0,0,0,0,433


In [15]:
# Test not-tuned LGBMClassifier on df_train + climate and geo

clim_and_geo = FunctionTransformer(climate_and_geo)
preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns), ('pass', 'passthrough', dummy_cols+['Id'])])
model = LGBMClassifier()
pipe = make_pipeline(clim_and_geo, model)

accuracy1b, matrix1b = local_metrics(
    df_train, pipe, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy1b)
matrix1b

0.8928571428571429


Unnamed: 0,1,2,3,4,5,6,7
1,349,55,0,0,9,2,22
2,46,343,10,0,31,7,1
3,0,2,367,13,11,26,0
4,0,0,12,390,0,4,0
5,0,17,3,0,419,4,0
6,0,6,25,7,3,397,0
7,8,0,0,0,0,0,435


In [28]:
# Test tuned LGBMClassifier on df_train

model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)

accuracy2, matrix2 = local_metrics(
    df_train, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2)
matrix2



0.8988095238095238


Unnamed: 0,1,2,3,4,5,6,7
1,350,53,0,0,9,3,22
2,45,341,8,0,33,8,3
3,0,2,374,13,7,23,0
4,0,0,10,394,0,2,0
5,1,17,3,0,420,2,0
6,0,4,19,8,3,404,0
7,8,0,0,0,0,0,435


In [16]:
# Test tuned LGBMClassifier on df_train + climate and geo

clim_and_geo = FunctionTransformer(climate_and_geo)
preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns), ('pass', 'passthrough', dummy_cols+['Id'])])
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)
pipe = make_pipeline(clim_and_geo, model)

accuracy2b, matrix2b = local_metrics(
    df_train, pipe, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2b)
matrix2b



0.8991402116402116


Unnamed: 0,1,2,3,4,5,6,7
1,353,49,0,0,10,3,22
2,47,343,6,0,32,8,2
3,0,2,376,15,6,20,0
4,0,0,8,395,0,3,0
5,1,16,6,0,418,2,0
6,0,5,21,9,4,399,0
7,8,0,0,0,0,0,435


In [None]:
# Test tuned LGBMClassifier on df_train + climate and geo

clim_and_geo = FunctionTransformer(climate_and_geo)
preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns_enriched), ('pass', 'passthrough', dummy_cols+['Id'])])
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)
pipe = make_pipeline(clim_and_geo, preprocessor, model)

accuracy2c, matrix2c = local_metrics(
    df_train, pipe, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy2c)
matrix2c

In [7]:
# Test not-tuned HistGradientBoostingClassifier on df_train

model = HistGradientBoostingClassifier()

accuracy3, matrix3 = local_metrics(
    df_train, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy3)
matrix3

0.8902116402116402


Unnamed: 0,1,2,3,4,5,6,7
1,352,53,0,0,8,2,22
2,56,327,10,0,38,5,2
3,0,1,371,13,6,28,0
4,0,0,10,392,0,4,0
5,0,20,5,0,417,1,0
6,0,6,24,7,3,398,0
7,8,0,0,0,0,0,435


In [46]:
# Test quickly tuned HistGradientBoostingClassifier on df_train

model = HistGradientBoostingClassifier(learning_rate=0.1, max_leaf_nodes=512)

accuracy4, matrix4 = local_metrics(
    df_train, model, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy4)
matrix4

0.8915343915343915


Unnamed: 0,1,2,3,4,5,6,7
1,346,53,0,0,11,2,25
2,48,340,8,0,33,7,2
3,1,2,368,16,7,25,0
4,0,0,10,393,0,3,0
5,0,20,3,0,417,3,0
6,0,4,26,8,1,399,0
7,10,0,0,0,0,0,433


In [10]:
# Test quickly tuned HistGradientBoostingClassifier on df_train


preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns), ('pass', 'passthrough', dummy_cols+['Id'])])
model = HistGradientBoostingClassifier(learning_rate=0.1, max_leaf_nodes=512)
pipe = pipe = make_pipeline(preprocessor, model)

accuracy5, matrix5 = local_metrics(
    df_train, pipe, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy5)
matrix5

0.892526455026455


Unnamed: 0,1,2,3,4,5,6,7
1,343,57,0,0,8,2,27
2,45,342,11,0,31,8,1
3,0,2,372,11,7,27,0
4,0,0,8,396,0,2,0
5,1,16,4,0,419,3,0
6,0,6,29,9,2,392,0
7,8,0,0,0,0,0,435


In [13]:
# Test quickly tuned HistGradientBoostingClassifier on df_train

clim_and_geo = FunctionTransformer(climate_and_geo)
preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns), ('pass', 'passthrough', dummy_cols+['Id'])])
model = HistGradientBoostingClassifier(learning_rate=0.1, max_leaf_nodes=512)
pipe = make_pipeline(clim_and_geo, model)

accuracy6, matrix6 = local_metrics(
    df_train, pipe, test_size=0.2,
    display_accuracy=True, display_matrix=True)

print(accuracy6)
matrix6

0.8971560846560847


Unnamed: 0,1,2,3,4,5,6,7
1,345,59,0,0,8,2,23
2,43,350,9,0,29,6,1
3,0,2,371,12,10,24,0
4,0,0,11,393,0,2,0
5,0,15,5,0,422,1,0
6,0,4,19,12,3,400,0
7,10,1,0,0,0,0,432


## Exporting results

In [5]:
model1 = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)
pred1 = test_predict(df_train=df_train, df_test=df_test, model=model1, export_file=True, display_local_metrics=False)

spliting data...
training model...




-- training took 135.14 sec
predicting test_set...
can take several minutes...
predicting took 402.53 sec
creating dataframe...
exporting file...
file created!


In [6]:
model2 = HistGradientBoostingClassifier(learning_rate=0.1, max_leaf_nodes=512)
pred2 = test_predict(df_train=df_train, df_test=df_test, model=model2, export_file=True, display_local_metrics=False)

spliting data...
training model...
-- training took 83.81 sec
predicting test_set...
can take several minutes...
predicting took 4.24 sec
creating dataframe...
exporting file...
file created!


NameError: name 'my_dictionaries' is not defined

In [12]:
# Test tuned LGBMClassifier on df_train + add_zone_info
from my_functions import add_zone_info


add_zone_info_ = FunctionTransformer(add_zone_info)
#preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns), ('pass', 'passthrough', dummy_cols+['Id'])])
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)
pipe = make_pipeline(add_zone_info_, model)

pred2 = test_predict(df_train=df_train, df_test=df_test, model=pipe, export_file=True, display_local_metrics=False)

spliting data...
training model...


ValueError: A given column is not a column of the dataframe

In [8]:
add_zone_info(df_train)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,R_till_substratum,R_Cryaquepts,R_Cryumbrepts,R_Cryorthents,R_Cryaquolls,Rock,S_rubbly,S_stony,S_very stony,S_extremely stony
0,242642,2881,130,22,210,54,1020,250,221,88,...,0,0,0,0,0,1,0,0,0,1
1,309891,3005,351,14,242,-16,1371,194,215,159,...,0,0,0,0,0,0,0,0,0,1
2,287847,3226,63,14,618,2,1092,232,210,107,...,0,0,0,0,0,0,0,0,0,1
3,516307,3298,317,8,661,60,752,198,233,174,...,1,0,0,0,1,0,0,0,0,0
4,124860,3080,35,6,175,26,3705,219,227,144,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15115,475155,3328,321,13,323,12,5109,186,227,180,...,0,0,0,0,1,0,0,0,0,1
15116,514378,3455,37,5,841,92,939,220,229,146,...,0,0,0,1,0,1,0,0,0,1
15117,368425,3279,90,14,404,113,1513,240,218,105,...,0,0,0,0,0,0,0,0,0,1
15118,537844,3589,357,9,418,52,1868,205,223,155,...,0,0,0,1,0,1,0,0,0,1


In [16]:
X_train.shape

(12096, 55)

In [28]:
X_train = add_zone_info(df=X, climate=True, geographic=True, family=True, rock=True, stony=True, drop_initial_rows=True)
X_test = add_zone_info(df=df_test, climate=True, geographic=True, family=True, rock=True, stony=True, drop_initial_rows=True)

In [29]:
preprocessor = ColumnTransformer([('standard_scaler', StandardScaler(), num_columns), ('pass', 'passthrough', [col for col in list(X_train.columns) if col not in num_columns])])
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [30]:
model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iterations=1024, num_leaves=512, n_jobs=-1)
model.fit(X_train, y)
preds = model.predict(X_test)



TypeError: 'LGBMClassifier' object is not callable

In [31]:
preds = model.predict(X_test)

In [32]:
df = pd.DataFrame(preds)
df.reset_index(inplace=True)
df.rename({'index':'Id', 0:'Cover_type'}, axis='columns', inplace=True)
df['Id'] = df['Id'].apply(lambda x : x + 1)

In [33]:
# Export the predictions by writing a csv file in the 'answers' folder (if asked)
print('exporting file...')
os.chdir('/Users/camilleepitalon/Documents/DSB/11_machine_learning_2/Project/')
try:
    os.chdir('answers')
except: 
    os.mkdir('answers')
    os.chdir('answers')
attempt_num = str(len(os.listdir()))
df.to_csv('full_submission'+attempt_num+'.csv', index=False)
os.chdir('..')
print('file created!')

exporting file...
file created!


In [25]:
other_columns = [col for col in list(X_train.columns) if col not in num_columns]

In [24]:
X_train.columns()

TypeError: 'Index' object is not callable

In [2]:
from my_class import MyClass

In [3]:
os.chdir('/Users/camilleepitalon/Documents/DSB/11_machine_learning_2/Project')
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test-full.csv')

class1 = MyClass(
    df_train,
    df_test,
    LGBMClassifier(),
    True,   #climate
    True,   #geographic
    True,   #family
    True,   #rock
    True,   #stony
    True,   #drop_initial_rows
    10      #randomstate
)
        

In [4]:
class1.test_predict(export_file=True, display_local_metrics=True)

enriching the data...


TypeError: split_trainset() got multiple values for argument 'split_test_size'