# Kaggle Forest Challenge - Dimensionality Reduction

This notebook aims at comparing different models with different sets of features.

In [1]:
# Import useful packages
import pandas as pd
import numpy as np
import os
import seaborn as sns
from matplotlib import pyplot as plt
import time

# Import ML packages 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier	
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier		

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

from tpot import TPOTClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import prince


# Import my functions 

from my_toolkit import *

os.chdir('/Users/camilleepitalon/Documents/DSB/11_machine_learning_2/Project')
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test-full.csv')


In [2]:
df_train.head(5)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,242642,2881,130,22,210,54,1020,250,221,88,...,0,0,0,0,0,0,0,0,0,1
1,309891,3005,351,14,242,-16,1371,194,215,159,...,0,0,0,0,0,0,0,0,0,1
2,287847,3226,63,14,618,2,1092,232,210,107,...,0,0,0,0,0,0,0,0,0,1
3,516307,3298,317,8,661,60,752,198,233,174,...,0,0,0,0,0,0,0,0,0,1
4,124860,3080,35,6,175,26,3705,219,227,144,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# This function can be find in my_toolkit.py

""" def perform_mca(df, decomposition, columns_mca):
    
    columns_mca = [col for col in columns_mca if col in df.columns]
    
    df_new = df.drop(columns=columns_mca)

    X = df[columns_mca]
    decomposition.fit(X)
    X_mca = decomposition.transform(X)
    n_comp = len(X_mca.columns)
    X_mca.columns = ['C' + str(i+1) for i in range(n_comp)]

    for col in X_mca.columns:
        df_new[col] = X_mca[col]
    
    return df_new """

print()




In [7]:
""" df = X

mca = prince.MCA(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)

X_mca = perform_mca(df, mca, columns_mca) """

print()




In [10]:
# To delete 

""" # Class parameters
df_train = df_train
df_test = df_test
model = LGBMClassifier()
#model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024, num_leaves=512, n_jobs=-1)
#model = ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2)
add_eng_features = True
add_climate = True
add_geographic = True
add_family = True
add_rocky = True
add_stony = True
keep_initial_rows = False
columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls']
random_state = 2

cl = ClassifTools(df_train, df_test, model,
    add_eng_features,
    add_climate, add_geographic, add_family, add_rocky, add_stony,
    keep_initial_rows, columns_to_drop,
    random_state)

mca = prince.MCA(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)

decomposition = mca

pred, acc, mat, pred_time = cl.test_predict(decomposition=decomposition, export_file=True, compute_local_metrics=True)
export_metrics(acc, mat, pred_time, cl, decomposition=decomposition)
print_results(acc, mat, pred_time, cl) """

print()




In [11]:
# Decomposition
n_components = 10
mca = prince.MCA(
    n_components=n_components,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)

# Columns to decompose
dummies_only = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']
all_columns_except_id = ['S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']
all_columns_with_id = ['Id', 'S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']

Let's try MCA, with 3, 5 and 10 components

In [14]:
columns_to_decomp = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']

n_components_list = [3, 5, 10]

for n_components in n_components_list:
    
    models = [LogisticRegression(), ExtraTreesClassifier(), LGBMClassifier()]
    
    mca = prince.MCA(
        n_components=n_components,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42
    )

    for model in models:
        
        # Initialiazz class properties so that no treatment is performed (raw data)
        df_train = df_train
        df_test = df_test
        add_eng_features = True
        columns_to_convert_to_log = []
        polynomial_degree = 0
        columns_to_polynomial = []
        add_climate = True
        add_geographic = True
        add_family = True
        add_rocky = True
        add_stony = True
        keep_initial_rows = False
        decomposition = mca
        columns_to_decomp = columns_to_decomp
        columns_to_drop = ['Id']
        random_state = 2

        cl = ClassifTools(df_train, df_test, model,
            add_eng_features,
            columns_to_convert_to_log,
            polynomial_degree, columns_to_polynomial,
            add_climate, add_geographic, 
            add_family, add_rocky, add_stony,
            keep_initial_rows, columns_to_drop,
            decomposition, columns_to_decomp,
            random_state)

        pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
        print(mat)
        export_metrics(acc, mat, pred_time, cl)
        print_results(acc, mat, pred_time, cl)

enriching the data...
number of features: 32
   -- took 77.27 sec
training model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.6 sec
predicting test_set...
   -- took 0.12 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.53 sec
> SUCCESS
       pred6  pred1  pred5  pred7  pred2  pred3  pred4
true6    200     48      0      0     55     25    115
true1     81    153     18      1     76     49     39
true5      3     27    137    118     45     77      4
true7      0      2     28    359      8     39      0
true2     13     87     29      1    210     64      9
true3      9     62     97     64     82    136      8
true4    160      4      0      0     10      1    271
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['Id'], 'decomposition': MCA(n_components=3, n_iter=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZ

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.68 sec
predicting test_set...
   -- took 0.13 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.93 sec
> SUCCESS
       pred6  pred1  pred5  pred7  pred2  pred3  pred4
true6    148     59      1      0     35     31    169
true1     61    166     20      1     71     47     51
true5      0     31    121    121     52     78      8
true7      0      3     22    350     22     39      0
true2     10     74     36      2    217     55     19
true3      4     59     84     63     93    143     12
true4    119     13      4      0      4      3    303
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['Id'], 'decomposition': MCA(n_components=5, n_iter=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZ

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.69 sec
predicting test_set...
   -- took 0.17 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.9 sec
> SUCCESS
       pred6  pred1  pred5  pred7  pred2  pred3  pred4
true6    164     69      1      0     32     21    156
true1     84    159     23      0     55     45     51
true5      1     31    113    122     43     94      7
true7      0      0     19    330     10     77      0
true2     13     84     29      3    193     75     16
true3     12     62     97     55     69    152     11
true4    119      4      0      4      5      0    314
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['Id'], 'decomposition': MCA(n_components=10, n_iter=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZ

Now, let's try PCA, with 3, 5 and 10 components

In [15]:
columns_to_decomp = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']

n_components_list = [3, 5, 10]

for n_components in n_components_list:
    
    models = [LogisticRegression(), ExtraTreesClassifier(), LGBMClassifier()]
    
    pca = prince.PCA(
        n_components=n_components,
        n_iter=3,
        rescale_with_mean=True,
        rescale_with_std=True,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42
    )

    for model in models:
        
        # Initialiazz class properties so that no treatment is performed (raw data)
        df_train = df_train
        df_test = df_test
        add_eng_features = True
        columns_to_convert_to_log = []
        polynomial_degree = 0
        columns_to_polynomial = []
        add_climate = True
        add_geographic = True
        add_family = True
        add_rocky = True
        add_stony = True
        keep_initial_rows = False
        decomposition = pca
        columns_to_decomp = columns_to_decomp
        columns_to_drop = ['Id']
        random_state = 2

        cl = ClassifTools(df_train, df_test, model,
            add_eng_features,
            columns_to_convert_to_log,
            polynomial_degree, columns_to_polynomial,
            add_climate, add_geographic, 
            add_family, add_rocky, add_stony,
            keep_initial_rows, columns_to_drop,
            decomposition, columns_to_decomp,
            random_state)

        pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
        print(mat)
        export_metrics(acc, mat, pred_time, cl)
        print_results(acc, mat, pred_time, cl)

enriching the data...
number of features: 32
   -- took 62.73 sec
training model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.82 sec
predicting test_set...
   -- took 0.13 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.47 sec
> SUCCESS
       pred6  pred1  pred5  pred7  pred2  pred3  pred4
true6    124     65      1      0     44     26    183
true1     56    154     20      0     86     44     57
true5      1     23    112    119     66     83      7
true7      0      2     24    341     28     41      0
true2      5     68     28      0    247     52     13
true3      6     52     91     56    105    136     12
true4    104     10      3      1      5      1    322
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['Id'], 'decomposition': PCA(n_components=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'C

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.76 sec
predicting test_set...
   -- took 0.14 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.54 sec
> SUCCESS
       pred6  pred1  pred5  pred7  pred2  pred3  pred4
true6    155     54      3      0     46     23    162
true1     67    159     29      1     72     39     50
true5      2     26    131    122     51     73      6
true7      0      0     30    334     15     57      0
true2     11     72     33      3    219     64     11
true3      8     47    113     51     97    132     10
true4    134      4      2      2      6      1    297
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['Id'], 'decomposition': PCA(n_components=5, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'C

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 0.7 sec
predicting test_set...
   -- took 0.16 sec
exporting file...
computing local metrics...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


   -- took 2.55 sec
> SUCCESS
       pred6  pred1  pred5  pred7  pred2  pred3  pred4
true6    139     32      4      0     56     27    185
true1     67    135     25      1     76     45     68
true5      0     20    129    120     55     81      6
true7      0      0     26    338     13     59      0
true2     20     53     37      3    219     61     20
true3      7     38    101     56    102    142     12
true4     85      0      4      0     11      5    341
-------------------------
- Parameters:  {'model': LogisticRegression(), 'add_eng_features': True, 'columns_to_convert_to_log': [], 'polynomial_degree': 0, 'columns_to_polynomial': [], 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['Id'], 'decomposition': PCA(n_components=10, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', '

In [26]:
dummies_only = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']
all_columns_except_id = ['S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']
all_columns_with_id = ['Id', 'S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']

for n_components in [5, 8, 12, 15, 20, 25]:
    for columns_to_decomp in [dummies_only, all_columns_except_id, all_columns_with_id]:
        
        mca = prince.MCA(
            n_components=n_components,
            n_iter=3,
            copy=True,
            check_input=True,
            engine='auto',
            random_state=42
        )

        decomposition = mca

        # Class parameters
        df_train = df_train
        df_test = df_test
        model = LGBMClassifier()
        """model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024, num_leaves=512, n_jobs=-1)
        model = ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2) """
        add_eng_features = True
        add_climate = True
        add_geographic = True
        add_family = True
        add_rocky = True
        add_stony = True
        keep_initial_rows = False
        columns_to_drop = None
        decomposition=decomposition
        columns_to_decomp=columns_to_decomp
        random_state = 2

        cl = ClassifTools(df_train, df_test, model,
            add_eng_features,
            add_climate, add_geographic, add_family, add_rocky, add_stony,
            keep_initial_rows, columns_to_drop,
            decomposition, columns_to_decomp,
            random_state)

        pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
        export_metrics(acc, mat, pred_time, cl)
        print_results(acc, mat, pred_time, cl)

enriching the data...
number of features: 22
   -- took 69.67 sec
training model...
   -- took 0.65 sec
predicting test_set...
   -- took 2.34 sec
exporting file...
computing local metrics...
   -- took 2.62 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(), 'add_eng_features': True, 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': MCA(n_components=5, n_iter=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral'

In [None]:
dummies_only = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']
all_columns_except_id = ['S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']
all_columns_with_id = ['Id', 'S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']

for n_components in [5, 8, 12, 15, 20, 25]:
    for columns_to_decomp in [dummies_only, all_columns_except_id, all_columns_with_id]:
        
        pca = prince.PCA(
            n_components=n_components,
            n_iter=3,
            rescale_with_mean=True,
            rescale_with_std=True,
            copy=True,
            check_input=True,
            engine='auto',
            random_state=42
        )

        decomposition = pca

        # Class parameters
        df_train = df_train
        df_test = df_test
        model = LGBMClassifier()
        """model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024, num_leaves=512, n_jobs=-1)
        model = ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2) """
        add_eng_features = True
        add_climate = True
        add_geographic = True
        add_family = True
        add_rocky = True
        add_stony = True
        keep_initial_rows = False
        columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology']
        decomposition=decomposition
        columns_to_decomp=columns_to_decomp
        random_state = 2

        cl = ClassifTools(df_train, df_test, model,
            add_eng_features,
            add_climate, add_geographic, add_family, add_rocky, add_stony,
            keep_initial_rows, columns_to_drop,
            decomposition, columns_to_decomp,
            random_state)

        pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
        export_metrics(acc, mat, pred_time, cl)
        print_results(acc, mat, pred_time, cl)

In [21]:
['S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']

['S_stony',
 'Soil_Type39',
 'R_Cryumbrepts',
 'F_Moran',
 'F_Como',
 'Rock',
 'ClimZone_2',
 'Wilderness_Area3',
 'F_Cathedral',
 'Soil_Type8',
 'Hillshade_Noon',
 'Soil_Type16',
 'Hillshade_3pm',
 'F_Catamount',
 'Soil_Type32',
 'Aspect',
 'Soil_Type24',
 'F_Ratake',
 'Soil_Type6',
 'Soil_Type12',
 'R_Cryorthents',
 'R_Rock_outcrop',
 'Id',
 'Hillshade_9am',
 'Soil_Type4',
 'Soil_Type14',
 'Soil_Type40',
 'F_Vanet',
 'ClimZone_8',
 'Soil_Type5',
 'Soil_Type2',
 'Soil_Type22',
 'Soil_Type3',
 'Soil_Type27',
 'Wilderness_Area1',
 'Soil_Type30',
 'Soil_Type36',
 'F_Bullwark',
 'R_till_substratum',
 'Soil_Type38',
 'Soil_Type21',
 'R_Ratake_families',
 'S_extremely stony',
 'Soil_Type35',
 'F_Leighcan',
 'R_Rock_land',
 'Soil_Type33',
 'Slope',
 'Soil_Type1',
 'Horizontal_Distance_To_Fire_Points',
 'Soil_Type9',
 'GeoZone_7',
 'Cover_Type',
 'Soil_Type23',
 'Soil_Type11',
 'ClimZone_6',
 'Wilderness_Area4',
 'Soil_Type37',
 'Wilderness_Area2',
 'ClimZone_5',
 'Elevation',
 'Soil_Type15',

In [27]:
dummies_only = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']
all_columns_except_id = ['S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']
all_columns_with_id = ['Id', 'S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']

for n_components in [5, 8, 12, 15, 20, 25]:
    for columns_to_decomp in [dummies_only, all_columns_except_id, all_columns_with_id]:
        
        pca = prince.PCA(
            n_components=n_components,
            n_iter=3,
            rescale_with_mean=True,
            rescale_with_std=True,
            copy=True,
            check_input=True,
            engine='auto',
            random_state=42
        )


        decomposition = pca

        # Class parameters
        df_train = df_train
        df_test = df_test
        model = LGBMClassifier()
        """model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024, num_leaves=512, n_jobs=-1)
        model = ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2) """
        add_eng_features = True
        add_climate = True
        add_geographic = True
        add_family = True
        add_rocky = True
        add_stony = True
        keep_initial_rows = False
        columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology']
        decomposition=decomposition
        columns_to_decomp=columns_to_decomp
        random_state = 2

        cl = ClassifTools(df_train, df_test, model,
            add_eng_features,
            add_climate, add_geographic, add_family, add_rocky, add_stony,
            keep_initial_rows, columns_to_drop,
            decomposition, columns_to_decomp,
            random_state)

        pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
        export_metrics(acc, mat, pred_time, cl)
        print_results(acc, mat, pred_time, cl)

enriching the data...
number of features: 22
   -- took 59.14 sec
training model...
   -- took 1.17 sec
predicting test_set...
   -- took 2.35 sec
exporting file...
computing local metrics...
   -- took 2.31 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(), 'add_eng_features': True, 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': PCA(n_components=5, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratak

In [5]:
dummies_only = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']
all_columns_except_id = ['S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']
all_columns_with_id = ['Id', 'S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']

for n_components in [9]:
    for eng in [True, False]:   
        mca = prince.MCA(
            n_components=n_components,
            n_iter=3,
            copy=True,
            check_input=True,
            engine='auto',
            random_state=42
        )

        decomposition = mca

        # Class parameters
        df_train = df_train
        df_test = df_test
        model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024, num_leaves=512, n_jobs=-1)
        """model = LGBMClassifier()
        model = ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2) """
        add_eng_features = True
        add_climate = eng
        add_geographic = True
        add_family = True
        add_rocky = True
        add_stony = True
        keep_initial_rows = False
        columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology']
        decomposition=decomposition
        columns_to_decomp=dummies_only
        random_state = 2

        cl = ClassifTools(df_train, df_test, model,
            add_eng_features,
            add_climate, add_geographic, add_family, add_rocky, add_stony,
            keep_initial_rows, columns_to_drop,
            decomposition, columns_to_decomp,
            random_state)

        pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
        export_metrics(acc, mat, pred_time, cl)
        print_results(acc, mat, pred_time, cl)

enriching the data...
number of features: 26
   -- took 117.78 sec
training model...




   -- took 198.83 sec
predicting test_set...
   -- took 456.1 sec
exporting file...
computing local metrics...




   -- took 164.47 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024,
               num_leaves=512), 'add_eng_features': True, 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': MCA(n_components=9, n_iter=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F



   -- took 148.12 sec
predicting test_set...
   -- took 336.5 sec
exporting file...
computing local metrics...




   -- took 140.32 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024,
               num_leaves=512), 'add_eng_features': False, 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': MCA(n_components=9, n_iter=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', '

In [6]:
dummies_only = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legault', 'F_Catamount', 'F_Bullwark', 'F_Gateview', 'F_Leighcan', 'F_Como', 'F_Moran', 'R_Rock_outcrop', 'R_Ratake_families', 'R_Rock_land', 'R_till_substratum', 'R_Cryumbrepts', 'R_Cryorthents', 'Rock', 'S_rubbly', 'S_stony', 'S_very stony', 'S_extremely stony']
all_columns_except_id = ['S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']
all_columns_with_id = ['Id', 'S_stony', 'Soil_Type39', 'R_Cryumbrepts', 'F_Moran', 'F_Como', 'Rock', 'ClimZone_2', 'Wilderness_Area3', 'F_Cathedral', 'Soil_Type8', 'Hillshade_Noon', 'Soil_Type16', 'Hillshade_3pm', 'F_Catamount', 'Soil_Type32', 'Aspect', 'Soil_Type24', 'F_Ratake', 'Soil_Type6', 'Soil_Type12', 'R_Cryorthents', 'R_Rock_outcrop', 'Hillshade_9am', 'Soil_Type4', 'Soil_Type14', 'Soil_Type40', 'F_Vanet', 'ClimZone_8', 'Soil_Type5', 'Soil_Type2', 'Soil_Type22', 'Soil_Type3', 'Soil_Type27', 'Wilderness_Area1', 'Soil_Type30', 'Soil_Type36', 'F_Bullwark', 'R_till_substratum', 'Soil_Type38', 'Soil_Type21', 'R_Ratake_families', 'S_extremely stony', 'Soil_Type35', 'F_Leighcan', 'R_Rock_land', 'Soil_Type33', 'Slope', 'Soil_Type1', 'Horizontal_Distance_To_Fire_Points', 'Soil_Type9', 'GeoZone_7', 'Cover_Type', 'Soil_Type23', 'Soil_Type11', 'ClimZone_6', 'Wilderness_Area4', 'Soil_Type37', 'Wilderness_Area2', 'ClimZone_5', 'Elevation', 'Soil_Type15', 'Soil_Type10', 'F_Gateview', 'S_rubbly', 'Soil_Type25', 'Soil_Type29', 'Soil_Type26', 'Soil_Type20', 'Soil_Type28', 'Soil_Type19', 'ClimZone_7', 'F_Legault', 'Soil_Type13', 'GeoZone_1', 'GeoZone_2', 'Soil_Type34', 'Horizontal_Distance_To_Roadways', 'Soil_Type31', 'S_very stony', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Hydrology', 'ClimZone_4', 'Soil_Type7', 'Soil_Type18', 'Soil_Type17']

for en in [True, False]:
    for eng in [True, False]:   
        mca = prince.MCA(
            n_components=n_components,
            n_iter=3,
            copy=True,
            check_input=True,
            engine='auto',
            random_state=42
        )

        decomposition = mca

        # Class parameters
        df_train = df_train
        df_test = df_test
        model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024, num_leaves=512, n_jobs=-1)
        """model = LGBMClassifier()
        model = ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2) """
        add_eng_features = True
        add_climate = en
        add_geographic = eng
        add_family = True
        add_rocky = True
        add_stony = True
        keep_initial_rows = False
        columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology']
        decomposition=None
        columns_to_decomp=None
        random_state = 2

        cl = ClassifTools(df_train, df_test, model,
            add_eng_features,
            add_climate, add_geographic, add_family, add_rocky, add_stony,
            keep_initial_rows, columns_to_drop,
            decomposition, columns_to_decomp,
            random_state)

        pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
        export_metrics(acc, mat, pred_time, cl)
        print_results(acc, mat, pred_time, cl)

enriching the data...
number of features: 51
   -- took 55.27 sec
training model...




   -- took 94.87 sec
predicting test_set...
   -- took 395.79 sec
exporting file...
computing local metrics...




   -- took 103.41 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024,
               num_leaves=512), 'add_eng_features': True, 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 650.04
- Accuracy: 0.913
- Confusion matrix:
enriching the data...
number of features: 48
   -- took 54.14 sec
training model...




   -- took 105.62 sec
predicting test_set...
   -- took 396.54 sec
exporting file...
computing local metrics...




   -- took 86.12 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024,
               num_leaves=512), 'add_eng_features': True, 'add_climate': True, 'add_geographic': False, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 643.1
- Accuracy: 0.9137
- Confusion matrix:
enriching the data...
number of features: 45
   -- took 51.59 sec
training model...




   -- took 85.82 sec
predicting test_set...
   -- took 316.65 sec
exporting file...
computing local metrics...




   -- took 101.86 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024,
               num_leaves=512), 'add_eng_features': True, 'add_climate': False, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 556.54
- Accuracy: 0.9101
- Confusion matrix:
enriching the data...
number of features: 42
   -- took 52.16 sec
training model...




   -- took 85.59 sec
predicting test_set...
   -- took 367.46 sec
exporting file...
computing local metrics...




   -- took 84.13 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024,
               num_leaves=512), 'add_eng_features': True, 'add_climate': False, 'add_geographic': False, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls', 'Vertical_Distance_To_Hydrology'], 'decomposition': None, 'columns_to_decomp': None, 'random_state': 2}
- Execution time: 590.02
- Accuracy: 0.912
- Confusion matrix:


In [8]:
df_train.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

In [17]:
decomposition = mca
columns_to_decomp = dummies_only

mca = prince.MCA(
    n_components=9,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)


# Class parameters
df_train = df_train
df_test = df_test
model = LGBMClassifier()
"""model = LGBMClassifier(learning_rate=0.01, max_depth=32, num_iteration=1024, num_leaves=512, n_jobs=-1)
model = ExtraTreesClassifier(n_estimators=300, max_features = None, min_samples_leaf= 1, min_samples_split= 2) """
add_eng_features = True
add_climate = True
add_geographic = True
add_family = True
add_rocky = True
add_stony = True
keep_initial_rows = False
columns_to_drop = ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls']
decomposition=decomposition
columns_to_decomp=columns_to_decomp
random_state = 2

cl = ClassifTools(df_train, df_test, model,
    add_eng_features,
    add_climate, add_geographic, add_family, add_rocky, add_stony,
    keep_initial_rows, columns_to_drop,
    decomposition, columns_to_decomp,
    random_state)

pred, acc, mat, pred_time = cl.test_predict(export_file=True, compute_local_metrics=True)
export_metrics(acc, mat, pred_time, cl)
print_results(acc, mat, pred_time, cl)

enriching the data...
number of features: 27
   -- took 73.81 sec
training model...
   -- took 0.85 sec
predicting test_set...
   -- took 2.79 sec
exporting file...
computing local metrics...
   -- took 3.38 sec
> SUCCESS
-------------------------
- Parameters:  {'model': LGBMClassifier(), 'add_eng_features': True, 'add_climate': True, 'add_geographic': True, 'add_family': True, 'add_rocky': True, 'add_stony': True, 'keep_initial_rows': False, 'columns_to_drop': ['ClimZone_3', 'GeoZone_5', 'F_Gothic', 'F_Troutville', 'F_Rogert', 'F_Bross', 'R_Limber_families', 'R_Aquolis', 'R_Cryoborolis', 'R_Cryaquolis', 'R_Borohemists', 'R_Cryaquepts', 'R_Cryaquolls'], 'decomposition': MCA(n_components=9, n_iter=3, random_state=42), 'columns_to_decomp': ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'ClimZone_2', 'ClimZone_4', 'ClimZone_5', 'ClimZone_6', 'ClimZone_7', 'ClimZone_8', 'GeoZone_1', 'GeoZone_2', 'GeoZone_7', 'F_Cathedral', 'F_Ratake', 'F_Vanet', 'F_Legaul

Unnamed: 0,true6,true1,true5,true7,true2,true3,true4
pred6,343,72,0,0,4,0,24
pred1,42,328,14,0,22,8,3
pred5,0,3,366,12,4,26,0
pred7,0,0,4,429,0,3,0
pred2,0,14,2,0,393,4,0
pred3,0,0,33,15,2,408,0
pred4,7,0,0,0,0,0,439


In [12]:
df_modified.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Euclidian_Distance_To_Hydrology', 'Mean_Hillshade', 'Mean_HDistances',
       'Mean_Elevation_Vertical_Distance_Hydrology',
       'Mean_Distance_Hydrology_Firepoints',
       'Mean_Distance_Hydrology_Roadways', 'Mean_Distance_Firepoints_Roadways',
       'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9'],
      dtype='object')

In [13]:
df_train.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

In [16]:
df_modified[['C1', 'C4']]

Unnamed: 0,C1,C4
0,-0.466304,0.608712
1,-0.466304,0.608712
2,0.380818,0.029546
3,-0.183783,0.337768
4,-0.466304,0.608712
...,...,...
581007,2.338524,3.647529
581008,2.338524,3.647529
581009,2.338524,3.647529
581010,2.338524,3.647529
