In [10]:
# Importa librerías
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
import featuretools as ft
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.datasets import make_imbalance
import lightgbm as lgb
import tpot
import datetime
from utilities import get_prediction_class, get_confussion_matrix, flip_negative_positive_class
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings('ignore')


# Algoritmos

Se va a trabajar con dos algoritmos que resultaron de la corrida del set de datos utilizando ***TPOT*** con el set de datos *undersampled*, juzgados por esa librería como los dos que mejor valor de validación cruzada lograban:

* En primer lugar, con un valor de *cross validation* de 0.920, resultó un modelo de tipo Naïve Bayes Gaussiano. Que un clasificador de tipo naïve Bayes haya sido seleccionado condice con lo encontrado en la etapa de descubrimiento de datos, donde notamos que algunas variables tenían una distribución de probabilidad marcadamente distinta entre las clases.

* En segundo lugar utilizaremos el modelo RandomForest que utilizamos durante la etapa de feature engineering. Debido al gran número de variables, muchas de las cuales eran irrelevantes, un algoritmo de tipo árbol de decisión logra eliminar el *overhead* de tener que considerar cada una de ellas en el entrenamiento del modelo (como pasaría en una red neuronal) estudiando la cantidad de información que provee cada una de ellas.

Se procede a correr los modelos generados por TPOT con los sets balanceados y el set original a fin de ver cuáles predicen mejor.

## Naïve Bayes


In [2]:
# Directory holding the competition CSVs. Defined once so the three reads below
# cannot drift apart if the data is relocated.
DATA_DIR = 'santander-customer-transaction-prediction'

# Original training set plus the under-/over-sampled files (how those two were
# generated is not shown in this notebook).
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
train_undersampled = pd.read_csv(os.path.join(DATA_DIR, 'train_undersampled.csv'))
train_oversampled = pd.read_csv(os.path.join(DATA_DIR, 'train_oversampled.csv'))
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB


In [3]:
# Features and labels of the full original training set; later cells score each
# fitted model against these two objects.
prediction_data = train.drop(columns=["target", "ID_code"])
prediction_target = train.target

### Entrenamiento con set oversampled

In [124]:
# Build the feature matrix and the label vector from the oversampled set.
X = train_oversampled.drop(["ID_code", "target"], axis=1).values
Y = train_oversampled.target.values

# Hold out 25% of the rows for validation (75/25 split). test_size is spelled
# out because train_test_split's default is 0.25, not one third.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Gaussian Naive Bayes with default hyperparameters.
exported_pipeline = GaussianNB()

exported_pipeline.fit(train_X, train_Y)
results = exported_pipeline.predict(test_X)

In [125]:
# Hold-out metrics (sklearn's default positive label, 1), then the confusion matrix.
print("Precision: ", precision_score(y_true=test_Y, y_pred=results))
print("Recall: ", recall_score(y_true=test_Y, y_pred=results))
print("F1 Score: ", f1_score(y_true=test_Y, y_pred=results))
get_confussion_matrix(test_Y, results)

Precision:  0.809948549731515
Recall:  0.8025914464456665
F1 Score:  0.8062532149486725


Unnamed: 0,target_0,target_1
target_0,36575,8867
target_1,8459,36050


#### Resultados con set original

In [126]:
# Score the Naive Bayes model fitted on the oversampled split against the
# complete original training set.
# NOTE(review): if the oversampled file was derived from train.csv, these rows
# overlap with the data the model was fitted on, so the figures may be
# optimistic; confirm how the resampled files were built.
results = exported_pipeline.predict(prediction_data)
print("Precision: ", precision_score(y_true=prediction_target, y_pred=results))
print("Recall: ", recall_score(y_true=prediction_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=prediction_target, y_pred=results))
get_confussion_matrix(prediction_target, results)

Precision:  0.32444265916850773
Recall:  0.8037615683152552
F1 Score:  0.4622825091575092


Unnamed: 0,target_0,target_1
target_0,146266,3944
target_1,33636,16154


### Entrenamiento con set undersampled

In [127]:
# Build the feature matrix and the label vector from the undersampled set.
X = train_undersampled.drop(["ID_code", "target"], axis=1).values
Y = train_undersampled.target.values

# Hold out 25% of the rows for validation (75/25 split). test_size is spelled
# out because train_test_split's default is 0.25, not one third.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Gaussian Naive Bayes with default hyperparameters.
exported_pipeline = GaussianNB()

exported_pipeline.fit(train_X, train_Y)
results = exported_pipeline.predict(test_X)

In [128]:
# Hold-out metrics (sklearn's default positive label, 1), then the confusion matrix.
print("Precision: ", precision_score(y_true=test_Y, y_pred=results))
print("Recall: ", recall_score(y_true=test_Y, y_pred=results))
print("F1 Score: ", f1_score(y_true=test_Y, y_pred=results))
get_confussion_matrix(test_Y, results)

Precision:  0.8081313839375125
Recall:  0.798535523451415
F1 Score:  0.8033047979295241


Unnamed: 0,target_0,target_1
target_0,4038,1018
target_1,958,4035


#### Resultados con set original

In [129]:
# Score the Naive Bayes model fitted on the undersampled split against the
# complete original training set.
# NOTE(review): if the undersampled file was derived from train.csv, part of
# these rows were also used for fitting, so the figures may be optimistic;
# confirm how the resampled files were built.
results = exported_pipeline.predict(prediction_data)
print("Precision: ", precision_score(y_true=prediction_target, y_pred=results))
print("Recall: ", recall_score(y_true=prediction_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=prediction_target, y_pred=results))
get_confussion_matrix(prediction_target, results)

Precision:  0.323768599024116
Recall:  0.8022688824758683
F1 Score:  0.4613513784174308


Unnamed: 0,target_0,target_1
target_0,146225,3974
target_1,33677,16124


### Entrenamiento con set original

In [154]:
# Build the feature matrix and the label vector from the original set.
X = train.drop(["ID_code", "target"], axis=1).values
Y = train.target.values

# Hold out 25% of the rows for validation (75/25 split). test_size is spelled
# out because train_test_split's default is 0.25, not one third.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Gaussian Naive Bayes with default hyperparameters.
exported_pipeline = GaussianNB()

exported_pipeline.fit(train_X, train_Y)
results = exported_pipeline.predict(test_X)

In [122]:
# Hold-out metrics (sklearn's default positive label, 1), then the confusion matrix.
print("Precision: ", precision_score(y_true=test_Y, y_pred=results))
print("Recall: ", recall_score(y_true=test_Y, y_pred=results))
print("F1 Score: ", f1_score(y_true=test_Y, y_pred=results))
get_confussion_matrix(test_Y, results)

Precision:  0.7168445121951219
Recall:  0.3673828125
F1 Score:  0.4857954545454545


Unnamed: 0,target_0,target_1
target_0,44137,3239
target_1,743,1881


## RandomForest

### Entrenamiento con set oversampled

In [145]:
# Feature matrix (every column except the label and the row id) from the
# oversampled set, split with a fixed seed.
features = train_oversampled.drop(['target', 'ID_code'], axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, train_oversampled['target'].values, random_state=42)

# random_state pins the forest's internal randomness so that reruns give the
# same model; the recorded outputs came from an unseeded run and will not be
# matched exactly.
forest = RandomForestClassifier(max_depth=15, n_estimators=15, min_samples_leaf=25,
                                n_jobs=-1, random_state=42)

forest.fit(training_features, training_target)
results = forest.predict(testing_features)

In [146]:
# Hold-out metrics (sklearn's default positive label, 1), then the confusion matrix.
print("Precision: ", precision_score(y_true=testing_target, y_pred=results))
print("Recall: ", recall_score(y_true=testing_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=testing_target, y_pred=results))
get_confussion_matrix(testing_target, results)

Precision:  0.8809404583065074
Recall:  0.8224948237860944
F1 Score:  0.8507149929767196


Unnamed: 0,target_0,target_1
target_0,40041,7973
target_1,4993,36944


#### Resultados con el set original

In [147]:
# Score the forest fitted on the oversampled split against the complete
# original training set.
# NOTE(review): if the oversampled file was derived from train.csv, these rows
# overlap with the data the forest was fitted on, so the figures may be
# optimistic; confirm how the resampled files were built.
results = forest.predict(prediction_data)
print("Precision: ", precision_score(y_true=prediction_target, y_pred=results))
print("Recall: ", recall_score(y_true=prediction_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=prediction_target, y_pred=results))
get_confussion_matrix(prediction_target, results)

Precision:  0.5708136772149719
Recall:  0.824808438650612
F1 Score:  0.6746983048088077


Unnamed: 0,target_0,target_1
target_0,167438,3521
target_1,12464,16577


### Entrenamiento con el set undersampled

In [160]:
# Feature matrix (every column except the label and the row id) from the
# undersampled set, split with a fixed seed.
features = train_undersampled.drop(['target', 'ID_code'], axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, train_undersampled['target'].values, random_state=42)

# random_state pins the forest's internal randomness so that reruns give the
# same model; the recorded outputs came from an unseeded run and will not be
# matched exactly.
forest = RandomForestClassifier(max_depth=15, n_estimators=15, min_samples_leaf=25,
                                n_jobs=-1, random_state=42)

forest.fit(training_features, training_target)
results = forest.predict(testing_features)

In [161]:
# Hold-out metrics (sklearn's default positive label, 1), then the confusion matrix.
print("Precision: ", precision_score(y_true=testing_target, y_pred=results))
print("Recall: ", recall_score(y_true=testing_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=testing_target, y_pred=results))
get_confussion_matrix(testing_target, results)

Precision:  0.7139971722884266
Recall:  0.69958440530378
F1 Score:  0.70671731307477


Unnamed: 0,target_0,target_1
target_0,3580,1518
target_1,1416,3535


#### Resultados con el set original

In [162]:
# Score the forest fitted on the undersampled split against the complete
# original training set.
# NOTE(review): if the undersampled file was derived from train.csv, part of
# these rows were also used for fitting, so the figures may be optimistic;
# confirm how the resampled files were built.
results = forest.predict(prediction_data)
print("Precision: ", precision_score(y_true=prediction_target, y_pred=results))
print("Recall: ", recall_score(y_true=prediction_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=prediction_target, y_pred=results))
get_confussion_matrix(prediction_target, results)

Precision:  0.2511822939655716
Recall:  0.7928152054930839
F1 Score:  0.3814973543706754


Unnamed: 0,target_0,target_1
target_0,132400,4164
target_1,47502,15934


### Entrenamiento con set de datos original

In [13]:
# Feature matrix (every column except the label and the row id) from the
# original set, split with a fixed seed.
features = train.drop(['target', 'ID_code'], axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, train['target'].values, random_state=42)

# random_state pins the forest's internal randomness so that reruns give the
# same model; the recorded outputs came from an unseeded run and will not be
# matched exactly.
forest = RandomForestClassifier(max_depth=15, n_estimators=15, min_samples_leaf=25,
                                n_jobs=-1, random_state=42)

forest.fit(training_features, training_target)
results = forest.predict(testing_features)

In [15]:
# Metrics are computed with sklearn's default positive label (1). When the model
# predicts no positive samples, sklearn reports 0 for these metrics, which is
# what the recorded run shows.
# NOTE(review): the earlier comment here said the class labels were flipped
# before scoring, but no flip is applied in this cell; confirm whether
# flip_negative_positive_class was meant to be used.
print("Precision: ", precision_score(y_true=testing_target, y_pred=results))
print("Recall: ", recall_score(y_true=testing_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=testing_target, y_pred=results))
get_confussion_matrix(testing_target, results)

Precision:  0.0
Recall:  0.0
F1 Score:  0.0


Unnamed: 0,target_0,target_1
target_0,44880,5120
target_1,0,0


In [16]:
# Score the forest fitted on the original split against the complete original
# training set. prediction_data is built from the same `train` frame, so it
# includes the rows the forest was fitted on; this is not an independent
# evaluation.
results = forest.predict(prediction_data)
print("Precision: ", precision_score(y_true=prediction_target, y_pred=results))
print("Recall: ", recall_score(y_true=prediction_target, y_pred=results))
print("F1 Score: ", f1_score(y_true=prediction_target, y_pred=results))
get_confussion_matrix(prediction_target, results)

Precision:  1.0
Recall:  0.0003980495571698676
F1 Score:  0.0007957823535263105


Unnamed: 0,target_0,target_1
target_0,179902,20090
target_1,0,8


# Conclusión

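Podemos ver que, aunque TPOT haya elegido Naïve Bayes como el mejor modelo, el que logró el mejor equilibrio entre precisión y recall sobre el set original fue RandomForest entrenado con el set oversampled, que presentó una precisión de 0.57 y un recall de 0.82 (F1 de 0.67). Cabe aclarar que, si los sets balanceados se construyeron a partir de `train.csv`, esta evaluación incluye registros vistos durante el entrenamiento, por lo que estas métricas pueden resultar optimistas.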