# Modelos de Machine Learning - Clasificación 

## Entender el caso de Negocio

Se desea saber que tan probable es que los clientes de una entidad financiera dejen de usar sus productos o servicios (attrition). Para esto se tiene un Dataset con información relacionada al comportamiento de los clientes teniendo en cuenta un mes de referencia y un indicador de attrition.

El campo attrition se categoriza en:

Attrition (1)

No es attrition (0)

Para generar tus soluciones tendrás que poner en práctica tus conocimientos en desarrollo de modelos predictivos

In [175]:
!pip install xgboost
!pip install lightgbm
!pip install catboost



## Librerías

In [176]:
# Import necessary libs

import os
import random as rnd
import pandas as pd
import numpy as np
import itertools
import gc
import networkx as nx
from sklearn import metrics
import warnings
from sklearn import preprocessing
warnings.filterwarnings('ignore')

## Modelos de Machine Learning
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn import tree

## Métricas de los modelos
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

## Selección de Variables
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

## Validación Cruzada
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Common seed value to be used whenever required
seed = 16
np.random.seed(seed)

In [177]:
pd.options.display.max_columns = 300
pd.options.display.max_rows = 300

### Leer los data set del caso de Negocio

In [178]:
## data 
dataset = pd.read_csv('train_clientes.csv',sep=';')
dataset['ID_CORRELATIVO'] =   dataset['ID_CORRELATIVO'].apply(lambda x: '{0:0>7}'.format(x))
dataset = dataset.sort_values(['ID_CORRELATIVO'])

## data requerimientos

dataset_req = pd.read_csv('train_requerimientos.csv',sep=';')
dataset_req['ID_CORRELATIVO'] =   dataset_req['ID_CORRELATIVO'].apply(lambda x: '{0:0>7}'.format(x))
dataset_req = dataset_req.sort_values(['ID_CORRELATIVO'])
dataset.head()

Unnamed: 0,ID_CORRELATIVO,CODMES,FLG_BANCARIZADO,RANG_INGRESO,FLAG_LIMA_PROVINCIA,EDAD,ANTIGUEDAD,ATTRITION,RANG_SDO_PASIVO_MENOS0,SDO_ACTIVO_MENOS0,SDO_ACTIVO_MENOS1,SDO_ACTIVO_MENOS2,SDO_ACTIVO_MENOS3,SDO_ACTIVO_MENOS4,SDO_ACTIVO_MENOS5,FLG_SEGURO_MENOS0,FLG_SEGURO_MENOS1,FLG_SEGURO_MENOS2,FLG_SEGURO_MENOS3,FLG_SEGURO_MENOS4,FLG_SEGURO_MENOS5,RANG_NRO_PRODUCTOS_MENOS0,FLG_NOMINA,NRO_ACCES_CANAL1_MENOS0,NRO_ACCES_CANAL1_MENOS1,NRO_ACCES_CANAL1_MENOS2,NRO_ACCES_CANAL1_MENOS3,NRO_ACCES_CANAL1_MENOS4,NRO_ACCES_CANAL1_MENOS5,NRO_ACCES_CANAL2_MENOS0,NRO_ACCES_CANAL2_MENOS1,NRO_ACCES_CANAL2_MENOS2,NRO_ACCES_CANAL2_MENOS3,NRO_ACCES_CANAL2_MENOS4,NRO_ACCES_CANAL2_MENOS5,NRO_ACCES_CANAL3_MENOS0,NRO_ACCES_CANAL3_MENOS1,NRO_ACCES_CANAL3_MENOS2,NRO_ACCES_CANAL3_MENOS3,NRO_ACCES_CANAL3_MENOS4,NRO_ACCES_CANAL3_MENOS5,NRO_ENTID_SSFF_MENOS0,NRO_ENTID_SSFF_MENOS1,NRO_ENTID_SSFF_MENOS2,NRO_ENTID_SSFF_MENOS3,NRO_ENTID_SSFF_MENOS4,NRO_ENTID_SSFF_MENOS5,FLG_SDO_OTSSFF_MENOS0,FLG_SDO_OTSSFF_MENOS1,FLG_SDO_OTSSFF_MENOS2,FLG_SDO_OTSSFF_MENOS3,FLG_SDO_OTSSFF_MENOS4,FLG_SDO_OTSSFF_MENOS5
50778,1,201208,1,Rang_ingreso_02,Lima,,0.0,1,Rango_SDO_01,0,0,0,0,0,0,0,0,0,0,0,0,Rango_02,1,1,0,0,0,0,0,0,0,0,0,0,0,9,27,16,26,5,0,0,0,0,0,0,0,0,0,0,0,0,0
68257,2,201208,1,Rang_ingreso_01,Provincia,27.0,1.0,0,Rango_SDO_01,0,0,0,0,0,0,1,1,1,1,1,1,Rango_03,0,0,0,0,0,2,1,1,4,3,9,5,0,1,0,0,0,4,0,4,4,4,4,4,4,1,1,1,1,1,1
53623,3,201208,0,Rang_ingreso_01,Provincia,34.0,0.0,1,Rango_SDO_01,0,0,0,0,0,0,0,0,0,0,0,0,Rango_02,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
45001,4,201208,1,Rang_ingreso_06,Lima,25.0,3.0,0,Rango_SDO_02,0,0,0,0,0,0,1,1,0,0,0,0,Rango_04,1,6,4,1,4,2,1,24,23,17,24,19,16,15,22,25,25,24,17,1,1,0,0,0,0,0,0,0,0,0,0
53741,5,201208,0,Rang_ingreso_01,,,0.0,0,Rango_SDO_02,0,0,0,0,0,0,0,0,0,0,0,0,Rango_02,1,0,0,0,0,0,0,0,0,0,0,0,0,15,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [179]:
dataset.CODMES.unique()

array([201208], dtype=int64)

In [180]:
dataset.ID_CORRELATIVO.nunique()

70000

In [181]:
dataset_req.head()

Unnamed: 0,ID_CORRELATIVO,TIPO_REQUERIMIENTO2,DICTAMEN,CODMES,PRODUCTO_SERVICIO_2,SUBMOTIVO_2
31641,1,Solicitud,PROCEDE TOTAL,201205,Producto 07,Submotivo 144
24276,2,Reclamo,PROCEDE TOTAL,201203,Producto 20,Submotivo 125
23891,4,Solicitud,PROCEDE TOTAL,201208,Producto 20,Submotivo 144
23895,4,Reclamo,NO PROCEDE,201204,Producto 18,Submotivo 125
23894,4,Reclamo,NO PROCEDE,201205,Producto 18,Submotivo 125


In [182]:
dataset_req.shape

(51417, 6)

In [183]:
dataset_req.ID_CORRELATIVO.nunique()

35026

In [184]:
sorted(dataset_req.CODMES.unique())

[201203, 201204, 201205, 201206, 201207, 201208]

In [185]:
dataset.shape

(70000, 53)

## Análisis Exploratorio de Datos (EDA)


In [186]:
# Type of variables and dataset information.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 50778 to 63754
Data columns (total 53 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_CORRELATIVO             70000 non-null  object 
 1   CODMES                     70000 non-null  int64  
 2   FLG_BANCARIZADO            70000 non-null  int64  
 3   RANG_INGRESO               60583 non-null  object 
 4   FLAG_LIMA_PROVINCIA        66614 non-null  object 
 5   EDAD                       64674 non-null  float64
 6   ANTIGUEDAD                 68238 non-null  float64
 7   ATTRITION                  70000 non-null  int64  
 8   RANG_SDO_PASIVO_MENOS0     70000 non-null  object 
 9   SDO_ACTIVO_MENOS0          70000 non-null  int64  
 10  SDO_ACTIVO_MENOS1          70000 non-null  int64  
 11  SDO_ACTIVO_MENOS2          70000 non-null  int64  
 12  SDO_ACTIVO_MENOS3          70000 non-null  int64  
 13  SDO_ACTIVO_MENOS4          70000 non-null  int6

In [187]:
# Dataset characteristics
print("Number of instances in dataset = {}".format(dataset.shape[0]))
print("Total number of columns = {}".format(dataset.columns.shape[0]))
print("Column wise count of null values:-")
print(dataset.isnull().sum())

Number of instances in dataset = 70000
Total number of columns = 53
Column wise count of null values:-
ID_CORRELATIVO                  0
CODMES                          0
FLG_BANCARIZADO                 0
RANG_INGRESO                 9417
FLAG_LIMA_PROVINCIA          3386
EDAD                         5326
ANTIGUEDAD                   1762
ATTRITION                       0
RANG_SDO_PASIVO_MENOS0          0
SDO_ACTIVO_MENOS0               0
SDO_ACTIVO_MENOS1               0
SDO_ACTIVO_MENOS2               0
SDO_ACTIVO_MENOS3               0
SDO_ACTIVO_MENOS4               0
SDO_ACTIVO_MENOS5               0
FLG_SEGURO_MENOS0               0
FLG_SEGURO_MENOS1               0
FLG_SEGURO_MENOS2               0
FLG_SEGURO_MENOS3               0
FLG_SEGURO_MENOS4               0
FLG_SEGURO_MENOS5               0
RANG_NRO_PRODUCTOS_MENOS0       0
FLG_NOMINA                      0
NRO_ACCES_CANAL1_MENOS0         0
NRO_ACCES_CANAL1_MENOS1         0
NRO_ACCES_CANAL1_MENOS2         0
NRO_ACCES_CAN

### Columnas con análisis Estadístico

In [188]:
# Columns for age and antiquity
var_1 = ["EDAD", "ANTIGUEDAD"]

# Columns for SDO_ACTIVO
SDO_ACTIVO = ["SDO_ACTIVO_MENOS0", "SDO_ACTIVO_MENOS1", "SDO_ACTIVO_MENOS2", "SDO_ACTIVO_MENOS3", "SDO_ACTIVO_MENOS4","SDO_ACTIVO_MENOS5"]

# Columns for NRO_ENTID_SSFF_MENOS
NRO_ENTID_SSFF_MENOS = ["NRO_ENTID_SSFF_MENOS0", "NRO_ENTID_SSFF_MENOS1", "NRO_ENTID_SSFF_MENOS2", "NRO_ENTID_SSFF_MENOS3", "NRO_ENTID_SSFF_MENOS4", "NRO_ENTID_SSFF_MENOS5"]

# Columns for NRO_ACCES_CANAL
NRO_ACCES_CANAL = ["NRO_ACCES_CANAL1_MENOS0","NRO_ACCES_CANAL1_MENOS1","NRO_ACCES_CANAL1_MENOS2","NRO_ACCES_CANAL1_MENOS3","NRO_ACCES_CANAL1_MENOS4","NRO_ACCES_CANAL1_MENOS5",
                  "NRO_ACCES_CANAL2_MENOS0","NRO_ACCES_CANAL2_MENOS1","NRO_ACCES_CANAL2_MENOS2","NRO_ACCES_CANAL2_MENOS3","NRO_ACCES_CANAL2_MENOS4","NRO_ACCES_CANAL2_MENOS5",
                  "NRO_ACCES_CANAL3_MENOS0","NRO_ACCES_CANAL3_MENOS1","NRO_ACCES_CANAL3_MENOS2","NRO_ACCES_CANAL3_MENOS3","NRO_ACCES_CANAL3_MENOS4","NRO_ACCES_CANAL3_MENOS5"]

NRO_ACCES_CANAL1 = ["NRO_ACCES_CANAL1_MENOS0","NRO_ACCES_CANAL1_MENOS1","NRO_ACCES_CANAL1_MENOS2","NRO_ACCES_CANAL1_MENOS3","NRO_ACCES_CANAL1_MENOS4","NRO_ACCES_CANAL1_MENOS5"]
                    
NRO_ACCES_CANAL2 = ["NRO_ACCES_CANAL2_MENOS0","NRO_ACCES_CANAL2_MENOS1","NRO_ACCES_CANAL2_MENOS2","NRO_ACCES_CANAL2_MENOS3","NRO_ACCES_CANAL2_MENOS4","NRO_ACCES_CANAL2_MENOS5"]
NRO_ACCES_CANAL3 = ["NRO_ACCES_CANAL3_MENOS0","NRO_ACCES_CANAL3_MENOS1","NRO_ACCES_CANAL3_MENOS2","NRO_ACCES_CANAL3_MENOS3","NRO_ACCES_CANAL3_MENOS4","NRO_ACCES_CANAL3_MENOS5"]


target = ["ATTRITION"]

In [189]:
dataset[var_1].describe()

Unnamed: 0,EDAD,ANTIGUEDAD
count,64674.0,68238.0
mean,35.232474,2.549283
std,11.536038,3.205424
min,18.0,0.0
25%,27.0,0.0
50%,32.0,0.0
75%,41.0,5.0
max,98.0,23.0


In [190]:
dataset[NRO_ENTID_SSFF_MENOS].describe()

Unnamed: 0,NRO_ENTID_SSFF_MENOS0,NRO_ENTID_SSFF_MENOS1,NRO_ENTID_SSFF_MENOS2,NRO_ENTID_SSFF_MENOS3,NRO_ENTID_SSFF_MENOS4,NRO_ENTID_SSFF_MENOS5
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,2.030643,1.999243,1.968014,1.928971,1.8736,1.854386
std,1.932397,1.920776,1.907792,1.889,1.869672,1.853515
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,2.0,2.0,2.0,1.0,1.0
75%,3.0,3.0,3.0,3.0,3.0,3.0
max,11.0,11.0,11.0,11.0,11.0,11.0


In [191]:
dataset[NRO_ACCES_CANAL].describe()

Unnamed: 0,NRO_ACCES_CANAL1_MENOS0,NRO_ACCES_CANAL1_MENOS1,NRO_ACCES_CANAL1_MENOS2,NRO_ACCES_CANAL1_MENOS3,NRO_ACCES_CANAL1_MENOS4,NRO_ACCES_CANAL1_MENOS5,NRO_ACCES_CANAL2_MENOS0,NRO_ACCES_CANAL2_MENOS1,NRO_ACCES_CANAL2_MENOS2,NRO_ACCES_CANAL2_MENOS3,NRO_ACCES_CANAL2_MENOS4,NRO_ACCES_CANAL2_MENOS5,NRO_ACCES_CANAL3_MENOS0,NRO_ACCES_CANAL3_MENOS1,NRO_ACCES_CANAL3_MENOS2,NRO_ACCES_CANAL3_MENOS3,NRO_ACCES_CANAL3_MENOS4,NRO_ACCES_CANAL3_MENOS5
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,1.035086,1.020943,0.8671,0.891743,0.813557,0.8263,1.786986,1.739486,1.318186,1.2553,1.120829,1.1238,5.558,5.830871,4.551771,4.1462,3.565257,3.550743
std,2.746253,2.749709,2.512787,2.649147,2.514842,2.545633,4.52878,4.614673,3.92947,3.974264,3.732211,3.777387,7.776502,8.254698,7.2032,7.308573,7.179745,7.331213
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,9.0,7.0,6.0,4.0,4.0
max,31.0,31.0,30.0,31.0,30.0,30.0,31.0,31.0,30.0,31.0,30.0,31.0,245.0,220.0,163.0,121.0,273.0,163.0


In [192]:
len(dataset['ATTRITION'])

70000

In [193]:
print(dataset['ATTRITION'].value_counts())
print(dataset['ATTRITION'].value_counts(normalize=True))


ATTRITION
0    59178
1    10822
Name: count, dtype: int64
ATTRITION
0    0.8454
1    0.1546
Name: proportion, dtype: float64


In [194]:
print(dataset['FLAG_LIMA_PROVINCIA'].value_counts())
print(dataset['FLAG_LIMA_PROVINCIA'].value_counts(normalize=True))

FLAG_LIMA_PROVINCIA
Lima         41637
Provincia    24977
Name: count, dtype: int64
FLAG_LIMA_PROVINCIA
Lima         0.625049
Provincia    0.374951
Name: proportion, dtype: float64


## Vizualización de los Datos

In [195]:
sns.set() 

## Histograma de Edad con respecto al Target

In [196]:
g = sns.FacetGrid(dataset, col='ATTRITION',size=5, aspect=1.2)
g.map(plt.hist, 'EDAD', bins=10)
plt.show()
g.savefig("archivo.png")

TypeError: FacetGrid.__init__() got an unexpected keyword argument 'size'

## Boxplot de la Edad por cada Rango de Ingreso con respecto al Target

In [None]:
# Grouped boxplot
plt.figure(figsize=(16,8))
sns.boxplot(x="RANG_INGRESO", y="EDAD", hue="ATTRITION", data=dataset, palette="Set1")
plt.show()

## Gráfico de Dispersión de los NRO_ACCES_CANAL3

In [None]:
# Acortando el nombre de las variables
dataset_NRO_ACCES_CANAL3 = dataset[NRO_ACCES_CANAL3]
dataset_NRO_ACCES_CANAL3.columns = ['NRO_ACCES_CANAL3.0', 'NRO_ACCES_CANAL3.1',
       'NRO_ACCES_CANAL3.2', 'NRO_ACCES_CANAL3.3',
       'NRO_ACCES_CANAL3.4', 'NRO_ACCES_CANAL3.5']

In [None]:
sns.pairplot(dataset_NRO_ACCES_CANAL3); ### Gráfico de Dispersión


## Gráfico de Dispersión de Edad vs la Antiguedad por cada FLAG_LIMA_PROVINCIA respecto al Target

In [None]:
plt.figure(figsize=(16,8))
g = sns.FacetGrid(dataset, col="FLAG_LIMA_PROVINCIA", hue="ATTRITION",size=7)
g.map(plt.scatter, "EDAD","ANTIGUEDAD" ,alpha=.9)
g.add_legend();
#g.savefig("archivo1.png")

## Gráfico de Temporalidad de los último 6 meses de NRO_ACCES_CANAL1

In [None]:
df = pd.DataFrame(dataset[NRO_ACCES_CANAL1].mean().reset_index(),)
df.rename(columns={'index': 'CodMes', 0 : 'Promedio de NRO_ACCES_CANAL1'},inplace=True)
df

In [None]:
plt.figure(figsize=(20,12))
sns.lineplot(x='CodMes',y='Promedio de NRO_ACCES_CANAL1', data=df)

## Grafíco de calor y tabla de la correlación de Pearson 

In [None]:
dataset[NRO_ACCES_CANAL2].corr(method= 'pearson')

In [None]:
# Grouped boxplot
plt.figure(figsize=(16,8))
corr_matrix = dataset[NRO_ACCES_CANAL2].corr()
sns.heatmap(corr_matrix,annot=True, fmt=".1f") ## Gráfico de calor para las correlaciones

## Analizamos el Dataset de Requerimientos

In [None]:
dataset_req.isnull().sum()

In [None]:
dataset_req

## Feature Engineering de la Tabla Requerimientos

In [None]:
dataset_req.pivot_table(index='ID_CORRELATIVO',columns=['CODMES'],aggfunc='count').fillna(0)

### Tratamiento de valores Nulos



    Escenario en Variables continuas: Promedio
	Escenario en Variables enteros : Mediana
	Escenario en Variables categóricas : Moda 


In [None]:
dataset['EDAD'].isnull().sum()

In [None]:
dataset['EDAD'].mean()

In [None]:
dataset['EDAD'].median()

In [None]:
dataset['EDAD'].head() ##La  variable original

In [None]:
dataset['EDAD'].fillna(dataset['EDAD'].mean()).head() # Reemplazando la media

In [None]:
dataset['EDAD'].fillna(dataset['EDAD'].mode()[0]).head() # Reemplazando la mediana


### Para variables catregóricas aplicamos variables dummys

In [None]:
x = pd.get_dummies(dataset, columns=['RANG_INGRESO','FLAG_LIMA_PROVINCIA','RANG_SDO_PASIVO_MENOS0','RANG_NRO_PRODUCTOS_MENOS0']) 



In [None]:
x.info()

In [None]:
x.head()

### Procesamiento de datos

In [None]:
## selection of category variables
target = 'ATTRITION'
exclude = ['ID_CORRELATIVO','CODMES']

cols = [x for x in dataset.columns if x not in exclude + [target]]
cols_cat = dataset[cols].select_dtypes(['object']).columns.tolist()
index_categorical=[cols.index(x) for x in cols_cat]
print(exclude, '\n\n', index_categorical,cols_cat)


## For Training

for i in cols_cat:
    le = preprocessing.LabelEncoder()
    le.fit(list(dataset[i].dropna()))
    dataset.loc[~dataset[i].isnull(),i]=le.transform(dataset[i].dropna())

dataset[cols].head()

### Primeras inputaciones

In [None]:
### Imputation of the empty data
dataset['EDAD'] = dataset['EDAD'].fillna(dataset['EDAD'].mean())
dataset['ANTIGUEDAD'] = dataset['ANTIGUEDAD'].fillna(dataset['ANTIGUEDAD'].mean())
dataset['RANG_INGRESO'] = dataset['RANG_INGRESO'].fillna(dataset['RANG_INGRESO'].mode()[0])
dataset['FLAG_LIMA_PROVINCIA'] = dataset['FLAG_LIMA_PROVINCIA'].fillna(dataset['FLAG_LIMA_PROVINCIA'].mode()[0])

In [None]:
dataset.isnull().sum().sum()

### Logistic Regression

In [None]:
# Prepare the data
from sklearn.model_selection import train_test_split
X,y = dataset.drop(['ID_CORRELATIVO', 'CODMES','ATTRITION'], axis=1),dataset[["ATTRITION"]]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25, random_state=1)

# Initialize and fit the model
benchmark_model = LogisticRegression()
benchmark_model.fit(X_train, y_train)


# Separate the features and the target variable
predict_train_rl = benchmark_model.predict_proba(X_train)[:,1]
predict_test_rl = benchmark_model.predict_proba(X_test)[:,1]


# Print scores on both  predict_proba(best_X_train)[:,1]
print("auc on training in LogisticRegression data : {:.3f}".format(roc_auc_score(y_train, predict_train_rl) ))
print("auc on testing in LogisticRegression  data : {:.3f}".format(roc_auc_score(y_test, predict_test_rl) ))

### Decision Tree

In [None]:
# Prepare the data
from sklearn.model_selection import train_test_split
X,y = dataset.drop(['ID_CORRELATIVO', 'CODMES','ATTRITION'], axis=1),dataset[["ATTRITION"]]

X_train, X_test, y_train, y_test= train_test_split(X, y,test_size=0.25, random_state=1)

# Initialize and fit the model
benchmark_model = tree.DecisionTreeClassifier()
benchmark_model.fit(X_train, y_train)


# Separate the features and the target variable
predict_train_dt = benchmark_model.predict_proba(X_train)[:,1]
predict_test_dt = benchmark_model.predict_proba(X_test)[:,1]


# Print scores on both
print("auc on training in DecisionTree data : {:.3f}".format(roc_auc_score(y_train, predict_train_dt) ))
print("auc on testing in DecisionTree  data : {:.3f}".format(roc_auc_score(y_test, predict_test_dt) ))

### Feature Engineering con toda la data completa

In [None]:
## data 
dataset = pd.read_csv('train_clientes.csv',sep=';')
dataset['ID_CORRELATIVO'] =   dataset['ID_CORRELATIVO'].apply(lambda x: '{0:0>7}'.format(x))
dataset = dataset.sort_values(['ID_CORRELATIVO'])

## data requerimientos

dataset_req = pd.read_csv('train_requerimientos.csv',sep=';')
dataset_req['ID_CORRELATIVO'] =   dataset_req['ID_CORRELATIVO'].apply(lambda x: '{0:0>7}'.format(x))
dataset_req = dataset_req.sort_values(['ID_CORRELATIVO'])
dataset.head()

In [None]:
## Create the sum of zeros and nulls for each row without taking into account the target.
dataset_ = dataset.drop(labels="ATTRITION", axis=1)
#dataset.head()
dataset['Sum_null'] = dataset.apply(lambda x: x.isnull().sum(), axis=1)
dataset['Sum_zero'] = (dataset_ == 0).astype(int).sum(axis=1)


## create of new feature in train:
## sum
dataset["SUM_SDO_ACTIVO_MENOS"] = dataset["SDO_ACTIVO_MENOS0"]+dataset["SDO_ACTIVO_MENOS1"]+dataset["SDO_ACTIVO_MENOS2"]+dataset["SDO_ACTIVO_MENOS3"]+dataset["SDO_ACTIVO_MENOS4"]+dataset["SDO_ACTIVO_MENOS5"]
dataset["SUM_FLG_SEGURO_MENOS"] = dataset["FLG_SEGURO_MENOS0"]+dataset["FLG_SEGURO_MENOS1"]+dataset["FLG_SEGURO_MENOS2"]+dataset["FLG_SEGURO_MENOS3"]+dataset["FLG_SEGURO_MENOS4"]+dataset["FLG_SEGURO_MENOS5"]     
dataset["SUM_NRO_ACCES_CANAL1_MENOS"] = dataset["NRO_ACCES_CANAL1_MENOS0"]+dataset["NRO_ACCES_CANAL1_MENOS1"]+dataset["NRO_ACCES_CANAL1_MENOS2"]+dataset["NRO_ACCES_CANAL1_MENOS3"]+dataset["NRO_ACCES_CANAL1_MENOS4"]+dataset["NRO_ACCES_CANAL1_MENOS5"]  
dataset["SUM_NRO_ACCES_CANAL2_MENOS"] = dataset["NRO_ACCES_CANAL2_MENOS0"]+dataset["NRO_ACCES_CANAL2_MENOS1"]+dataset["NRO_ACCES_CANAL2_MENOS2"]+dataset["NRO_ACCES_CANAL2_MENOS3"]+dataset["NRO_ACCES_CANAL2_MENOS4"]+dataset["NRO_ACCES_CANAL2_MENOS5"]
dataset["SUM_NRO_ACCES_CANAL3_MENOS"] = dataset["NRO_ACCES_CANAL3_MENOS0"]+dataset["NRO_ACCES_CANAL3_MENOS1"]+dataset["NRO_ACCES_CANAL3_MENOS2"]+dataset["NRO_ACCES_CANAL3_MENOS3"]+dataset["NRO_ACCES_CANAL3_MENOS4"]+dataset["NRO_ACCES_CANAL3_MENOS5"]
dataset["SUM_NRO_ENTID_SSFF_MENOS"] = dataset["NRO_ENTID_SSFF_MENOS0"]+dataset["NRO_ENTID_SSFF_MENOS1"]+dataset["NRO_ENTID_SSFF_MENOS2"]+dataset["NRO_ENTID_SSFF_MENOS3"]+dataset["NRO_ENTID_SSFF_MENOS4"]+dataset["NRO_ENTID_SSFF_MENOS5"]
dataset["SUM_FLG_SDO_OTSSFF_MENOS"] = dataset["FLG_SDO_OTSSFF_MENOS0"]+dataset["FLG_SDO_OTSSFF_MENOS1"]+dataset["FLG_SDO_OTSSFF_MENOS2"]+dataset["FLG_SDO_OTSSFF_MENOS3"]+dataset["FLG_SDO_OTSSFF_MENOS4"]+dataset["FLG_SDO_OTSSFF_MENOS5"]

#medians and means

dataset["Median_NRO_ACCES_CANAL1_MENOS"]  = dataset[["NRO_ACCES_CANAL1_MENOS0","NRO_ACCES_CANAL1_MENOS1","NRO_ACCES_CANAL1_MENOS2","NRO_ACCES_CANAL1_MENOS3","NRO_ACCES_CANAL1_MENOS4","NRO_ACCES_CANAL1_MENOS5"]].median(axis=1)
dataset["Median_NRO_ACCES_CANAL2_MENOS"]  = dataset[["NRO_ACCES_CANAL2_MENOS0","NRO_ACCES_CANAL2_MENOS1","NRO_ACCES_CANAL2_MENOS2","NRO_ACCES_CANAL2_MENOS3","NRO_ACCES_CANAL2_MENOS4","NRO_ACCES_CANAL2_MENOS5"]].median(axis=1)
dataset["Median_NRO_ACCES_CANAL3_MENOS"]  = dataset[["NRO_ACCES_CANAL3_MENOS0","NRO_ACCES_CANAL3_MENOS1","NRO_ACCES_CANAL3_MENOS2","NRO_ACCES_CANAL3_MENOS3","NRO_ACCES_CANAL3_MENOS4","NRO_ACCES_CANAL3_MENOS5"]].median(axis=1)
dataset["Mean_NRO_ACCES_CANAL3_MENOS"]  = dataset[["NRO_ACCES_CANAL3_MENOS0","NRO_ACCES_CANAL3_MENOS1","NRO_ACCES_CANAL3_MENOS2","NRO_ACCES_CANAL3_MENOS3","NRO_ACCES_CANAL3_MENOS4","NRO_ACCES_CANAL3_MENOS5"]].mean(axis=1)
dataset["Mean_NRO_ACCES_CANAL_MENOS"] = dataset[["NRO_ACCES_CANAL1_MENOS0","NRO_ACCES_CANAL1_MENOS1","NRO_ACCES_CANAL1_MENOS2","NRO_ACCES_CANAL1_MENOS3","NRO_ACCES_CANAL1_MENOS4","NRO_ACCES_CANAL1_MENOS5","NRO_ACCES_CANAL2_MENOS0","NRO_ACCES_CANAL2_MENOS1","NRO_ACCES_CANAL2_MENOS2","NRO_ACCES_CANAL2_MENOS3","NRO_ACCES_CANAL2_MENOS4","NRO_ACCES_CANAL2_MENOS5","NRO_ACCES_CANAL3_MENOS0","NRO_ACCES_CANAL3_MENOS1","NRO_ACCES_CANAL3_MENOS2","NRO_ACCES_CANAL3_MENOS3","NRO_ACCES_CANAL3_MENOS4","NRO_ACCES_CANAL3_MENOS5"]].mean(axis=1)
dataset["Median_NRO_ACCES_CANAL_MENOS"] = dataset[["NRO_ACCES_CANAL1_MENOS0","NRO_ACCES_CANAL1_MENOS1","NRO_ACCES_CANAL1_MENOS2","NRO_ACCES_CANAL1_MENOS3","NRO_ACCES_CANAL1_MENOS4","NRO_ACCES_CANAL1_MENOS5","NRO_ACCES_CANAL2_MENOS0","NRO_ACCES_CANAL2_MENOS1","NRO_ACCES_CANAL2_MENOS2","NRO_ACCES_CANAL2_MENOS3","NRO_ACCES_CANAL2_MENOS4","NRO_ACCES_CANAL2_MENOS5","NRO_ACCES_CANAL3_MENOS0","NRO_ACCES_CANAL3_MENOS1","NRO_ACCES_CANAL3_MENOS2","NRO_ACCES_CANAL3_MENOS3","NRO_ACCES_CANAL3_MENOS4","NRO_ACCES_CANAL3_MENOS5"]].median(axis=1)


## selection of category variables
target = 'ATTRITION'
exclude = ['ID_CORRELATIVO','CODMES']

cols = [x for x in dataset.columns if x not in exclude + [target]]
cols_cat = dataset[cols].select_dtypes(['object']).columns.tolist()
index_categorical=[cols.index(x) for x in cols_cat]
print(exclude, '\n\n', index_categorical,cols_cat)


## For Training

for i in cols_cat:
    le = preprocessing.LabelEncoder()
    le.fit(list(dataset[i].dropna()))
    dataset.loc[~dataset[i].isnull(),i]=le.transform(dataset[i].dropna())


    
### Imputation of the empty data
dataset['EDAD'] = dataset['EDAD'].fillna(dataset['EDAD'].mean())
dataset['ANTIGUEDAD'] = dataset['ANTIGUEDAD'].fillna(dataset['ANTIGUEDAD'].mean())
dataset['EDAD*ANTIGUEDAD'] = dataset['EDAD']*dataset['ANTIGUEDAD']
dataset['RANG_INGRESO'] = dataset['RANG_INGRESO'].fillna(dataset['RANG_INGRESO'].mode()[0])
dataset['FLAG_LIMA_PROVINCIA'] = dataset['FLAG_LIMA_PROVINCIA'].fillna(dataset['FLAG_LIMA_PROVINCIA'].mode()[0])


### Pivot the requirement variables
dataset_req = dataset_req.pivot_table(index='ID_CORRELATIVO',columns=['CODMES'],aggfunc='count')

dataset_req = dataset_req.fillna(0)

## index
dataset_req.reset_index(inplace=True)


### Union of the dataset with the requirements variables

dataset = pd.merge(dataset, dataset_req, on =['ID_CORRELATIVO'],how='left')

## cerrar con ceros
dataset = dataset.fillna(0)

dataset.head()

In [None]:
dataset.isnull().sum().sum()

## Seleccion de variables

### Univariate feature selection


En la selección de características univariadas, usaremos SelectKBest que elimina todas las características, excepto las más altas Basandonos en la prueba de hipotesis chi-cuadrado. http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest


In [None]:
X,y = dataset.drop(['ID_CORRELATIVO', 'CODMES','ATTRITION'], axis=1),dataset[["ATTRITION"]]

In [None]:
print(X.shape,y.shape)

In [None]:
select_feature = SelectKBest(score_func=chi2, k=5).fit(X, y)

features = []
for feature in zip(X.columns, select_feature.scores_):
    features.append(feature)
    
features_total = pd.DataFrame(features).sort_values(1, ascending=False)
features_total.head(20)

### Recursive Feature Elimination

La eliminación de características recursivas (RFE) funciona mediante la eliminación recursiva de atributos y la construcción de un modelo en los atributos que permanecen.

Utiliza la precisión del modelo para identificar qué atributos (y combinación de atributos) contribuyen más a la predicción del atributo objetivo.

El siguiente ejemplo usa RFE con el algoritmo de regresión logística para seleccionar las 3 características principales. La elección del algoritmo no importa demasiado, siempre que sea hábil y consistente.

In [None]:
# Feature Extraction with RFE
model = LogisticRegression()
rfe = RFE(model, 20)
fit = rfe.fit(X, y)
print('Best feature by rfe:',X.columns[rfe.support_])

### Recursive feature elimination (RFE) with random forest

http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html Ahora no solo buscaremos las mejores características, sino que también encontraremos la cantidad de características que necesitamos para una mayor precisión.

In [None]:
# Create the RFE object and rank each pixel
'''
clf_rf_3 = RandomForestClassifier()      
rfe = RFE(estimator=clf_rf_3, n_features_to_select=20, step=1)
rfe = rfe.fit(X, y)

print('Best feature by rfe:',X.columns[rfe.support_])
'''

### Random Forest Features Importance

In [None]:
### selector de variables
rf_clf = RandomForestClassifier(n_estimators=200,random_state =123)

# Train the model
rf_clf.fit(X.values,y.values.ravel())

## ordenando  las mejores variables
features = []
for feature in zip(X.columns, rf_clf.feature_importances_):
    features.append(feature)
    
features_total = pd.DataFrame(features,columns=['Variables','Gain']).sort_values('Gain', ascending=False)

sfm = SelectFromModel(rf_clf, threshold=0.006)

# Train the selector
sfm.fit(X, y)

# Print the names of the most important features

variables = []
for feature_list_index in sfm.get_support(indices=True):
    variables.append(X.columns[feature_list_index])
    #variables
    
#features_total
variables

In [197]:
len(variables)

NameError: name 'variables' is not defined

In [None]:
len(features_total)

## Construcción de modelos de Machine Learning

### Selección de muestras de entrenamiento y validación 

In [None]:
# Prepare the data
X,y = dataset.drop(['ID_CORRELATIVO', 'CODMES','ATTRITION'], axis=1),dataset[["ATTRITION"]]
X_train, X_test, y_train, y_test= train_test_split(X, y,test_size=0.2, random_state=1)

In [None]:
## Usando la mejor selecciónde variables
best_X_train = X_train[variables]
best_X_test  = X_test[variables]

In [None]:
# Function to fit the classify and record its metrics
def pipeline(clas, X_train, y_train, X_test, y_test, **kwargs):
    # Dictionary to hold the properties
    clas_props = {}
    
    # Initialize and fit the classify 
    classify = clas(**kwargs)
    classify.fit(best_X_train, y_train)
    y_train_pre =  classify.predict_proba(best_X_train)[:,1]
    y_test_pre =  classify.predict_proba(best_X_test)[:,1]

    
    # Store the metrics for the classify
    clas_props["name"] = clas.__name__
    clas_props["train_score"] = roc_auc_score(y_train, y_train_pre)
    clas_props["test_score"] = roc_auc_score(y_test, y_test_pre)
    
    return clas_props

In [None]:
def execute_pipeline():
    # Create the list of algorithms
    classifys = [
        LogisticRegression,
        DecisionTreeClassifier,
        RandomForestClassifier,
        XGBClassifier,
        LGBMClassifier,
        CatBoostClassifier
    ]
    
    # To store the properties for each classify
    props = []
    
    """
    Iterate thorugh the list of classify,
    passing each thorugh the pipeline and
    storing its properites
    """ 
    for clas in classifys:
        properites = pipeline(clas, best_X_train, y_train, best_X_test, y_test)
        props.append(properites)
        
    return props

In [None]:
# Consolidate the properties into a DataFrame
def get_properties():
    # Obtain the properties after executing the pipeline
    properties = execute_pipeline()
    
    # Extract each individual property of the classify
    names = [prop["name"] for prop in properties]
    train_scores = [prop["train_score"] for prop in properties]
    test_scores = [prop["test_score"] for prop in properties]
    
    # Create a DataFrame from these properties
    df = pd.DataFrame(index=names, 
                    data = {
                            "Training scores auc": train_scores,
                            "Testing scores auc": test_scores,
                      }
                  )
    
    return df

In [None]:
properties = get_properties()
properties

In [None]:
# Plot to compare the performance of the algorithms on both datasets
ax = properties[["Training scores auc", "Testing scores auc"]].plot(kind="bar",title="Performance de los Modelos de Clasificación", figsize=(16, 8))
ax.set_ylabel("AUC Score", fontsize="large")

## Métricas de Evaluación para Clasificación para el  Modelo LightGBM

In [None]:
from time import time

# Initialize and fit the model
best_model = LGBMClassifier() 
best_model.fit(best_X_train, y_train)


# Separate the features and the target variable
predict_train_lg = best_model.predict_proba(best_X_train)[:,1]
predict_test_lg = best_model.predict_proba(best_X_test)[:,1]


# Print scores on both
print("auc o Roc on training in LGBMClassifier data : {:.3f}".format(roc_auc_score(y_train, predict_train_lg)))
print("auc o Roc on testing in LGBMClassifier  data : {:.3f}".format(roc_auc_score(y_test, predict_test_lg))) 

In [None]:
predict_lg_test = best_model.predict(best_X_test)
cm=confusion_matrix(y_test, predict_lg_test)
cm

In [None]:
predict_lg_test

In [None]:
## Metricas de Validación manual
predict_lg_test = best_model.predict(best_X_test)
cm=confusion_matrix(y_test, predict_lg_test)


TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]

ACC = accuracy_score(y_test, predict_lg_test)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
AUC = roc_auc_score(y_test, predict_test_lg)
GINI = 2*(AUC-0.5)

print("AUC : {:.3f}".format(AUC))
print("INDICE DE GINI : {:.3f}".format(GINI))
print("ACCURACY : {:.3f}".format(ACC))
print("SENSIBILIDAD O RECALL: {:.3f}".format(TPR))
print("ESPECIFICIDAD : {:.3f}".format(TPR))
print("PPV O PRECISION : {:.3f}".format(PPV))
print("NPV : {:.3f}".format(NPV))

In [None]:
best_model = LGBMClassifier() 
best_model.fit(best_X_train, y_train)

In [None]:
predict_lg_test = best_model.predict(best_X_test)

In [None]:
print(metrics.classification_report(y_test, predict_lg_test))

## Cross Validation and Optimización de Modelos 

In [None]:
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(best_model.get_params()) 

## Random Hyperparameter Grid

In [None]:
# Number of trees in random forest
iterations = [int(x) for x in np.linspace(start = 500, stop = 2500, num = 10)]
# Metrics eval
colsample_bytree = [0.6, 0.7, 0.8 , 0.9, 1]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3, 10, num = 5)]
max_depth.append(None)
# Method of selecting samples for training each tree
learning_rate = [0.05, 0.1, 0.15 ,0.2]
# Create the random grid
random_grid = {'iterations': iterations,
               'colsample_bytree': colsample_bytree,
               'max_depth': max_depth,
               'learning_rate': learning_rate}
pprint(random_grid)

In [None]:
best_X_train.columns

In [None]:
best_X= X[variables]
best_X.columns

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
lgb = LGBMClassifier()
# Random search of parameters, using 5 fold cross validation, 
# search across 50 different combinations, and use all available cores
lgb_random = RandomizedSearchCV(estimator = lgb, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=1416, n_jobs = -1)
# Fit the random search model
lgb_random.fit(best_X, y)

In [None]:
lgb_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels,predictions)
    print('Model Performance')
    print('Accuracy = {:.12g}%.'.format(accuracy))
    return accuracy

In [None]:
base_model = LGBMClassifier()
base_model.fit(best_X_train, y_train)
base_accuracy = evaluate(base_model, best_X_test, y_test)

best_random = lgb_random.best_estimator_
best_random.fit(best_X_train, y_train)
random_accuracy = evaluate(best_random, best_X_test, y_test)
print('Improvement of {:.12g}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

## Grid Search with Cross Validation

In [None]:
# Number of trees in random forest
iterations = [300, 400, 500, 600]
# Metrics eval
colsample_bytree = [0.7, 0.8, 0.9]
# Maximum number of levels in tree
max_depth = [3, 4, 6, 8]
# Method of selecting samples for training each tree
learning_rate = [0.1, 0.15 ,0.2]
# Create the random grid
param_grid = {'iterations': iterations,
               'colsample_bytree': colsample_bytree,
               'max_depth': max_depth,
               'learning_rate': learning_rate}
pprint(param_grid)

In [None]:
# Create a based model
lgb = LGBMClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = lgb, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 2)
# Fit the grid search to the data
grid_search.fit(best_X, y)
grid_search.best_params_

In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, best_X_test, y_test)
print('Improvement of {:.12g}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))