In [1]:
# path
import os

# Visualizaciones 
import plotly.express as px

# manejo de datos
import pandas as pd
import numpy as np

# Modelado
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

## Lectura de DF

In [2]:
# Cambiar directorio
os.chdir("..")

In [3]:
# Cargar los datos que tenemos disponibles
data: pd.DataFrame = pd.read_csv("data/churn.csv")


## EDA

Vemos las columnas y tipos de datos.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


Exploración de los datos

In [5]:
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Nos deshacemos de las columnas que no contribuyen en mucho
data_filtered: pd.DataFrame = data.drop(data.columns[0:3], axis=1)

No tenemos nulos.

In [7]:
data_filtered.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

## Escalado

In [8]:
scaler: StandardScaler = StandardScaler()

In [9]:
# elijo las columnas a escalar.
numeric_cols: list[str] = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [10]:
data_std: pd.DataFrame = pd.DataFrame(scaler.fit_transform(data_filtered[numeric_cols]), columns=numeric_cols)

In [11]:
data_std

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
0,-0.326221,0.293517,-1.041760,-1.225848,-0.911583,0.021886
1,-0.440036,0.198164,-1.387538,0.117350,-0.911583,0.216534
2,-1.536794,0.293517,1.032908,1.333053,2.527057,0.240687
3,0.501521,0.007457,-1.387538,-1.225848,0.807737,-0.108918
4,2.063884,0.388871,-1.041760,0.785728,-0.911583,-0.365276
...,...,...,...,...,...,...
9995,1.246488,0.007457,-0.004426,-1.225848,0.807737,-0.066419
9996,-1.391939,-0.373958,1.724464,-0.306379,-0.911583,0.027988
9997,0.604988,-0.278604,0.687130,-1.225848,-0.911583,-1.008643
9998,1.256835,0.293517,-0.695982,-0.022608,0.807737,-0.125231


Obtengo variables dummy de las categóricas.

In [12]:
data_dummies: pd.DataFrame = pd.get_dummies(data_filtered[['Geography','Gender']], 
                                            dtype=int)

In [13]:
data_dummies.sample(5)

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
2774,1,0,0,1,0
7670,1,0,0,1,0
493,1,0,0,1,0
8426,1,0,0,0,1
7102,1,0,0,0,1


Unifico df con los estandarizados y las dummies.

In [14]:
data_final: pd.DataFrame = pd.concat([data_std, data_dummies], axis=1)
data_final.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'EstimatedSalary', 'Geography_France', 'Geography_Germany',
       'Geography_Spain', 'Gender_Female', 'Gender_Male'],
      dtype='object')

Unifico df con las columnas que son binarias, las cuales no fueron estandarizadas ni trabajadas como categoricas.

In [15]:
data_final = pd.concat([data_filtered[['HasCrCard', 'IsActiveMember','Exited']], data_final], axis=1) 

In [16]:
data_final

Unnamed: 0,HasCrCard,IsActiveMember,Exited,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,1,1,1,-0.326221,0.293517,-1.041760,-1.225848,-0.911583,0.021886,1,0,0,1,0
1,0,1,0,-0.440036,0.198164,-1.387538,0.117350,-0.911583,0.216534,0,0,1,1,0
2,1,0,1,-1.536794,0.293517,1.032908,1.333053,2.527057,0.240687,1,0,0,1,0
3,0,0,0,0.501521,0.007457,-1.387538,-1.225848,0.807737,-0.108918,1,0,0,1,0
4,1,1,0,2.063884,0.388871,-1.041760,0.785728,-0.911583,-0.365276,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,0,1.246488,0.007457,-0.004426,-1.225848,0.807737,-0.066419,1,0,0,0,1
9996,1,1,0,-1.391939,-0.373958,1.724464,-0.306379,-0.911583,0.027988,1,0,0,0,1
9997,0,1,1,0.604988,-0.278604,0.687130,-1.225848,-0.911583,-1.008643,1,0,0,1,0
9998,1,0,1,1.256835,0.293517,-0.695982,-0.022608,0.807737,-0.125231,0,1,0,0,1


In [17]:
corr_matrix: np.ndarray = data_final.corr()

px.imshow(corr_matrix, title='Matriz de correlación', aspect='Auto', text_auto=True, zmin=-1, zmax=1)

Se observa que las variables no tienen correlaciones fuertes.

## Separación en test y train

In [18]:
# Generar los datos para poder separar la variable de respuesta de los datos que tenemos disponibles
X: pd.DataFrame = data_final.drop(columns=['Exited'])
y: pd.DataFrame = data_final['Exited']

In [19]:
# Separar los datos en datos de entrenamiento y testing
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

## Modelado

In [20]:
# Crear el modelo y entrenarlo
clf_lin =  LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
clf_lin.fit(X_train, y_train)

In [21]:
y_pred = clf_lin.predict(X_test)

conf_matrix: np.ndarray = confusion_matrix(y_test, y_pred)

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.72      0.81      1607
           1       0.38      0.71      0.50       393

    accuracy                           0.72      2000
   macro avg       0.65      0.72      0.65      2000
weighted avg       0.81      0.72      0.75      2000



In [23]:
px.imshow(conf_matrix, title="Confusion Matrix", text_auto=True, aspect="auto")

Se observa una mejora sustancial en el modelo. Sin embargo, el recall no es bueno. Esto se debe al desbalance de los datos.

In [24]:
# Generar el binario del modelo para reutilizarlo, equivalencia de variables categoricas y caracteristicas del modelo
import pickle
pickle.dump(clf_lin, open("churn/models/model.pk", "wb"))