In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import preprocessing

In [2]:
## 1- Preparacion de los datos
# 1.1- Carga desde fichero y mostrar estadisticas
X = pd.read_csv('./dataset_churn_all.csv', index_col=0)

# Valores vacios se inicializan a 0.0 si se deciden incluirse. Leer REAMDE.md
#X['TotalCharges'].replace([' '],[0.0], inplace=True) 
X['Churn'].replace(['No','Yes'],[0,1], inplace=True)

# Valores vacios se eliminan, leer README.md
X = X.loc[(X['TotalCharges']!=' ')]

#Se muestran principales estadigrafos del modelo
print(X.describe())

       SeniorCitizen       tenure  MonthlyCharges        Churn
count    7032.000000  7032.000000     7032.000000  7032.000000
mean        0.162400    32.421786       64.798208     0.265785
std         0.368844    24.545260       30.085974     0.441782
min         0.000000     1.000000       18.250000     0.000000
25%         0.000000     9.000000       35.587500     0.000000
50%         0.000000    29.000000       70.350000     0.000000
75%         0.000000    55.000000       89.862500     1.000000
max         1.000000    72.000000      118.750000     1.000000


In [3]:
# 1.2 - Transformar variables tipo categorica a numericas
le = preprocessing.LabelEncoder()
X['gender'] = le.fit_transform(X['gender'])
X['Partner'] = le.fit_transform(X['Partner'])
X['Dependents'] = le.fit_transform(X['Dependents'])
X['PhoneService'] = le.fit_transform(X['PhoneService'])
X['MultipleLines'] = le.fit_transform(X['MultipleLines'])
X['InternetService'] = le.fit_transform(X['InternetService'])
X['OnlineSecurity'] = le.fit_transform(X['OnlineSecurity'])
X['OnlineBackup'] = le.fit_transform(X['OnlineBackup'])
X['DeviceProtection'] = le.fit_transform(X['DeviceProtection'])
X['TechSupport'] = le.fit_transform(X['TechSupport'])
X['StreamingTV'] = le.fit_transform(X['StreamingTV'])
X['StreamingMovies'] = le.fit_transform(X['StreamingMovies'])
X['Contract'] = le.fit_transform(X['Contract'])
X['PaperlessBilling'] = le.fit_transform(X['PaperlessBilling'])
X['PaymentMethod'] = le.fit_transform(X['PaymentMethod'])

In [4]:
# 2- Dividir la muestra en entrenamiento con 90% y prueba 10%

X1 = X.iloc[:int(len(X)*0.9)]
X2 = X.iloc[int(len(X)*0.1)*-1:]
X_train = X1.drop(['Churn'], axis=1)
X_test  = X2.drop(['Churn'], axis=1)

y_train = X1['Churn']
y_test  = X2['Churn']

In [None]:
# 3- Se selecciona algoritmo randomForest
#    Optimizar rendimiento del modelo
#    Buscar los parametros optimos con GridSearchCV, 
#    Utilizando validacion cruzada, para mejor generalizacion del modeo, evitando el sobre ajuste.
rf = RandomForestClassifier(criterion='entropy',random_state=42)

pipeline = Pipeline([('rf',  rf),])

# Parametros del espacio de busqueda asociado a RandomForest
parameters = {
	'rf__n_estimators': (100,200, 300),
	'rf__max_depth': (5, 15, 25),
	'rf__min_samples_split': (2,3),
	'rf__min_samples_leaf': (1,2 ),
	#'rf__class_weight': ({'No':1, 'Yes':4}, {'No':1, 'Yes':10}),
    'rf__class_weight': ({0:1, 1:4}, {0:1, 1:10}),
	'rf__max_features': (5,10,15)
}

# Metricas de precision a utilizar para la optimizacion de los parametros
scorers = {
    #'f1': make_scorer(f1_score, labels= ('No', 'Yes'), pos_label ='Yes')   
    'f1': make_scorer(f1_score)   
 }

# Buscar mejor ajuste
grid_search = GridSearchCV(pipeline, parameters,  n_jobs=-1, verbose=1, scoring=scorers['f1'], 
	   error_score='raise', cv=5)
grid_search.fit(X_train, y_train)	

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [None]:
# Mostrar los parametros optimos del modelo
print("Mejor score: %0.3f" % grid_search.best_score_)
print("Mejor conjunto de parametros:")
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
	print("\t%s: %r" % (param_name, best_parameters[param_name]))


In [None]:
#Mostrar el rendimiento del modelo
from sklearn.metrics import classification_report

predictions = grid_search.predict(X_test)
#print(classification_report(y_test, predictions, target_names=('No', 'Yes',), labels=('No', 'Yes',), digits=4))
print(classification_report(y_test, predictions, digits=4))

In [None]:
#Mostrar matrix de confusion
disp = ConfusionMatrixDisplay.from_predictions(y_test, predictions)

In [None]:
# Validacion cruzada del modelo con 5 particiones
import numpy as np
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_test, y_test, cv=5 ) #por defecto 5 particiones
print(scores) #los valores individuales de cada partición
print(np.mean(scores)) #la media de acierto global

In [None]:
# Validacion cruzada del modelo con 7 particiones
scores = cross_val_score(rf, X_test, y_test, cv=7) #por defecto 5 particiones
print(scores) #los valores individuales de cada partición
print(np.mean(scores)) #la media de acierto global

In [None]:
# Validacion cruzada del modelo con 10 particiones
scores = cross_val_score(rf, X_test, y_test, cv=10) #por defecto 5 particiones
print(scores) #los valores individuales de cada partición
print(np.mean(scores)) #la media de acierto global

In [None]:
# Validacion cruzada del modelo con 15 particiones
scores = cross_val_score(rf, X_test, y_test, cv=15) #por defecto 5 particiones
print(scores) #los valores individuales de cada partición
print(np.mean(scores)) #la media de acierto global