In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import pickle
from datetime import datetime


In [146]:
# Cambiar directorio
os.chdir("C:/Users/y0m01v1/Documents/laboratorio-machine-learning-main")

In [147]:
# Cargar los datos
data = pd.read_csv("data/churn.csv")

In [148]:
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [149]:
#Eliminar columnas innecesarias:
data = data.drop(data.columns[0:3], axis=1)

# Recorre todas las columnas restantes del DataFrame una por una
for i, column in enumerate(data.columns):
    #Verifica si la columna es de tipo categórico o texto
    if data[column].dtype == "object":
        # rellena valores faltantes con el valor más frecuente (la moda)
        data[column] = data[column].fillna(data[column].mode()[0])
        #Convierte la columna categórica en números
        data[column] = data[column].astype("category").cat.codes
    else:
        #Rellena los valores faltantes en columnas numéricas con la mediana de esa columna
        data[column] = data[column].fillna(data[column].median())

In [150]:
# Separar características y etiquetas
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [151]:
# Balancear con SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [152]:
# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.33, random_state=42)

In [None]:
# Diccionario con los hiperparámetros que queremos probar en la búsqueda aleatoria
parametros = {
    # Valores de regularización (inverso de la fuerza de regularización)
    'C': [0.01, 0.1, 1, 10, 100],
    # Tipo de penalización (L2 es la regularización de Ridge)
    'penalty': ['l2'],
    # Algoritmos para optimizar la función de costo
    'solver': ['lbfgs', 'saga'], 
    # Estrategia para clasificación multiclase
    'multi_class': ['multinomial']
}

# Crear una instancia del modelo de regresión logística con un número máximo de iteraciones
model = LogisticRegression(max_iter=1000)

# Configuramos la búsqueda aleatoria de hiperparámetros
# - model: el modelo base
# - parametros: el espacio de búsqueda
# - n_iter: número de combinaciones aleatorias a probar
# - cv: número de particiones para validación cruzada
# - scoring: métrica de evaluación (en este caso, precisión)
Search = RandomizedSearchCV(model, parametros, n_iter=10, cv=3, scoring='accuracy')

# Entrenamos el modelo con las combinaciones de hiperparámetros sobre los datos de entrenamiento
Search.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
print(Search.best_params_)

{'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'multinomial', 'C': 1}


In [None]:
print(Search.best_score_)

0.7730084396605331


In [156]:
better_model = Search.best_estimator_

In [157]:
# Evaluar

y_pred = better_model.predict(X_test)

print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Matriz de confusión:
 [[2015  647]
 [ 590 2004]]
Accuracy: 0.7646499238964992
Precision: 0.7559411542814033
Recall: 0.7725520431765613
F1 Score: 0.7641563393708294


In [None]:
# Crear carpeta si no existe
os.makedirs("../laboratorio-machine-learning-main/churn/models", exist_ok=True)

# Guardar el modelo con fecha en el nombre
modelo_path = f"../laboratorio-machine-learning-main/churn/models/better_model.pk"

with open(modelo_path, 'wb') as f:
    pickle.dump(better_model, f)

print(f"Modelo guardado en: {modelo_path}")

Modelo guardado en: ../laboratorio-machine-learning-main/churn/models/better_model.pk


In [None]:
features = X.columns.tolist()

with open("../laboratorio-machine-learning-main/churn/models/features_retrain.pk", 'wb') as f:
    pickle.dump(features, f)

print("Características guardadas.")

Características guardadas.


In [None]:
categorical_mappings = {}

for column in data.columns:
    if data[column].dtype.name == "category":
        categorical_mappings[column] = dict(enumerate(data[column].cat.categories))

with open("../laboratorio-machine-learning-main/churn/models/categorical_mappings.pk", 'wb') as f:
    pickle.dump(categorical_mappings, f)

print("Mapeos categóricos guardados.")

Mapeos categóricos guardados.
