In [37]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Leer csv
df_train = pd.read_csv('/content/drive/MyDrive/TP Proyecto/Entrega 3/Codigo E3/tiendas_caba_m2_train.csv')
df_val = pd.read_csv('/content/drive/MyDrive/TP Proyecto/Entrega 3/Codigo E3/tiendas_caba_m2_test.csv')

In [32]:
df_train = df_train.drop(columns=['customer_id'])
df_val = df_val.drop(columns=['customer_id'])

In [33]:
X_train = df_train.drop(columns=['cluster'])  # Variables independientes (predictoras)
y_train = df_train['cluster']  # Variable dependiente (target)

In [34]:
X_val = df_val.drop(columns=['cluster'])  # Variables independientes en validación
y_val = df_val['cluster']  # Variable dependiente en validación

In [35]:
# Identificar columnas booleanas
boolean_cols = X_train.columns[X_train.nunique() == 2]  # Columnas con solo 2 valores únicos (0 y 1)
non_boolean_cols = X_train.columns.difference(boolean_cols)  # Columnas no booleanas

# Escalar solo las columnas no booleanas
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()

X_train_scaled[non_boolean_cols] = scaler.fit_transform(X_train[non_boolean_cols])
X_val_scaled[non_boolean_cols] = scaler.transform(X_val[non_boolean_cols])

In [36]:
# Crear el modelo de regresión logística
model = LogisticRegression(max_iter=1000)

# Definir los parámetros para grid search
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],  # Ampliamos los valores de regularización
    'solver': ['lbfgs', 'liblinear', 'saga', 'newton-cg', 'sag'],  # Exploramos más solvers
    'penalty': ['l1', 'l2', 'none', 'elasticnet'],  # Incluimos elasticnet (solo con 'saga')
    'max_iter': [100, 200, 500, 1000, 2000],  # Más opciones de iteraciones
    'class_weight': [None, 'balanced'],  # Consideramos balance de clases
    'tol': [1e-5, 1e-4, 1e-3, 1e-2],  # Exploramos diferentes tolerancias
    'multi_class': ['ovr', 'multinomial'],  # Consideramos la clasificación binaria y multiclase
    'l1_ratio': [0.1, 0.5, 0.7, 0.9],  # Solo para 'saga' y 'elasticnet'
}
# GridSearchCV con validación cruzada
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

# Entrenar el modelo con grid search
grid_search.fit(X_train_scaled, y_train)

# Mostrar los mejores parámetros
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

# Predecir con el mejor modelo en el conjunto de validación
y_pred = grid_search.best_estimator_.predict(X_val_scaled)

# Evaluar el rendimiento
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
187200 fits failed out of a total of 288000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14400 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/pytho

Mejores hiperparámetros encontrados:
{'C': 0.001, 'class_weight': None, 'l1_ratio': 0.1, 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'liblinear', 'tol': 1e-05}
Accuracy: 0.42317380352644834
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.31      0.34       159
           1       0.44      0.70      0.54       169
           2       1.00      0.01      0.03        69

    accuracy                           0.42       397
   macro avg       0.61      0.34      0.30       397
weighted avg       0.52      0.42      0.37       397



