In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Load the training data and the processed test data
df_train = pd.read_csv("../data/train/train.csv")
df_test = pd.read_csv("../data/processed/df_test_proce.csv")

# Separate the features from the target column
X = df_train.drop('price_range', axis=1)
y = df_train['price_range']

# Split BEFORE scaling so the validation rows never influence the scaler's
# mean/variance (fitting the scaler on the full set leaks validation statistics
# into training). Same test_size/random_state as before, so the same rows are held out.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then reuse it for every other set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Scaled version of the full labelled set, kept for the cells that read X_scaled
X_scaled = scaler.transform(X)

# Test data preparation: apply the training-set scaling to the test features
X_test_scaled = scaler.transform(df_test)


Modelo 1 - Random Forest (Regularización + Cross-Validation)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random Forest with a capped tree depth; 5-fold cross-validation on the training split
rf = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100)
cv_scores_rf = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=5)

# fit() returns the estimator itself, so the fitted model is still the cell's displayed value
rf.fit(X_train, y_train)

In [None]:
# Predict once per dataset: the validation split (scored below) and the scaled test set
y_pred_rf, y_test_pred_rf = (rf.predict(features) for features in (X_val, X_test_scaled))

# Evaluation: each tuple is printed as one line, label first
for line in (
    ("Random Forest Classifier:",),
    ("Cross-Validation Scores:", cv_scores_rf),
    ("Mean CV Score:", cv_scores_rf.mean()),
    ("Accuracy on Validation:", accuracy_score(y_val, y_pred_rf)),
    ("Classification Report:\n", classification_report(y_val, y_pred_rf)),
):
    print(*line)


Modelo 2 - Support Vector Classifier (SVC) (Regularización + Cross-Validation)

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVC with regularization parameter C; 5-fold cross-validation on the training split
svc = SVC(kernel='rbf', C=1.0, random_state=42)
cv_scores_svc = cross_val_score(estimator=svc, X=X_train, y=y_train, cv=5)

# fit() returns the estimator itself, so the fitted model is still the cell's displayed value
svc.fit(X_train, y_train)


In [None]:
# Predict once per dataset: the validation split (scored below) and the scaled test set
y_pred_svc, y_test_pred_svc = (svc.predict(features) for features in (X_val, X_test_scaled))

# Evaluation: each tuple is printed as one line, label first
for line in (
    ("Support Vector Classifier:",),
    ("Cross-Validation Scores:", cv_scores_svc),
    ("Mean CV Score:", cv_scores_svc.mean()),
    ("Accuracy on Validation:", accuracy_score(y_val, y_pred_svc)),
    ("Classification Report:\n", classification_report(y_val, y_pred_svc)),
):
    print(*line)

Modelo 3 - Gradient Boosting (Regularización + Cross-Validation)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting with a fixed number of boosting stages; 5-fold cross-validation
# on the training split. Note: no early-stopping option is passed to the estimator here.
gb = GradientBoostingClassifier(random_state=42, max_depth=5, learning_rate=0.1, n_estimators=100)
cv_scores_gb = cross_val_score(estimator=gb, X=X_train, y=y_train, cv=5)

# fit() returns the estimator itself, so the fitted model is still the cell's displayed value
gb.fit(X_train, y_train)

In [None]:
# Predict once per dataset: the validation split (scored below) and the scaled test set
y_pred_gb, y_test_pred_gb = (gb.predict(features) for features in (X_val, X_test_scaled))

# Evaluation: each tuple is printed as one line, label first
for line in (
    ("Gradient Boosting Classifier:",),
    ("Cross-Validation Scores:", cv_scores_gb),
    ("Mean CV Score:", cv_scores_gb.mean()),
    ("Accuracy on Validation:", accuracy_score(y_val, y_pred_gb)),
    ("Classification Report:\n", classification_report(y_val, y_pred_gb)),
):
    print(*line)


Modelo 4 - Logistic Regression (Regularización L2)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression (regularization strength set through C); 5-fold cross-validation
# on the training split.
# NOTE(review): liblinear handles multiclass targets one-vs-rest, and recent scikit-learn
# releases deprecate that combination — confirm against the installed version.
lr = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
cv_scores_lr = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5)

# fit() returns the estimator itself, so the fitted model is still the cell's displayed value
lr.fit(X_train, y_train)


In [None]:
# Predict once per dataset: the validation split (scored below) and the scaled test set
y_pred_lr, y_test_pred_lr = (lr.predict(features) for features in (X_val, X_test_scaled))

# Evaluation: each tuple is printed as one line, label first
for line in (
    ("Logistic Regression:",),
    ("Cross-Validation Scores:", cv_scores_lr),
    ("Mean CV Score:", cv_scores_lr.mean()),
    ("Accuracy on Validation:", accuracy_score(y_val, y_pred_lr)),
    ("Classification Report:\n", classification_report(y_val, y_pred_lr)),
):
    print(*line)

K-Means Clustering (Modelo No Supervisado)

In [None]:
from sklearn.cluster import KMeans

# KMeans with an explicit n_init (presumably 4 clusters to mirror the 4 price_range
# classes — confirm)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X_scaled)
clusters = kmeans.predict(X_val)
clusters_test = kmeans.predict(X_test_scaled)

# Validation rows: cross-tabulate the true class against the assigned cluster
print("KMeans Clustering (Validation Data):")
print(pd.crosstab(y_val, clusters, rownames=["price_range"], colnames=["cluster"]))

# Test rows: no true labels are used here, so report how many rows fall in each cluster.
# (Cross-tabulating clusters_test against itself only yields a diagonal of these counts.)
print("KMeans Clustering (Test Data):")
print(pd.Series(clusters_test, name="cluster").value_counts().sort_index())


Interpretación de Variables 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Rank the Random Forest feature importances from most to least important
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
ranked_names = X.columns[indices].tolist()
ranked_scores = importances[indices]

# Bar chart of the ranked importances, one bar per feature
plt.figure(figsize=(12, 6))
sns.barplot(x=ranked_names, y=ranked_scores)
plt.xticks(rotation=90)
plt.title("Feature Importances - Random Forest")
plt.show()




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Helper that builds a confusion matrix and shows it as a heatmap
def plot_confusion_matrix(y_true, y_pred, title):
    """Compute the confusion matrix of ``y_true`` vs ``y_pred`` and display it.

    Args:
        y_true: true labels, passed as the first argument to ``confusion_matrix``.
        y_pred: predicted labels, passed as the second argument to ``confusion_matrix``.
        title: text used as the figure title.

    Rows are labelled as true labels and columns as predicted labels; tick labels
    are ``Class 0`` ... ``Class k-1`` by position in the matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    n_rows, n_cols = cm.shape
    row_names = [f'Class {i}' for i in range(n_rows)]
    col_names = [f'Class {i}' for i in range(n_cols)]

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=col_names,
        yticklabels=row_names,
    )
    plt.title(title)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

# Draw one confusion-matrix heatmap per classifier, each against the validation labels:
# Random Forest, SVC, Gradient Boosting, Logistic Regression (in that order)
for val_predictions, heading in (
    (y_pred_rf, 'Random Forest Confusion Matrix'),
    (y_pred_svc, 'SVC Confusion Matrix'),
    (y_pred_gb, 'Gradient Boosting Confusion Matrix'),
    (y_pred_lr, 'Logistic Regression Confusion Matrix'),
):
    plot_confusion_matrix(y_val, val_predictions, heading)


In [None]:
# y_score_rf is printed below but no earlier cell defines it, so a fresh
# "Restart & Run All" would stop here with a NameError. Compute the Random Forest
# class probabilities for the validation rows in this cell.
y_score_rf = rf.predict_proba(X_val)

# Sanity check: the three arrays should have the same number of rows
print(f"Dimensiones de X_val: {X_val.shape}")
print(f"Dimensiones de y_val: {y_val.shape}")
print(f"Dimensiones de y_score_rf: {y_score_rf.shape}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Binarize the validation labels (one indicator column per class) for the per-class ROC curves.
# This must be built from y_val, not from the full y: train_test_split shuffles the rows,
# so the first len(y_val) rows of the full target do not correspond to the rows of X_val.
y_bin = label_binarize(y_val, classes=[0, 1, 2, 3])
n_classes = y_bin.shape[1]

def plot_roc_curve(y_true, y_score, n_classes, label):
    """Plot one ROC curve per class on a single figure and show it.

    Args:
        y_true: 2-D array; column ``i`` is used as the binary ground truth for class ``i``.
        y_score: 2-D array; column ``i`` is used as the score for class ``i``.
        n_classes: number of columns (classes) to draw.
        label: model name used in the legend entries and in the title.
    """
    plt.figure(figsize=(12, 8))

    # One curve per class, with its AUC in the legend entry
    for class_idx in range(n_classes):
        false_pos, true_pos, _thresholds = roc_curve(y_true[:, class_idx], y_score[:, class_idx])
        area = auc(false_pos, true_pos)
        plt.plot(false_pos, true_pos, label=f'{label} (class {class_idx}) AUC = {area:.2f}')

    # Dashed diagonal as the chance-level reference line
    plt.plot([0, 1], [0, 1], 'k--')

    # Axes limits, labels, title and legend
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title(f'Receiver Operating Characteristic - {label}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.legend(loc='lower right')
    plt.show()

# Confirm the dimensions of the validation split
print(f"Dimensiones de X_val: {X_val.shape}")
print(f"Dimensiones de y_val: {y_val.shape}")

# Estimators used below to obtain class-probability predictions
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Ground truth for the ROC curves: binarize the validation labels themselves.
# Slicing the binarized full target (y_bin[:len(y_val)]) is wrong because
# train_test_split shuffles, so those rows do not line up with the rows of X_val.
y_val_bin = label_binarize(y_val, classes=[0, 1, 2, 3])

# Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
y_score_rf = rf.predict_proba(X_val)  # class probabilities for the validation rows
plot_roc_curve(y_val_bin, y_score_rf, n_classes, 'Random Forest')

# Support Vector Classifier (refit with probability=True so predict_proba is available)
svc = SVC(C=1.0, kernel='rbf', probability=True, random_state=42)
svc.fit(X_train, y_train)
y_score_svc = svc.predict_proba(X_val)  # class probabilities for the validation rows
plot_roc_curve(y_val_bin, y_score_svc, n_classes, 'SVC')

# Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb.fit(X_train, y_train)
y_score_gb = gb.predict_proba(X_val)  # class probabilities for the validation rows
plot_roc_curve(y_val_bin, y_score_gb, n_classes, 'Gradient Boosting')

# Logistic Regression
lr = LogisticRegression(C=1.0, solver='liblinear', random_state=42)
lr.fit(X_train, y_train)
y_score_lr = lr.predict_proba(X_val)  # class probabilities for the validation rows
plot_roc_curve(y_val_bin, y_score_lr, n_classes, 'Logistic Regression')
