<a href="https://colab.research.google.com/github/cesarmartinezg-lgtm/Entregas/blob/main/03%20-%20Enfoque_randomforest_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importación de Librerías

In [None]:
import getpass
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Configuración Kaggle

In [None]:
# Kaggle credentials are taken from the environment and only prompted for
# when they are missing, so no secret is ever stored in the notebook.
# SECURITY: an API key was previously hardcoded in this cell; any key that was
# committed with the notebook should be revoked and regenerated in Kaggle.
if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ').strip()
if not os.environ.get('KAGGLE_KEY'):
    # getpass keeps the key out of the rendered cell output.
    os.environ['KAGGLE_KEY'] = getpass.getpass('Kaggle API key: ').strip()

In [None]:
# Write kaggle.json from the KAGGLE_USERNAME / KAGGLE_KEY environment variables.
# The directory is resolved from the current user's home instead of a
# hardcoded /root, so the directory that is created and the file that is
# written always agree.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
with open(os.path.join(kaggle_dir, 'kaggle.json'), 'w') as f:
    # json.dump escapes quotes/backslashes that manual string formatting would not.
    json.dump({'username': os.environ['KAGGLE_USERNAME'], 'key': os.environ['KAGGLE_KEY']}, f)

In [None]:
# Ajustar permisos
!chmod 600 /root/.kaggle/kaggle.json

# Descarga de Dataset y cargue




In [None]:
# Download the competition data (a zip archive) with the Kaggle CLI
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.29GB/s]


In [None]:
!unzip udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


In [None]:
# Read the extracted train and test CSV files and report their shapes
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
print(f"Train: {df_train.shape}, Test: {df_test.shape}")

Train: (692500, 21), Test: (296786, 20)


In [None]:
# Preprocessing
def preprocess(df, is_train=True):
    """Drop identifier columns, split off the target and impute missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is copied first, so the caller's frame is not modified.
    is_train : bool, default True
        When True and a ``RENDIMIENTO_GLOBAL`` column is present, that column
        is removed from the features and returned as the target.

    Returns
    -------
    tuple
        ``(features, target)``. ``target`` is the ``RENDIMIENTO_GLOBAL``
        Series, or None when ``is_train`` is False or the column is absent.
    """
    df = df.copy()

    # Treat a column as an identifier only when "ID" is a whole
    # underscore-delimited token of its name. A plain substring test would
    # also discard unrelated features whose name merely contains the letters
    # "ID" (for example a name ending in "...UNIVERSIDAD").
    id_cols = [col for col in df.columns if "ID" in str(col).upper().split("_")]
    df = df.drop(columns=id_cols, errors='ignore')
    y = df.pop('RENDIMIENTO_GLOBAL') if is_train and 'RENDIMIENTO_GLOBAL' in df.columns else None

    # Impute missing values by assigning the filled column back; a chained
    # `df[col].fillna(..., inplace=True)` acts on a temporary object and pandas
    # warns that it will stop working in 3.0.
    for col in df.select_dtypes(include=np.number).columns:
        df[col] = df[col].fillna(df[col].median())
    for col in df.select_dtypes(include="object").columns:
        modes = df[col].mode()
        # An all-null column has no mode; fall back to a placeholder category.
        fill_value = modes.iloc[0] if len(modes) > 0 else "DESCONOCIDO"
        df[col] = df[col].fillna(fill_value)

    return df, y

# Build the feature tables; only the training call returns a target, the test-side target slot is discarded.
X_train, y_train = preprocess(df_train, is_train=True)
X_test, _ = preprocess(df_test, is_train=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0] if len(df[col].mode()) > 0 else "DESCONOCIDO", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

In [None]:
# Quick label encoding: one LabelEncoder per text column, fitted on the training values
le_dict = {}
for col in X_train.select_dtypes(include="object").columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))

    # Categories seen only in the test set are appended after the fitted
    # classes (in order of first appearance) so that transform() accepts them.
    test_as_str = X_test[col].astype(str)
    known_classes = set(le.classes_)
    unseen = [val for val in test_as_str.unique() if val not in known_classes]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    X_test[col] = le.transform(test_as_str)

    # Keep the fitted encoder so the mapping can be reused later.
    le_dict[col] = le

In [None]:
# Encode the target labels as integers
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y_train)

# Standardize features: statistics are learned on the training table only
# and then applied to both tables.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simple stratified hold-out split (20% kept for validation)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)


In [None]:
# Model 1: baseline random forest
print("\n1. Random Forest básico...")
rf_base = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
).fit(X_tr, y_tr)
base_val_acc = rf_base.score(X_val, y_val)
print(f"   Val Accuracy: {base_val_acc:.4f}")

# Model 2: larger and deeper forest with a minimum split size
print("\n2. Random Forest optimizado...")
rf_opt = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
).fit(X_tr, y_tr)
opt_val_acc = rf_opt.score(X_val, y_val)
print(f"   Val Accuracy: {opt_val_acc:.4f}")


1. Random Forest básico...
   Val Accuracy: 0.3870

2. Random Forest optimizado...
   Val Accuracy: 0.3867


In [None]:
# Keep whichever forest scores strictly higher on the validation split;
# on a tie the baseline model is kept.
val_acc_opt = rf_opt.score(X_val, y_val)
val_acc_base = rf_base.score(X_val, y_val)
if val_acc_opt > val_acc_base:
    best_rf = rf_opt
else:
    best_rf = rf_base


# Feature importance (top 5)
print("\nTop 5 features más importantes:")
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_rf.feature_importances_
})
feat_imp = importance_table.sort_values('importance', ascending=False).head(5)
print(feat_imp.to_string(index=False))


Top 5 features más importantes:
         feature  importance
E_PRGM_ACADEMICO    0.148197
     INDICADOR_1    0.116134
     INDICADOR_2    0.102740
     INDICADOR_4    0.085397
F_EDUCACIONMADRE    0.078466


In [None]:
# Train on the full training set
print("\n3. Entrenamiento final...")
# NOTE(review): these hyperparameters repeat the rf_opt configuration rather
# than reusing best_rf — confirm that this is the intended final model.
final_rf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=5,
                                  random_state=42, n_jobs=-1)
final_rf.fit(X_train_scaled, y_encoded)

# Predict on the test set and map the integer classes back to the original labels
y_pred = final_rf.predict(X_test_scaled)
y_pred_labels = le_target.inverse_transform(y_pred)

# Submission
# The IDs are taken from the raw test table: preprocess() removes the ID
# column from X_test, and `test_ids` was not defined anywhere before this
# cell, so a fresh kernel raised NameError here. Row order is unchanged by
# preprocessing, so the IDs line up with the predictions.
test_ids = df_test['ID'].to_numpy()
submission = pd.DataFrame({'ID': test_ids, 'RENDIMIENTO_GLOBAL': y_pred_labels})
submission.to_csv('submission_rf_03.csv', index=False)

print(f"\n✓ Submission: submission_rf_03.csv")
print(f"\nDistribución de predicciones:")
print(submission['RENDIMIENTO_GLOBAL'].value_counts())


3. Entrenamiento final...

✓ Submission: submission_rf_03.csv

Distribución de predicciones:
RENDIMIENTO_GLOBAL
alto          93835
bajo          87602
medio-bajo    60694
medio-alto    54655
Name: count, dtype: int64
