<a href="https://colab.research.google.com/github/Herrera1022/Proyecto-Kaggle/blob/main/03_modelo_con_preprocesado_de_tal_forma_y_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===============================================================
# 03 - modelo con preprocesado de tal forma y SVM.ipynb
# ===============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ================================
# 1. Cargar datos
# ================================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Columns in train DataFrame:", train.columns)

test_ids = test['ID']

X = train.drop(["RENDIMIENTO_GLOBAL", "ID"], axis=1)
y = train["RENDIMIENTO_GLOBAL"]

test_features = test.drop(["ID"], axis=1)

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

# ================================
# PIPELINES OPTIMIZADOS
# ================================
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# ================================
# 2. Preprocesamiento
# ================================
X_processed = preprocessor.fit_transform(X)
test_processed = preprocessor.transform(test_features)

# ================================
# 3. División train / valid
# ================================
X_train, X_valid, y_train, y_valid = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# ================================
# 4. Modelo SVM ultrarrápido
# ================================
svm_model = LinearSVC(
    C=1.0,
    max_iter=7000
)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)

print("Accuracy SVM:", acc)

# ================================
# 5. Predicciones para Kaggle
# ================================
test_predictions = svm_model.predict(test_processed)

submission = pd.DataFrame({
    "id": test_ids,
    "label": test_predictions
})

submission.to_csv("submission_svm.csv", index=False)

print("Archivo submission_svm.csv generado correctamente.")


Columns in train DataFrame: Index(['ID', 'PERIODO_ACADEMICO', 'E_PRGM_ACADEMICO', 'E_PRGM_DEPARTAMENTO',
       'E_VALORMATRICULAUNIVERSIDAD', 'E_HORASSEMANATRABAJA',
       'F_ESTRATOVIVIENDA', 'F_TIENEINTERNET', 'F_EDUCACIONPADRE',
       'F_TIENELAVADORA', 'F_TIENEAUTOMOVIL', 'E_PRIVADO_LIBERTAD',
       'E_PAGOMATRICULAPROPIO', 'F_TIENECOMPUTADOR', 'F_TIENEINTERNET.1',
       'F_EDUCACIONMADRE', 'RENDIMIENTO_GLOBAL', 'INDICADOR_1', 'INDICADOR_2',
       'INDICADOR_3', 'INDICADOR_4'],
      dtype='object')
Accuracy SVM: 0.37609386281588447
Archivo submission_svm.csv generado correctamente.
