# Entrenamiento de modelos relacionados con Support Vector Machines

In [None]:
# General-purpose imports
import time
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

from sklearn.utils import resample

# Training and test splitting
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Preprocessing 
from sklearn.preprocessing import StandardScaler

# Estimators
from sklearn.svm import SVC

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Optimization
from sklearn.model_selection import GridSearchCV

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(color_codes=True)

In [None]:
# Path constants (relative paths; every dataset lives under `data_folder`)
data_folder = "../../../data"
raw_data_folder = data_folder + "/raw"
processed_data_folder = data_folder + "/processed"
submissions_folder = data_folder + "/submissions"

# Files read from the raw folder
original_train_dataset_path = raw_data_folder + "/train.csv"
original_test_dataset_path = raw_data_folder + "/test_nolabel.csv"

# Files read from the processed folder
train_dataset_processed_path = processed_data_folder + "/train_processed.csv"
train_dataset_balanced_processed_path = processed_data_folder + "/train_balanced_processed.csv"
test_nolabel_processed_path = processed_data_folder + "/test_nolabel_processed.csv"

## Preparación

Carga de datos y creación de dataframes para el posterior entrenamiento y predicción

In [None]:
# Read the three processed CSV files (training, balanced training, unlabeled test)
# in one pass; the generator keeps the original read order.
train_df, train_balanced_df, test_nolabel_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        train_dataset_processed_path,
        train_dataset_balanced_processed_path,
        test_nolabel_processed_path,
    )
)

In [None]:
def train_svc_with_df(
        df: pd.DataFrame,
        columns: list[str],
        kernel: str = "rbf",
        test_size: float = 0.3,
        random_state: int = 42,
) -> tuple[SVC, str]:
    """Fit an SVC on a hold-out split of ``df`` and report its test metrics.

    Args:
        df: Frame holding the feature columns and the ``"Accept"`` target column.
        columns: Names of the columns of ``df`` used as features.
        kernel: Kernel passed to ``SVC``.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used both for the split and for the estimator.

    Returns:
        A ``(model, report)`` tuple: the fitted ``SVC`` and the text produced by
        ``classification_report`` on the held-out rows.

    Raises:
        KeyError: If ``"Accept"`` or any name in ``columns`` is missing from ``df``.
    """
    features = df[columns]
    target = df["Accept"]

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = SVC(kernel=kernel, random_state=random_state, gamma="auto")
    model.fit(x_train, y_train)

    # Evaluate on the rows the model never saw during fitting.
    report = classification_report(y_test, model.predict(x_test))

    return model, report

   

Prueba básica

In [None]:
# Use every column except the "Accept" target as a feature.
columns_to_train = list(train_balanced_df.columns)
columns_to_train.remove("Accept")

# Baseline: SVC with the function's default settings on the balanced frame.
svc_trained, report = train_svc_with_df(train_balanced_df, columns_to_train)

print(report)

              precision    recall  f1-score   support

           0       0.62      0.71      0.66       858
           1       0.68      0.59      0.63       898

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756



Función para realizar una búsqueda en rejilla (GridSearchCV) con las columnas y los hiperparámetros provistos

In [None]:
def train_svc_using_gridsearch(
        df: pd.DataFrame,
        columns: list[str],
        param_grid: dict[str, list],
        test_size: float = 0.3,
        random_state: int = 42,
        verbose: int = 3,
) -> tuple[SVC, str]:
    """Grid-search SVC hyperparameters on a hold-out split of ``df``.

    Args:
        df: Frame holding the feature columns and the ``"Accept"`` target column.
        columns: Names of the columns of ``df`` used as features.
        param_grid: Mapping from ``SVC`` parameter name to the values to try.
        test_size: Fraction of rows held out for the final evaluation.
        random_state: Seed used both for the split and for the estimator.
        verbose: Verbosity level forwarded to ``GridSearchCV``.

    Returns:
        A ``(best_estimator, report)`` tuple: the best estimator found by the
        search and the text produced by ``classification_report`` on the
        held-out rows.

    Raises:
        KeyError: If ``"Accept"`` or any name in ``columns`` is missing from ``df``.
    """
    features = df[columns]
    target = df["Accept"]

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    search = GridSearchCV(
        SVC(random_state=random_state),
        param_grid=param_grid,
        verbose=verbose,
        n_jobs=-1,  # use every available core
    )
    search.fit(x_train, y_train)
    print(f"Best params found: {search.best_params_}")

    # Predicting through the search object uses its best estimator.
    report = classification_report(y_test, search.predict(x_test))

    return search.best_estimator_, report
 

In [None]:
# Display the column names of the balanced training frame.
train_balanced_df.columns

Index(['NewExist', 'UrbanRural', 'RevLineCr', 'LowDoc', 'Accept',
       'BankStateInOhio', 'ApprovalDateMonth', 'ApprovalFYGrouped',
       'NoEmpGrouped', 'CreateJobBinary', 'RetainedJobBinary', 'IsFranchise',
       'DisbursementGrossGrouped'],
      dtype='object')

SVC con grid

In [None]:
# Feature columns handed to the grid search (the "Accept" target is not listed).
columns_to_train_grid_all = [
    'NewExist',
    'UrbanRural',
    'RevLineCr',
    'LowDoc',
    'BankStateInOhio',
    'ApprovalDateMonth',
    'ApprovalFYGrouped',
    'NoEmpGrouped',
    'CreateJobBinary',
    'RetainedJobBinary',
    'IsFranchise',
    'DisbursementGrossGrouped'
]

svc_grid_best, report = train_svc_using_gridsearch(
    df=train_balanced_df,
    columns=columns_to_train_grid_all,
    param_grid={
        # Other available kernel: "poly". `degree` is not searched because it
        # only affects the poly kernel.
        "kernel": ["rbf", "linear", "sigmoid"],
        "gamma": ["auto", "scale", 1, 0.1, 0.01],
        # The previous range(1, 10, 100) contained only the value 1 (start=1,
        # stop=10, step=100), so C was never actually searched.
        "C": [1, 10, 100],
        "shrinking": [True, False],
        # `decision_function_shape` is not searched: scikit-learn ignores it for
        # binary classification, so it only doubled the number of fits.
    },
)

print(report)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[CV 1/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=rbf, shrinking=True;, score=0.676 total time=   0.8s
[CV 1/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=linear, shrinking=False;, score=0.587 total time=   0.8s
[CV 3/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=rbf, shrinking=False;, score=0.665 total time=   0.7s
[CV 5/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=sigmoid, shrinking=True;, score=0.504 total time=   0.6s
[CV 3/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=rbf, shrinking=True;, score=0.665 total time=   0.7s
[CV 1/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=sigmoid, shrinking=True;, score=0.505 total time=   0.7s
[CV 3/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=linear, shrinking=False;, score=0.628 total time=   0.9s
[CV 4/5] END C=1, decision_function_shape=ovo, gamma=auto, kernel=rbf, shrinking=True;, score=0.673 total time=   0.9s
[CV 4/5] END C=1, decision_func

In [None]:
from imblearn.over_sampling import SMOTE

# Step 1: separate features (X) and target (y)
columns_to_train = list(train_balanced_df.columns)
columns_to_train.remove("Accept")

X, y = train_balanced_df[columns_to_train], train_balanced_df["Accept"]

# Step 2: oversample with SMOTE
# NOTE(review): SMOTE runs on the whole frame here, i.e. before the train/test
# split performed inside train_svc_with_df, so the evaluation rows are resampled too.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Step 3: rebuild one frame holding the resampled features plus the target
train_smote_df = pd.DataFrame(X_res, columns=columns_to_train).assign(Accept=y_res)

# Step 4: train the model on the resampled frame
svc_trained, report = train_svc_with_df(train_smote_df, columns_to_train)

print(report)


              precision    recall  f1-score   support

           0       0.62      0.71      0.66       858
           1       0.68      0.59      0.63       898

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# 1. Features and target
X = train_balanced_df.drop(columns="Accept")
y = train_balanced_df["Accept"]

# 2. Hold out the test rows BEFORE applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 3. Oversample the training portion only
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# 4. Hyperparameter search with GridSearchCV, scored by F1 over 5 folds
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=["rbf", "linear"],
    gamma=["scale", 0.01, 0.1],
)

grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring="f1", cv=5)
grid.fit(X_res, y_res)

# 5. Evaluate the best model on the untouched test rows
best_svc = grid.best_estimator_
print("Mejores parámetros:", grid.best_params_)

y_pred = best_svc.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


Mejores parámetros: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.65      0.64      0.64       878
           1       0.65      0.66      0.65       878

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# === 1. GENERAL PREPARATION ===

X = train_balanced_df.drop(columns="Accept")
y = train_balanced_df["Accept"]

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# === 2. SVM WITH SMOTE ===
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

param_grid = dict(
    C=[0.1, 1, 10],
    kernel=["rbf", "linear"],
    gamma=["scale", 0.01, 0.1],
)

grid_smote = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring="f1", cv=5)
grid_smote.fit(X_res, y_res)
svc_smote = grid_smote.best_estimator_

smote_predictions = svc_smote.predict(X_test)
print(" SVM + SMOTE - Mejores parámetros:", grid_smote.best_params_)
print(" SVM + SMOTE Report:")
print(classification_report(y_test, smote_predictions))

# === 3. SVM WITH CLASS_WEIGHT='balanced' (no SMOTE) ===
# Hyperparameters are fixed by hand rather than read from the search above.
svc_balanced = SVC(kernel="rbf", C=10, gamma=0.01, class_weight="balanced")
svc_balanced.fit(X_train, y_train)

balanced_predictions = svc_balanced.predict(X_test)
print(" SVM + class_weight='balanced' Report:")
print(classification_report(y_test, balanced_predictions))


📌 SVM + SMOTE - Mejores parámetros: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
🔍 SVM + SMOTE Report:
              precision    recall  f1-score   support

           0       0.65      0.64      0.64       878
           1       0.65      0.66      0.65       878

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756

🔍 SVM + class_weight='balanced' Report:
              precision    recall  f1-score   support

           0       0.65      0.64      0.64       878
           1       0.65      0.66      0.65       878

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756



In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 1. Load data
df = pd.read_csv(train_dataset_balanced_processed_path)
X = df.drop("Accept", axis=1)
y = df["Accept"]

# 2. Split FIRST, so the feature scores below are computed without looking at
#    the labels of the rows later used for evaluation.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 3. Feature selection (top 8), fitted on the training rows only
selector = SelectKBest(score_func=f_classif, k=8)
X_train = selector.fit_transform(X_train_full, y_train)
X_test = selector.transform(X_test_full)
# Whole dataset reduced to the selected columns (name kept from the earlier version).
X_selected = selector.transform(X)

# Optional: show which columns were selected
selected_features = X.columns[selector.get_support()]
print(" Columnas seleccionadas:", list(selected_features))

# 4. Train the SVM with class_weight
model = SVC(C=10, gamma=0.01, kernel='rbf', class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# 5. Evaluate
y_pred = model.predict(X_test)
print(" SVM con selección de features y class_weight='balanced'")
print(classification_report(y_test, y_pred))


📌 Columnas seleccionadas: ['UrbanRural', 'RevLineCr', 'BankStateInOhio', 'ApprovalFYGrouped', 'NoEmpGrouped', 'RetainedJobBinary', 'IsFranchise', 'DisbursementGrossGrouped']
🔍 SVM con selección de features y class_weight='balanced'
              precision    recall  f1-score   support

           0       0.65      0.59      0.62       878
           1       0.62      0.68      0.65       878

    accuracy                           0.63      1756
   macro avg       0.64      0.63      0.63      1756
weighted avg       0.64      0.63      0.63      1756



In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 1. Features and target, defined here so the cell does not depend on
#    variables left behind by another cell.
X = train_balanced_df.drop(columns="Accept")
y = train_balanced_df["Accept"]

# 2. Train/test split, done before feature selection so the evaluation rows
#    do not influence which columns are kept.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 3. Select the 8 best features using the training rows only
selector = SelectKBest(score_func=f_classif, k=8)
X_train = selector.fit_transform(X_train_full, y_train)
X_test = selector.transform(X_test_full)
# Whole dataset reduced to the selected columns (name kept from the earlier version).
X_selected = selector.transform(X)
selected_features = X.columns[selector.get_support()]
print(" Columnas seleccionadas:", list(selected_features))

# 4. Hyperparameter search
param_grid = {
    'C': [0.1, 1, 10, 20],
    'gamma': ['scale', 0.01, 0.1],
    'kernel': ['rbf']
}

grid = GridSearchCV(SVC(class_weight='balanced'), param_grid, scoring='f1', cv=5)
grid.fit(X_train, y_train)

# 5. Evaluation
best_model = grid.best_estimator_
print(" Mejores parámetros:", grid.best_params_)

y_pred = best_model.predict(X_test)
print(" Clasificación con modelo optimizado (8 features)")
print(classification_report(y_test, y_pred))


📌 Columnas seleccionadas: ['UrbanRural', 'RevLineCr', 'BankStateInOhio', 'ApprovalFYGrouped', 'NoEmpGrouped', 'RetainedJobBinary', 'IsFranchise', 'DisbursementGrossGrouped']
🎯 Mejores parámetros: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
🔍 Clasificación con modelo optimizado (8 features)
              precision    recall  f1-score   support

           0       0.65      0.59      0.62       878
           1       0.62      0.68      0.65       878

    accuracy                           0.63      1756
   macro avg       0.64      0.63      0.63      1756
weighted avg       0.64      0.63      0.63      1756



## Creación de entregable

In [None]:
def create_submission(
        model: SVC,
        test_df: pd.DataFrame,
        submission_name: str
):
    """Predict ``Accept`` for ``test_df`` and write the result as a CSV file.

    The file is written to ``{submissions_folder}/{submission_name}.csv`` with
    two columns, ``id`` and ``Accept``. The submissions folder is created when
    it does not exist yet.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_df: Frame with an ``"id"`` column; every other column is passed
            to ``model.predict`` as a feature.
        submission_name: File name of the submission, without extension.

    Raises:
        KeyError: If ``test_df`` has no ``"id"`` column.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    test_ids = test_df["id"]

    # Everything except the identifier is a model input.
    test_prediction_data = test_df.drop(columns="id")
    prediction = model.predict(test_prediction_data)

    prediction_df = pd.DataFrame({
        "id": test_ids,
        "Accept": prediction
    })

    # to_csv does not create missing directories, so make sure the target exists.
    output_dir = Path(submissions_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    prediction_df.to_csv(output_dir / f"{submission_name}.csv", sep=",", index=False)

In [None]:
# The current timestamp is appended so each run writes a differently named file.
submission_name = f"SVC_rbf_{time.time()}"
create_submission(svc_trained, test_nolabel_df, submission_name)