# Entrenamiento de modelos relacionados con Support Vector Machines

In [14]:
# General import and load data
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

from sklearn.utils import resample

# Training and test spliting
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Preprocessing 
from sklearn.preprocessing import StandardScaler

# Estimators
from sklearn.svm import SVC

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Optimization
from sklearn.model_selection import GridSearchCV

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(color_codes=True)

In [15]:
# constants
data_folder="../../../data"
raw_data_folder = f"{data_folder}/raw"
processed_data_folder = f"{data_folder}/processed"
submissions_folder = f"{data_folder}/submissions"

original_train_dataset_path = f"{raw_data_folder}/train.csv"
original_test_dataset_path = f"{raw_data_folder}/test_nolabel.csv"

train_dataset_processed_path = f"{processed_data_folder}/train_processed.csv"
train_dataset_balanced_processed_path = f"{processed_data_folder}/train_balanced_processed.csv"
test_nolabel_processed_path = f"{processed_data_folder}/test_nolabel_processed.csv"

## Preparación

Carga de datos y creación de dataframes para el posterior entrenamiento y predicción

In [16]:
train_df = pd.read_csv(train_dataset_processed_path)
train_balanced_df = pd.read_csv(train_dataset_balanced_processed_path)
test_nolabel_df = pd.read_csv(test_nolabel_processed_path)

In [17]:
def train_svc_with_df(
        df: pd.DataFrame, 
        columns: list[str],
        kernel: str = "rbf",
) -> tuple[SVC, classification_report]:
    
    model = SVC(
        kernel=kernel, 
        random_state=42,
        gamma="auto"
    )

    x = df[columns]
    y = df["Accept"]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
    
    trained_model = model.fit(x_train, y_train)

    y_pred = trained_model.predict(x_test)

    # Evaluar el modelo
    # cm = confusion_matrix(y_test, y_pred)
    # acc = accuracy_score(y_test, y_pred)
    # prec = precision_score(y_test, y_pred)
    # rec = recall_score(y_test, y_pred)
    # f1 = f1_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return trained_model, report

   

Prueba básica

In [18]:
columns_to_train = train_balanced_df.columns.to_list()
columns_to_train.remove("Accept")

svc_trained, report = train_svc_with_df(
    df=train_balanced_df,
    columns=columns_to_train
)

print(report)

              precision    recall  f1-score   support

           0       0.62      0.71      0.66       858
           1       0.68      0.59      0.63       898

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756



Probar efecto del kernel

In [6]:
columns_to_train = train_balanced_df.columns.to_list()
columns_to_train.remove("Accept")

for kernel in ["linear", "poly", "rbf", "sigmoid", "precomputed"]:
    svc_trained, report = train_svc_with_df(
        df=train_balanced_df,
        columns=columns_to_train,
        class_weight=None
    )
    print(f"Report utilizando el kernel {kernel}")
    print(report)

Report utilizando el kernel linear
              precision    recall  f1-score   support

           0       0.62      0.71      0.66       858
           1       0.68      0.59      0.63       898

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756

Report utilizando el kernel poly
              precision    recall  f1-score   support

           0       0.62      0.71      0.66       858
           1       0.68      0.59      0.63       898

    accuracy                           0.65      1756
   macro avg       0.65      0.65      0.65      1756
weighted avg       0.65      0.65      0.65      1756

Report utilizando el kernel rbf
              precision    recall  f1-score   support

           0       0.62      0.71      0.66       858
           1       0.68      0.59      0.63       898

    accuracy                           0.65      1756
   macro avg       0.65     

## Creación de entregable

In [7]:
def create_submission(
        model: SVC,
        test_df: pd.DataFrame,
        submission_name: str
):
    
    test_ids = test_df["id"]

    test_prediction_columns = test_df.columns.to_list()
    test_prediction_columns.remove("id")
    test_prediction_data = test_df[test_prediction_columns]

    predcition = model.predict(test_prediction_data)

    predcition_df = pd.DataFrame({
        "id": test_ids,
        "Accept": predcition
    })

    predcition_df.to_csv(f"{submissions_folder}/{submission_name}.csv", sep=",", index=False)

In [8]:
create_submission(
    model=svc_trained,
    test_df=test_nolabel_df,
    submission_name="SVC_rbf"
)