# Importación de los datos

In [20]:
import pandas as pd

# Read the penguin measurements into a DataFrame. The path is relative, so it
# is resolved against the directory the notebook kernel is running in.
penguins = pd.read_csv(filepath_or_buffer='../datasets/penguins_size.txt')

# Show the first five rows as plain text to check that the columns parsed.
print(penguins.head(n=5))

  species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen              39.1             18.7                181   
1  Adelie  Torgersen              39.5             17.4                186   
2  Adelie  Torgersen              40.3             18.0                195   
3  Adelie  Torgersen              36.7             19.3                193   
4  Adelie  Torgersen              39.3             20.6                190   

   body_mass_g     sex  
0         3750    MALE  
1         3800  FEMALE  
2         3250  FEMALE  
3         3450  FEMALE  
4         3650    MALE  


# Separación de datos: 80% train, 20% test

In [21]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target.
X = penguins.drop("species", axis="columns")
# Target vector: the species label of each row.
y = penguins.loc[:, "species"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the species
# proportions equal in both splits; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Preprocesamiento
## One-Hot con DictVectorizer
## Estandarización con StandardScaler
## Codificación de la columna objetivo

In [22]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Split the feature columns by kind. Numeric columns are selected explicitly
# (instead of "everything that is not object") so the split does not depend on
# how pandas stores text columns (object vs. a dedicated string dtype).
numerical_cols = X_train.select_dtypes(include="number").columns
categorical_cols = X_train.select_dtypes(exclude="number").columns

# One-hot encode the categorical columns. DictVectorizer consumes one dict per
# row; it is fitted on the training rows only, so the test rows are encoded
# with the vocabulary learned from training.
vectorizer = DictVectorizer(sparse=False)
X_train_dict = X_train[categorical_cols].to_dict(orient="records")
X_test_dict = X_test[categorical_cols].to_dict(orient="records")
X_train_transform = vectorizer.fit_transform(X_train_dict)
X_test_transform = vectorizer.transform(X_test_dict)

# Standardize the numeric columns. The scaler is fitted on the training split
# only and then reused on the test split, so no test statistics leak in.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train[numerical_cols])
X_test_std = scaler.transform(X_test[numerical_cols])

# Assemble the final matrices: scaled numeric columns first, one-hot columns
# after. Anything that reuses scaler/vectorizer must keep this column order.
X_train_prepared = np.hstack((X_train_std, X_train_transform))
X_test_prepared = np.hstack((X_test_std, X_test_transform))

# Encode the species labels as integers. The encoder is fitted on the training
# labels and reused for the test labels so both share the same mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Entrenamiento de los modelos
## Logistic Regression
### Creamos el modelo, lo entrenamos y lo serializamos

In [23]:
import os
import pickle

from sklearn.linear_model import LogisticRegression

# Logistic regression on the prepared feature matrix. max_iter=1000 gives the
# solver more iterations than the library default; random_state is fixed so
# repeated runs are reproducible.
logistic_model = LogisticRegression(
    random_state=42,
    max_iter=1000
)
logistic_model.fit(X_train_prepared, y_train_encoded)

# Persist the classifier together with the fitted scaler and vectorizer so a
# consumer can rebuild the same feature layout (scaled numeric columns first,
# one-hot columns after) before calling predict.
# NOTE(review): label_encoder is not part of the tuple, so whoever loads this
# file has to map the integer classes back to species names on their own.
logistic_path = '../models/logistic.pck'
# Create the output directory on first run; open() alone raises
# FileNotFoundError when '../models' does not exist yet.
os.makedirs(os.path.dirname(logistic_path), exist_ok=True)
with open(logistic_path, 'wb') as file:
    pickle.dump((logistic_model, scaler, vectorizer), file)

## SVM
### Creamos el modelo, lo entrenamos y lo serializamos

In [24]:
import os

from sklearn.svm import SVC

# Linear-kernel support vector classifier. C=1.0 is the regularization
# parameter; probability=True turns on probability estimates (predict_proba);
# random_state is fixed so repeated runs are reproducible.
svm_model = SVC(
    kernel='linear',
    C=1.0,
    random_state=42,
    probability=True
)
svm_model.fit(X_train_prepared, y_train_encoded)

# Persist the classifier together with the fitted scaler and vectorizer so a
# consumer can rebuild the same feature layout (scaled numeric columns first,
# one-hot columns after) before calling predict.
# NOTE(review): label_encoder is not part of the tuple, so whoever loads this
# file has to map the integer classes back to species names on their own.
svm_path = '../models/svm.pck'
# Create the output directory on first run; open() alone raises
# FileNotFoundError when '../models' does not exist yet.
os.makedirs(os.path.dirname(svm_path), exist_ok=True)
with open(svm_path, 'wb') as file:
    pickle.dump((svm_model, scaler, vectorizer), file)

## Decision Trees
### Creamos el modelo, lo entrenamos y lo serializamos

In [25]:
import os

from sklearn.tree import DecisionTreeClassifier

# Decision tree using Gini impurity as the split criterion. max_depth=4 caps
# how deep the tree may grow; random_state is fixed so repeated runs are
# reproducible.
dtree_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=42
)
dtree_model.fit(X_train_prepared, y_train_encoded)

# Persist the classifier together with the fitted scaler and vectorizer so a
# consumer can rebuild the same feature layout (scaled numeric columns first,
# one-hot columns after) before calling predict.
# NOTE(review): label_encoder is not part of the tuple, so whoever loads this
# file has to map the integer classes back to species names on their own.
dtree_path = '../models/dtree.pck'
# Create the output directory on first run; open() alone raises
# FileNotFoundError when '../models' does not exist yet.
os.makedirs(os.path.dirname(dtree_path), exist_ok=True)
with open(dtree_path, 'wb') as file:
    pickle.dump((dtree_model, scaler, vectorizer), file)

## KNN
### Creamos el modelo, lo entrenamos y lo serializamos

In [26]:
import os

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=3. weights='uniform' lets every
# neighbour vote equally; the Minkowski metric with p=2 is Euclidean distance.
knn_model = KNeighborsClassifier(
    n_neighbors=3,
    weights='uniform',
    p=2,
    metric='minkowski'
)
knn_model.fit(X_train_prepared, y_train_encoded)

# Persist the classifier together with the fitted scaler and vectorizer so a
# consumer can rebuild the same feature layout (scaled numeric columns first,
# one-hot columns after) before calling predict.
# NOTE(review): label_encoder is not part of the tuple, so whoever loads this
# file has to map the integer classes back to species names on their own.
knn_path = '../models/knn.pck'
# Create the output directory on first run; open() alone raises
# FileNotFoundError when '../models' does not exist yet.
os.makedirs(os.path.dirname(knn_path), exist_ok=True)
with open(knn_path, 'wb') as file:
    pickle.dump((knn_model, scaler, vectorizer), file)