In [10]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer

# Load the "penguins" example dataset through seaborn and preview the first rows.
df = sns.load_dataset(name="penguins")
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [11]:
# Inspect the dtype of every column to tell string (object) columns apart from numeric ones.
df.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [12]:
# Drop every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

In [13]:
# Separate the prediction target (species) from the predictor columns.
y = df['species']
X = df.drop('species', axis=1)

In [14]:
# Split into training (80%) and test (20%) sets.
# stratify=y keeps the class proportions of the target the same in both
# partitions, so the small test set cannot under-represent a class by chance.
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
# Sanity check: report how many rows ended up in each partition.
print(
    f"Mida del conjunt d'entrenament: {X_train.shape[0]} exemples",
    f"Mida del conjunt de prova: {X_test.shape[0]} exemples",
    sep="\n",
)

Mida del conjunt d'entrenament: 266 exemples
Mida del conjunt de prova: 67 exemples


In [16]:
# Encode the categorical variables with DictVectorizer and standardize the
# resulting feature matrix with StandardScaler. Both transformers are fitted on
# the training split only and then applied unchanged to the test split.

# Encoding: one dict per row -> dense feature matrix.
dv = DictVectorizer(sparse=False)
X_train_dict = X_train.to_dict(orient='records')
X_train_enc = dv.fit_transform(X_train_dict)
X_test_dict = X_test.to_dict(orient='records')
X_test_enc = dv.transform(X_test_dict)

# Scaling: learn the statistics from the training matrix, then transform both.
scaler = StandardScaler()
scaler.fit(X_train_enc)
X_train_scaled = scaler.transform(X_train_enc)
X_test_scaled = scaler.transform(X_test_enc)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Candidate classifiers to compare, keyed by display name.
# random_state is fixed on the estimators that use randomness internally so a
# re-run of the notebook rebuilds the same models: the decision tree randomly
# permutes features while searching for splits, and SVC(probability=True)
# shuffles the data for the cross-validation behind its probability estimates.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "SVM": SVC(probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# Fit each candidate on the scaled training split and report its accuracy on
# the scaled test split.
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy = {acc:.2f}")

Logistic Regression: Accuracy = 1.00
SVM: Accuracy = 1.00
Decision Tree: Accuracy = 1.00
KNN: Accuracy = 0.99


In [20]:
import os
import pickle

# Create the output directory if it does not exist yet.
os.makedirs("models", exist_ok=True)

# Save one pickle per model. Each file stores the (vectorizer, scaler, model)
# triple so the same preprocessing objects travel with the estimator.
for name in models:
    model = models[name]
    file_path = f'models/{name.replace(" ", "_").lower()}_model.pkl'
    with open(file_path, mode='wb') as f:
        pickle.dump((dv, scaler, model), f)
    print(f"Model guardat a: {file_path}")


Model guardat a: models/logistic_regression_model.pkl
Model guardat a: models/svm_model.pkl
Model guardat a: models/decision_tree_model.pkl
Model guardat a: models/knn_model.pkl
