In [16]:
# Donwload the penguins dataset from seaborn
import pandas as pd

import seaborn as sns

# Load the penguins dataset
df = sns.load_dataset("penguins")
# Drop the rows with missing values
df.dropna(inplace=True)
# Convert the species column to a numeric value
df['species'] = pd.factorize(df['species'])[0].astype(int)

df.species.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [32]:
from sklearn.model_selection import train_test_split
# Split the dataset into train, validation and test sets
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(
    df_train_full, test_size=0.33, random_state=1)
y_train = df_train.species.values
y_val = df_val.species.values

del df_train['species']
del df_val['species']

df_train.head().T

Unnamed: 0,136,158,298,83,45
island,Dream,Dream,Biscoe,Torgersen,Dream
bill_length_mm,35.6,46.1,45.2,35.1,39.6
bill_depth_mm,17.5,18.2,13.8,19.4,18.8
flipper_length_mm,191.0,178.0,215.0,193.0,190.0
body_mass_g,3175.0,3250.0,4750.0,4200.0,4600.0
sex,Female,Female,Female,Male,Male


In [33]:
categorical = ['island', 'sex']
numerical = ['bill_length_mm', 'bill_depth_mm',
             'flipper_length_mm', 'body_mass_g']

In [34]:
from sklearn.metrics import mutual_info_score

# mutual information for categorical variables
def calculate_mi(col): return mutual_info_score(col, df_train_full.species)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
island,0.498816
sex,0.002896


In [35]:
# Correlation with the target for numerical variables
print(df_train_full[numerical].corrwith(df_train_full.species))

bill_length_mm       0.741803
bill_depth_mm       -0.742992
flipper_length_mm    0.863216
body_mass_g          0.770591
dtype: float64


In [36]:
# More correlation with variable flipper_length_mm 
# Important variable island

# Convert the dataframe to a dictionary
train_dict = df_train[categorical + numerical].to_dict(orient='records')  
train_dict[0] 

{'island': 'Dream',
 'sex': 'Female',
 'bill_length_mm': 35.6,
 'bill_depth_mm': 17.5,
 'flipper_length_mm': 191.0,
 'body_mass_g': 3175.0}

In [37]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [38]:
X_train = dv.transform(train_dict)
X_train[0]

array([1.750e+01, 3.560e+01, 3.175e+03, 1.910e+02, 0.000e+00, 1.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00])

In [39]:
dv.get_feature_names_out()

array(['bill_depth_mm', 'bill_length_mm', 'body_mass_g',
       'flipper_length_mm', 'island=Biscoe', 'island=Dream',
       'island=Torgersen', 'sex=Female', 'sex=Male'], dtype=object)

## Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [45]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [46]:
y_pred = model.predict_proba(X_val)
y_pred

array([[9.99469548e-01, 4.78469131e-06, 5.25667030e-04],
       [9.98845771e-01, 8.20188205e-04, 3.34040656e-04],
       [3.32932990e-05, 3.48984259e-04, 9.99617722e-01],
       [1.77219874e-02, 9.82145421e-01, 1.32591569e-04],
       [4.02827407e-01, 5.89987299e-01, 7.18529372e-03],
       [9.93494941e-01, 6.06190733e-03, 4.43151218e-04],
       [9.91645613e-01, 1.41882982e-03, 6.93555672e-03],
       [9.21721142e-01, 7.74208128e-02, 8.58044894e-04],
       [7.65853731e-06, 3.27827629e-03, 9.96714065e-01],
       [4.54858213e-01, 5.40635085e-01, 4.50670212e-03],
       [9.96971487e-01, 2.07535084e-03, 9.53162177e-04],
       [8.43618954e-03, 1.49532510e-06, 9.91562315e-01],
       [9.99710809e-01, 8.41122551e-06, 2.80779296e-04],
       [9.97551512e-01, 2.27178563e-03, 1.76702115e-04],
       [1.80163592e-03, 2.42461843e-07, 9.98198122e-01],
       [5.41951029e-09, 9.98951041e-01, 1.04895379e-03],
       [1.33665002e-03, 9.98591831e-01, 7.15187553e-05],
       [9.61112204e-01, 3.88448

## SVM

In [41]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0, random_state=1, probability=True)

svm.fit(X_train, y_train)

## Decission Trees

In [42]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='gini', max_depth=4,
                            random_state=1)
dt.fit(X_train, y_train)

## KNN

In [43]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski')

knn.fit(X_train, y_train)

In [47]:
import pickle

with open('../models/species-model.pck', 'wb') as f:
    pickle.dump((dv, model), f)
with open('../models/species-svm.pck', 'wb') as f:
    pickle.dump((dv, svm), f)
with open('../models/species-dt.pck', 'wb') as f:
    pickle.dump((dv, dt), f)
with open('../models/species-knn.pck', 'wb') as f:
    pickle.dump((dv, knn), f)