In [293]:
# Use black formatter
%load_ext lab_black

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score

RANDOM_SEED = 42

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


#### Carga y procesamiento de datos

In [294]:
FILE_PATH = "/tf/notebooks/CEIA-inteligencia_artificial/TP_final/dataset/possum.csv"

original_df = pd.read_csv(FILE_PATH, index_col=0)
df = original_df.copy()
df

Unnamed: 0_level_0,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


Convertimos las columnas Pop y sex valores numericos 0 o 1 para poder trabajar con ellas. A continuación se muestra como se realizará el encoding.

|  Columna   |    valor   |  Encoding  | 
| :--------: | :--------: | :--------: |
| Pop        | Vic        | 1          |
| Pop        | other      | 0          |
| sex        | m          | 1          |
| sex        | f          | 0          |

In [295]:
pop_binary = pd.get_dummies(df.Pop)
pop_binary.pop("other")
pop_binary.columns = pop_binary.columns.str.lower()
sex_binary = pd.get_dummies(df.sex)
sex_binary.pop("f")

df = df.join([pop_binary, sex_binary])
df.pop("Pop")
df.pop("sex")
df

Unnamed: 0_level_0,site,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,vic,m
case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0,1,1
2,1,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0,1,0
3,1,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0,1,0
4,1,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0,1,0
5,1,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,7,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0,0,1
101,7,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0,0,1
102,7,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0,0,0
103,7,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0,0,1


Chequeamos si hay observaciones con valor NaN.

In [296]:
df.isnull().sum()

site        0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
vic         0
m           0
dtype: int64

Como faltan algunos datos en la columna age, cuyos valores son números enteros, no seria buena idea completar con valores como por ejemplo la media, por eso se decide repetir el valor de la columna anterior. Esto se leva a cabo utilizando el parámetro *method="pad"* en la funcion *fillna*.

In [297]:
df.fillna(method="pad", inplace=True)
df.isnull().sum()

site        0
age         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
vic         0
m           0
dtype: int64

#### a) Train test split
Es una buena práctica realizar primero la división del set de datos en conjuntos de train y test. De esta manera evitamos contaminar accidentalmente el conjunto de entrenamiento con información de test. Si esto sucediese posiblemente los resultados al evaluar el modelo en el set de test arrojarian valores superiores, pero al utilizarse el modelo en producción la performance sería menor.
Debido a que hay 104 muestras una division 80/20 es un buen ratio para los sets de train y test.

In [298]:
X = df.drop("m", axis=1).to_numpy()
y = df["m"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

In [299]:
pca = PCA(n_components=0.9, random_state=RANDOM_SEED).fit(X_train)
X_train_pca = pca.transform(X_train)
print("Componentes que acumulan 90% de la varianza:", pca.n_components_)

Componentes que acumulan 90% de la varianza: 5


In [300]:
scaler = StandardScaler().fit(X_train_pca)
X_train_pca_scaled = scaler.transform(X_train_pca)

In [301]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_pca, y_train)


def print_metrics(y_true, y_pred):
    metrics = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
    }
    for name, metric in metrics.items():
        result = metric(y_true, y_pred)
        print(f"{name}: {result}")


X_test_pca = pca.transform(X_test)
X_test_pca_scaled = scaler.transform(X_test_pca)
print_metrics(y_test, logistic_regression.predict(X_test_pca_scaled))

Accuracy: 0.7142857142857143
Precision: 0.7058823529411765
Recall: 0.9230769230769231
