# 01-03: Classificazione Logistic Regression e PyTorch

**3.1 Si usino i dati dell'Es 2 per prevedere la variabile group, usando tutte le altre tranne age. È richiesto di fornire la previsione per ciascuno degli ultimi 200 id, sia "hard" (Adult/Senior), sia "soft" con le probabilità delle due classi.**

In [2574]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
base = pd.read_excel("inml25tst01.xlsx", sheet_name="Es 2")
df = base.drop(columns=base.columns[0:9])

df = df.set_index("id")
df = df.sort_index()          # ordina per id

df_test  = df.tail(200)
df_train = df.iloc[:-200]

# ALTERNATIVA
# df_train = df[df["age"].isna() != True]
# df_test = df[df["age"].isna()]

In [2576]:
X_train = df_train.drop(columns=["age", "group"])
y_train = df_train["group"].values

print(df_train.shape)
print(X_train.columns)

X_train = X_train.values

print(X_train.shape)
print(y_train.shape)

(2078, 9)
Index(['gender', 'PA', 'BMI', 'GLU', 'diabetic', 'GLT', 'insulin'], dtype='object')
(2078, 7)
(2078,)


In [2577]:
X_test = df_test.drop(columns=["age", "group"])

print(df_test.shape)
print(X_test.columns)

X_test = X_test.values

print(X_test.shape)

(200, 9)
Index(['gender', 'PA', 'BMI', 'GLU', 'diabetic', 'GLT', 'insulin'], dtype='object')
(200, 7)


**ATTENZIONE.** PRIMA LA CALCOLI E POI LA USI, PERCHÈ X_TRAIN CAMBIA, E X_TEST HA I VALORI SBALLATI

In [2578]:
mean = np.mean(X_train, axis=0)

In [2579]:
X_train = X_train - mean
X_test = X_test - mean

In [2580]:
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [2581]:
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0, ddof=1)

In [2582]:
X_train = ( X_train - mean) / std
X_test = ( X_test - mean) / std

In [2583]:
print(np.unique(y_train, return_counts=True))
y_train = (y_train == "Senior").astype(int)
print(np.unique(y_train, return_counts=True))

(array(['Adult', 'Senior'], dtype=object), array([1754,  324]))
(array([0, 1]), array([1754,  324]))


## Predizione (SKLEARN)

In [2584]:
from sklearn.linear_model import LogisticRegression

**IMPORTANTE.** Le due classi sono molto sbilanciate!!! quindi mettiamo `class_weight = "balanced"`

In [2585]:
model = LogisticRegression(class_weight="balanced")
model.fit(X_train, y_train)
hard_pred = model.predict(X_test)
soft_pred = model.predict_proba(X_test)

In [2586]:
print(soft_pred[0])
print(np.unique(hard_pred, return_counts=True))

[0.37712635 0.62287365]
(array([0, 1]), array([142,  58]))


In [2598]:
result = pd.DataFrame(soft_pred, columns=["Adult", "Senior"])
result["Hard-Class"] = hard_pred
result.head(10)

Unnamed: 0,Adult,Senior,Hard-Class
0,0.46815,0.53185,1
1,0.337974,0.662026,1
2,0.269127,0.730873,1
3,0.23919,0.76081,1
4,0.412775,0.587225,1
5,0.426228,0.573772,1
6,0.537999,0.462001,0
7,0.558185,0.441815,0
8,0.368082,0.631918,1
9,0.389986,0.610014,1


## Predizione (PyTorch)

In [2587]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2588]:
X_train_tensor = torch.from_numpy(X_train).to(torch.float)
y_train_tensor = torch.from_numpy(y_train).to(torch.float)
X_test_tensor = torch.from_numpy(X_test).to(torch.float)

In [2589]:
epochs = 20
lr = 1e-4
batch = 16

In [2590]:
model = nn.Linear(X_train_tensor.shape[1], 1)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [2591]:
n_pos = (y_train_tensor == 1).sum()
n_neg = (y_train_tensor == 0).sum()

pos_weight = n_neg / n_pos   # >1 se i positivi sono rari
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [None]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)

In [2593]:
model.train()
for epoch in range(epochs):
    
    epoch_loss = 0
    n_batches = 0
    for data, target in train_loader:

        out = model(data).squeeze()
        loss = loss_fn(out, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        n_batches += 1

    print(f"Epoch {epoch} - Loss {round(epoch_loss/n_batches, 4)}")

Epoch 0 - Loss 1.491
Epoch 1 - Loss 1.4842
Epoch 2 - Loss 1.4775
Epoch 3 - Loss 1.4709
Epoch 4 - Loss 1.4643
Epoch 5 - Loss 1.4579
Epoch 6 - Loss 1.4515
Epoch 7 - Loss 1.4452
Epoch 8 - Loss 1.439
Epoch 9 - Loss 1.4329
Epoch 10 - Loss 1.4268
Epoch 11 - Loss 1.4209
Epoch 12 - Loss 1.415
Epoch 13 - Loss 1.4092
Epoch 14 - Loss 1.4034
Epoch 15 - Loss 1.3978
Epoch 16 - Loss 1.3922
Epoch 17 - Loss 1.3867
Epoch 18 - Loss 1.3812
Epoch 19 - Loss 1.3759


In [2594]:
hard_pred = []
soft_pred = []
model.eval()
with torch.no_grad():

    for sample in X_test_tensor:

        logit = model(sample)

        logit = torch.sigmoid(logit)

        label = (logit >= 0.5)
        
        logit = logit.detach().tolist()[0]
        label = label.detach().to(int).tolist()[0]

        soft_pred.append((1-logit, logit))
        hard_pred.append(label)
        
print(soft_pred[0])
print(np.unique(hard_pred, return_counts=True))

(0.46814966201782227, 0.5318503379821777)
(array([0, 1]), array([ 19, 181]))


In [2599]:
result = pd.DataFrame(soft_pred, columns=["Adult", "Senior"])
result["Hard-Class"] = hard_pred
result.head(10)

Unnamed: 0,Adult,Senior,Hard-Class
0,0.46815,0.53185,1
1,0.337974,0.662026,1
2,0.269127,0.730873,1
3,0.23919,0.76081,1
4,0.412775,0.587225,1
5,0.426228,0.573772,1
6,0.537999,0.462001,0
7,0.558185,0.441815,0
8,0.368082,0.631918,1
9,0.389986,0.610014,1
