# 01-03: Classificazione Logistic Regression e PyTorch

**3.1 Si usino i dati dell'Es 2 per prevedere la variabile group, usando tutte le altre tranne age. È richiesto di fornire la previsione per ciascuno degli ultimi 200 id, sia "hard" (Adult/Senior), sia "soft" con le probabilità delle due classi.**

## EDA e PCA

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [2]:
base = pd.read_excel("inml25tst01.xlsx", sheet_name="Es 2")
df = base.drop(columns=base.columns[0:9])

df = df.set_index("id")
df = df.sort_index()          # ordina per id

df_test  = df.tail(200)
df_train = df.iloc[:-200]

# ALTERNATIVA
# df_train = df[df["age"].isna() != True]
# df_test = df[df["age"].isna()]

In [3]:
X_train = df_train.drop(columns=["age", "group"])
y_train = df_train["group"].values

print(df_train.shape)
print(X_train.columns)

X_train = X_train.values

print(X_train.shape)
print(y_train.shape)

(2078, 9)
Index(['gender', 'PA', 'BMI', 'GLU', 'diabetic', 'GLT', 'insulin'], dtype='object')
(2078, 7)
(2078,)


In [4]:
X_test = df_test.drop(columns=["age", "group"])

print(df_test.shape)
print(X_test.columns)

X_test = X_test.values

print(X_test.shape)

(200, 9)
Index(['gender', 'PA', 'BMI', 'GLU', 'diabetic', 'GLT', 'insulin'], dtype='object')
(200, 7)


**ATTENZIONE.** PRIMA LA CALCOLI E POI LA USI, PERCHÈ X_TRAIN CAMBIA, E X_TEST HA I VALORI SBALLATI

In [5]:
mean = np.mean(X_train, axis=0)

In [6]:
X_train = X_train - mean
X_test = X_test - mean

In [7]:
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [8]:
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0, ddof=1)

In [9]:
X_train = ( X_train - mean) / std
X_test = ( X_test - mean) / std

In [10]:
print(np.unique(y_train, return_counts=True))
y_train = (y_train == "Senior").astype(int)
print(np.unique(y_train, return_counts=True))

(array(['Adult', 'Senior'], dtype=object), array([1754,  324]))
(array([0, 1]), array([1754,  324]))


## Logistic Regression (Classificazione Binaria) con sklearn

In [11]:
from sklearn.linear_model import LogisticRegression

**IMPORTANTE.** Le due classi sono molto sbilanciate!!! quindi mettiamo `class_weight = "balanced"`

In [12]:
model = LogisticRegression(class_weight="balanced")
model.fit(X_train, y_train)
hard_pred = model.predict(X_test)
soft_pred = model.predict_proba(X_test)

In [13]:
print(soft_pred[0])
print(np.unique(hard_pred, return_counts=True))

[0.37712635 0.62287365]
(array([0, 1]), array([142,  58]))


In [14]:
result = pd.DataFrame(soft_pred, columns=["Adult", "Senior"])
result["Hard-Class"] = hard_pred
result.head(10)

Unnamed: 0,Adult,Senior,Hard-Class
0,0.377126,0.622874,1
1,0.484201,0.515799,1
2,0.580394,0.419606,0
3,0.77364,0.22636,0
4,0.46658,0.53342,1
5,0.497061,0.502939,1
6,0.344787,0.655213,1
7,0.433641,0.566359,1
8,0.532328,0.467672,0
9,0.552802,0.447198,0


## Classificazione Binaria con PyTorch

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [16]:
X_train_tensor = torch.from_numpy(X_train).to(torch.float)
y_train_tensor = torch.from_numpy(y_train).to(torch.float)
X_test_tensor = torch.from_numpy(X_test).to(torch.float)

In [17]:
epochs = 20
lr = 1e-4
batch = 16

In [18]:
model = nn.Linear(X_train_tensor.shape[1], 1)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [19]:
n_pos = (y_train_tensor == 1).sum()
n_neg = (y_train_tensor == 0).sum()

pos_weight = n_neg / n_pos   # >1 se i positivi sono rari
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [20]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)

In [21]:
model.train()
for epoch in range(epochs):
    
    epoch_loss = 0
    n_batches = 0
    for data, target in train_loader:

        out = model(data).squeeze()
        loss = loss_fn(out, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        n_batches += 1

    print(f"Epoch {epoch} - Loss {round(epoch_loss/n_batches, 4)}")

Epoch 0 - Loss 1.0951
Epoch 1 - Loss 1.0943
Epoch 2 - Loss 1.0942
Epoch 3 - Loss 1.0934
Epoch 4 - Loss 1.0925
Epoch 5 - Loss 1.092
Epoch 6 - Loss 1.0914
Epoch 7 - Loss 1.0922
Epoch 8 - Loss 1.0916
Epoch 9 - Loss 1.0901
Epoch 10 - Loss 1.0898
Epoch 11 - Loss 1.0898
Epoch 12 - Loss 1.0887
Epoch 13 - Loss 1.0891
Epoch 14 - Loss 1.0881
Epoch 15 - Loss 1.0877
Epoch 16 - Loss 1.0873
Epoch 17 - Loss 1.0871
Epoch 18 - Loss 1.0864
Epoch 19 - Loss 1.0857


In [22]:
hard_pred = []
soft_pred = []
model.eval()
with torch.no_grad():

    for sample in X_test_tensor:

        logit = model(sample)

        logit = torch.sigmoid(logit)

        label = (logit >= 0.5)
        
        logit = logit.detach().tolist()[0]
        label = label.detach().to(int).tolist()[0]

        soft_pred.append((1-logit, logit))
        hard_pred.append(label)
        
print(soft_pred[0])
print(np.unique(hard_pred, return_counts=True))

(0.4343456029891968, 0.5656543970108032)
(array([0, 1]), array([165,  35]))


In [23]:
result = pd.DataFrame(soft_pred, columns=["Adult", "Senior"])
result["Hard-Class"] = hard_pred
result.head(10)

Unnamed: 0,Adult,Senior,Hard-Class
0,0.434346,0.565654,1
1,0.551826,0.448174,0
2,0.646597,0.353403,0
3,0.809278,0.190722,0
4,0.524993,0.475007,0
5,0.550599,0.449401,0
6,0.394198,0.605802,1
7,0.474728,0.525272,1
8,0.590127,0.409873,0
9,0.605022,0.394978,0
