In [1]:
import pandas as pd

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline



In [2]:
train_feature = pd.read_csv("data/training_set_features.csv")
train_label = pd.read_csv("data/training_set_labels.csv")

In [9]:
train_label.columns

Index(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], dtype='object')

In [6]:
train = train_feature.merge(train_label, on="respondent_id")

In [14]:
num_train = train.select_dtypes(include="number")
cat_train = train.select_dtypes(include="object")
num_train.fillna(0, inplace=True)
cat_train.fillna("NA", inplace=True)

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [17]:
from sklearn.model_selection import train_test_split

In [16]:
X = train.drop(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], axis=1)
y1 = train.h1n1_vaccine
y2 = train.seasonal_vaccine

In [19]:
seed = 42
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.33, random_state=seed)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.33, random_state=seed)

In [20]:
scaler1 = StandardScaler()
one1 = OneHotEncoder()

In [23]:
num_X = scaler1.fit_transform(X1_train.select_dtypes(include="number"))
cat_X = one1.fit_transform(X1_train.select_dtypes(include="object")).toarray()

In [26]:
X_train = np.concatenate([num_X, cat_X], axis=1)

In [33]:
num_X_test = scaler1.transform(X1_test.select_dtypes(include="number"))
cat_X_test = one1.transform(X1_test.select_dtypes(include="object")).toarray()
X_test = np.concatenate([num_X_test, cat_X_test], axis=1)

In [30]:
from sklearn.metrics import roc_auc_score

## Modeling

In [37]:
import warnings

warnings.filterwarnings("ignore")

### baseline model

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [40]:
zip(["logistic", "random forest", "gbt", "lgb"], [lr, rf, gbc, lgc])

<zip at 0x2c15f4e1980>

In [42]:
lr = LogisticRegression()
rf = RandomForestClassifier()
gbc = GradientBoostingClassifier()
lgc = LGBMClassifier()
for name, model in zip(["logistic", "random forest", "gbt", "lgb"], [lr, rf, gbc, lgc]):
    print(name, model)
    model.fit(X_train, y1_train)
    score = roc_auc_score(y1_test, model.predict(X_test))
    print(name, score)

logistic LogisticRegression()
logistic 0.7093012878177386
random forest RandomForestClassifier()
random forest 0.6836263965950857
gbt GradientBoostingClassifier()
gbt 0.7163507751137407
lgb LGBMClassifier()
[LightGBM] [Info] Number of positive: 3806, number of negative: 14087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 276
[LightGBM] [Info] Number of data points in the train set: 17893, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212709 -> initscore=-1.308674
[LightGBM] [Info] Start training from score -1.308674
lgb 0.7266387661069703


In [43]:
import torch

In [44]:
import torch.nn as nn

In [48]:
from torch.utils.data import Dataset, DataLoader

In [54]:
from torch.optim import Adam

In [59]:
class Net(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self._classifier = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=64),
            nn.ReLU(),
            nn.Linear(64,64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def __call__(self,x):
        return self._classifier(x)

In [60]:
class FluData(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [61]:
train = FluData(X_train, y1_train.values)
test = FluData(X_test, y1_test.values)

In [62]:
train_loader = DataLoader(train, batch_size=128, shuffle=True)
test_loader = DataLoader(test, batch_size=128)

In [63]:
model = Net(in_features=X_train.shape[1])
optimizer = Adam(model.parameters())

In [68]:
for bx,by in train_loader:
    print(bx, by)
    break

tensor([[ 0.4202,  1.1941, -0.2268,  ...,  0.0000,  1.0000,  0.0000],
        [-0.6762, -0.4150, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4202,  1.1941, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 1.5166,  1.1941, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        [-0.6762, -0.4150, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4202,  1.1941, -0.2268,  ...,  0.0000,  1.0000,  0.0000]],
       dtype=torch.float64) tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0])


In [73]:
fc1 = nn.Linear(in_features=112, out_features=64)

In [78]:
bx = bx.to(torch.float32)

In [64]:
loss_func = nn.BCELoss()

In [86]:
for epoch in range(30):
    total_loss = []
    for bx, by in train_loader:
        bx = bx.to(torch.float32)
        by = by.to(torch.float32).unsqueeze(-1)
        optimizer.zero_grad()
        preds = model(bx)
        loss = loss_func(preds,by)
        total_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    print(f"epoch: {epoch}, loss: {np.sum(total_loss)}")

epoch: 0, loss: 16.268003445118666
epoch: 1, loss: 15.884165544062853
epoch: 2, loss: 15.74986532703042
epoch: 3, loss: 15.364351734519005
epoch: 4, loss: 14.978433512151241
epoch: 5, loss: 14.67753440886736
epoch: 6, loss: 14.72731278091669
epoch: 7, loss: 14.199302449822426
epoch: 8, loss: 13.648936126381159
epoch: 9, loss: 13.522769037634134
epoch: 10, loss: 12.930487006902695
epoch: 11, loss: 12.685715809464455
epoch: 12, loss: 12.683749370276928
epoch: 13, loss: 13.042882777750492
epoch: 14, loss: 11.972236808389425
epoch: 15, loss: 12.061587736010551
epoch: 16, loss: 11.941999055445194
epoch: 17, loss: 11.930406771600246
epoch: 18, loss: 11.273524982854724
epoch: 19, loss: 10.864340204745531
epoch: 20, loss: 10.368359858170152
epoch: 21, loss: 10.501597162336111
epoch: 22, loss: 10.62877612002194
epoch: 23, loss: 9.887769235298038
epoch: 24, loss: 9.814839897677302
epoch: 25, loss: 9.55767765082419
epoch: 26, loss: 9.239249454811215
epoch: 27, loss: 9.296334573999047
epoch: 28, l