In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [3]:
train_feature = pd.read_csv("data/training_set_features.csv")
train_label = pd.read_csv("data/training_set_labels.csv")

In [4]:
train_label.columns

Index(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], dtype='object')

In [5]:
train = train_feature.merge(train_label, on="respondent_id")

In [6]:
num_train = train.select_dtypes(include="number")
cat_train = train.select_dtypes(include="object")
num_train.fillna(0, inplace=True)
cat_train.fillna("NA", inplace=True)

In [20]:
train = pd.concat([num_train, cat_train], axis=1)

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X = train.drop(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], axis=1)
y1 = train.h1n1_vaccine
y2 = train.seasonal_vaccine

In [24]:
seed = 42
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.33, random_state=seed)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.33, random_state=seed)

In [25]:
scaler1 = StandardScaler()
one1 = OneHotEncoder()

In [26]:
num_X = scaler1.fit_transform(X1_train.select_dtypes(include="number"))
cat_X = one1.fit_transform(X1_train.select_dtypes(include="object")).toarray()

In [27]:
X_train = np.concatenate([num_X, cat_X], axis=1)

In [28]:
num_X_test = scaler1.transform(X1_test.select_dtypes(include="number"))
cat_X_test = one1.transform(X1_test.select_dtypes(include="object")).toarray()
X_test = np.concatenate([num_X_test, cat_X_test], axis=1)

In [29]:
from sklearn.metrics import roc_auc_score

## Modeling

In [30]:
import warnings

warnings.filterwarnings("ignore")

### baseline model

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [32]:
lr = LogisticRegression()
rf = RandomForestClassifier()
gbc = GradientBoostingClassifier()
lgc = LGBMClassifier()
for name, model in zip(["logistic", "random forest", "gbt", "lgb"], [lr, rf, gbc, lgc]):
    print(name, model)
    model.fit(X_train, y1_train)
    score = roc_auc_score(y1_test, model.predict(X_test))
    print(name, score)

logistic LogisticRegression()
logistic 0.708961637989236
random forest RandomForestClassifier()
random forest 0.6909659002978622
gbt GradientBoostingClassifier()
gbt 0.7163507751137407
lgb LGBMClassifier()
[LightGBM] [Info] Number of positive: 3806, number of negative: 14087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 276
[LightGBM] [Info] Number of data points in the train set: 17893, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212709 -> initscore=-1.308674
[LightGBM] [Info] Start training from score -1.308674
lgb 0.7266387661069703


In [33]:
import torch

In [34]:
import torch.nn as nn

In [35]:
from torch.utils.data import Dataset, DataLoader

In [36]:
from torch.optim import Adam

In [37]:
class Net(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self._classifier = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=64),
            nn.ReLU(),
            nn.Linear(64,64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def __call__(self,x):
        return self._classifier(x)

In [38]:
class FluData(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [39]:
train = FluData(X_train, y1_train.values)
test = FluData(X_test, y1_test.values)

In [40]:
train_loader = DataLoader(train, batch_size=128, shuffle=True)
test_loader = DataLoader(test, batch_size=128)

In [41]:
model = Net(in_features=X_train.shape[1])
optimizer = Adam(model.parameters())

In [42]:
for bx,by in train_loader:
    print(bx, by)
    break

tensor([[-1.7725, -0.4150, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        [-0.6762, -0.4150, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4202, -0.4150, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-1.7725, -0.4150, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.5166,  1.1941, -0.2268,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.5166,  1.1941, -0.2268,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64) tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1])


In [43]:
fc1 = nn.Linear(in_features=112, out_features=64)

In [44]:
bx = bx.to(torch.float32)

In [45]:
loss_func = nn.BCELoss()

In [46]:
for epoch in range(30):
    total_loss = []
    for bx, by in train_loader:
        bx = bx.to(torch.float32)
        by = by.to(torch.float32).unsqueeze(-1)
        optimizer.zero_grad()
        preds = model(bx)
        loss = loss_func(preds,by)
        total_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    print(f"epoch: {epoch}, loss: {np.sum(total_loss)}")

epoch: 0, loss: 61.19473674893379
epoch: 1, loss: 49.49203509092331
epoch: 2, loss: 48.44885873794556
epoch: 3, loss: 47.6210373044014
epoch: 4, loss: 46.89392413198948
epoch: 5, loss: 46.294735223054886
epoch: 6, loss: 45.62666526436806
epoch: 7, loss: 44.89861769974232
epoch: 8, loss: 44.41703598201275
epoch: 9, loss: 43.66503143310547
epoch: 10, loss: 42.99497717618942
epoch: 11, loss: 42.48907433450222
epoch: 12, loss: 41.81256254017353
epoch: 13, loss: 41.13609582185745
epoch: 14, loss: 40.58562909066677
epoch: 15, loss: 39.85836145281792
epoch: 16, loss: 39.370929017663
epoch: 17, loss: 38.62346576154232
epoch: 18, loss: 37.86156851053238
epoch: 19, loss: 37.35821084678173
epoch: 20, loss: 36.737969756126404
epoch: 21, loss: 36.174087554216385
epoch: 22, loss: 35.33561825752258
epoch: 23, loss: 35.16393660008907
epoch: 24, loss: 34.00403270125389
epoch: 25, loss: 33.49304461479187
epoch: 26, loss: 32.93764427304268
epoch: 27, loss: 32.09045571833849
epoch: 28, loss: 31.5532971024

In [47]:
preds_list =[]
with torch.no_grad():
    for bx, by in test_loader:
            bx = bx.to(torch.float32)
            by = by.to(torch.float32).unsqueeze(-1)
            preds = model(bx).squeeze().numpy()
            preds_list.append(preds)

In [48]:
preds_total = np.concatenate(preds_list)

In [49]:
roc_auc_score(y1_test, preds_total)

0.8226605163355614

### Train Total

In [50]:
scaler = StandardScaler()
one = OneHotEncoder()
num_X = scaler.fit_transform(X.select_dtypes(include="number"))
cat_X = one.fit_transform(X.select_dtypes(include="object")).toarray()
train = np.concatenate([num_X, cat_X], axis=1)

In [51]:
train1 = FluData(train,train_label.h1n1_vaccine.values)
train_loader1 = DataLoader(train1, batch_size=128, shuffle=True)

train2 = FluData(train,train_label.seasonal_vaccine.values)
train_loader2 = DataLoader(train2, batch_size=128, shuffle=True)

In [54]:
dl_model1 = Net(train.shape[1])
dl_model2 = Net(train.shape[1])
optimizer1 = Adam(dl_model1.parameters())
optimizer2 = Adam(dl_model2.parameters())

In [55]:
loss_func1 = nn.BCELoss()
loss_func2 = nn.BCELoss()

In [56]:
for epoch in range(30):
    total_loss = []
    for bx, by in train_loader1:
        bx = bx.to(torch.float32)
        by = by.to(torch.float32).unsqueeze(-1)
        optimizer1.zero_grad()
        preds = dl_model1(bx)
        loss = loss_func1(preds,by)
        total_loss.append(loss.item())
        loss.backward()
        optimizer1.step()
    print(f"epoch: {epoch}, loss: {np.sum(total_loss)}")

epoch: 0, loss: 82.19745688140392
epoch: 1, loss: 73.24158783257008
epoch: 2, loss: 72.13579474389553
epoch: 3, loss: 71.18647213280201
epoch: 4, loss: 70.3893241584301
epoch: 5, loss: 69.32344183325768
epoch: 6, loss: 68.83778615295887
epoch: 7, loss: 68.02555726468563
epoch: 8, loss: 67.24103370308876
epoch: 9, loss: 66.39140017330647
epoch: 10, loss: 65.46930095553398
epoch: 11, loss: 64.6692261248827
epoch: 12, loss: 63.865453854203224
epoch: 13, loss: 63.22361247241497
epoch: 14, loss: 62.415653347969055
epoch: 15, loss: 61.79602658748627
epoch: 16, loss: 60.92969073355198
epoch: 17, loss: 60.22085377573967
epoch: 18, loss: 59.302819430828094
epoch: 19, loss: 58.575257539749146
epoch: 20, loss: 57.656210243701935
epoch: 21, loss: 57.18488320708275
epoch: 22, loss: 56.18044371902943
epoch: 23, loss: 55.578395307064056
epoch: 24, loss: 54.62758024036884
epoch: 25, loss: 53.72224359214306
epoch: 26, loss: 53.041159361600876
epoch: 27, loss: 52.32835765182972
epoch: 28, loss: 51.27681

In [57]:
for epoch in range(30):
    total_loss = []
    for bx, by in train_loader2:
        bx = bx.to(torch.float32)
        by = by.to(torch.float32).unsqueeze(-1)
        optimizer2.zero_grad()
        preds = dl_model2(bx)
        loss = loss_func2(preds,by)
        total_loss.append(loss.item())
        loss.backward()
        optimizer2.step()
    print(f"epoch: {epoch}, loss: {np.sum(total_loss)}")

epoch: 0, loss: 107.59683802723885
epoch: 1, loss: 98.06562685966492
epoch: 2, loss: 96.77404224872589
epoch: 3, loss: 95.57646906375885
epoch: 4, loss: 94.48101562261581
epoch: 5, loss: 93.56750163435936
epoch: 6, loss: 93.08668953180313
epoch: 7, loss: 91.93768462538719
epoch: 8, loss: 90.9887860417366
epoch: 9, loss: 90.1262591779232
epoch: 10, loss: 89.46503844857216
epoch: 11, loss: 88.38554492592812
epoch: 12, loss: 87.92498847842216
epoch: 13, loss: 87.13776084780693
epoch: 14, loss: 86.40534970164299
epoch: 15, loss: 85.62751168012619
epoch: 16, loss: 84.61547628045082
epoch: 17, loss: 84.05246189236641
epoch: 18, loss: 83.53932625055313
epoch: 19, loss: 82.65964731574059
epoch: 20, loss: 81.43663489818573
epoch: 21, loss: 80.82961362600327
epoch: 22, loss: 80.00619423389435
epoch: 23, loss: 79.20747229456902
epoch: 24, loss: 78.5254468023777
epoch: 25, loss: 77.68121254444122
epoch: 26, loss: 76.76557177305222
epoch: 27, loss: 76.16209131479263
epoch: 28, loss: 75.536470234394

In [61]:
test = pd.read_csv("data/test_set_features.csv")
test_ids = test.respondent_id
test = test.drop("respondent_id", axis=1)
num_test = test.select_dtypes(include="number")
cat_test = test.select_dtypes(include="object")
num_test.fillna(0, inplace=True)
cat_test.fillna("NA", inplace=True)

In [62]:
num_X_test = scaler.transform(num_test)
cat_X_test = one.transform(cat_test).toarray()
X_test = np.concatenate([num_X_test, cat_X_test], axis=1)

In [64]:
test_loader = DataLoader(FluData(X_test, np.array(range(len(X_test)))), batch_size=128)

In [65]:
preds_list1 =[]
preds_list2 =[]
with torch.no_grad():
    for bx, by in test_loader:
        bx = bx.to(torch.float32)
        by = by.to(torch.float32).unsqueeze(-1)
        preds1 = dl_model1(bx).squeeze().numpy()
        preds2 = dl_model2(bx).squeeze().numpy()
        preds_list1.append(preds1)
        preds_list2.append(preds2)

In [66]:
train_label.columns

Index(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], dtype='object')

In [67]:
df = pd.DataFrame()
df["respondent_id"] = test_ids
df['h1n1_vaccine'] = np.concatenate(preds_list1)
df['seasonal_vaccine'] = np.concatenate(preds_list2)

In [69]:
df.to_csv("submission.csv", index_label=False, index=False)

In [71]:
lgb1 = LGBMClassifier()
lgb2 = LGBMClassifier()
lgb1.fit(train, train_label.h1n1_vaccine)
lgb2.fit(train, train_label.seasonal_vaccine)

[LightGBM] [Info] Number of positive: 5674, number of negative: 21033
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 276
[LightGBM] [Info] Number of data points in the train set: 26707, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212454 -> initscore=-1.310198
[LightGBM] [Info] Start training from score -1.310198
[LightGBM] [Info] Number of positive: 12435, number of negative: 14272
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 276
[LightGBM] [Info] Number of data points in the train set: 26707, number of used features: 111
[LightGBM] [Info] [

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [73]:
df = pd.DataFrame()
df["respondent_id"] = test_ids
df['h1n1_vaccine'] = lgb1.predict_proba(X_test)[:, 1]
df['seasonal_vaccine'] = lgb2.predict_proba(X_test)[:, 1]
df.to_csv("submission2.csv", index_label=False, index=False)

In [85]:
def ensemble(X, y, test_X):
    lr = LogisticRegression()
    rf = RandomForestClassifier()
    gbt = GradientBoostingClassifier()
    lr.fit(X, y)
    rf.fit(X, y)
    gbt.fit(X, y)
    pred1 = lr.predict_proba(X_test)[:, 1]
    pred2 = gbt.predict_proba(X_test)[:,1]
    pred3 = rf.predict_proba(X_test)[:, 1]
    return np.mean([pred1, pred2, pred3], axis = 0)

In [88]:
h1n1_vaccine = ensemble(train, train_label.h1n1_vaccine, X_test)

In [89]:
seasonal_vaccine = ensemble(train, train_label.seasonal_vaccine, X_test)

In [90]:
df_pred = pd.read_csv("submission.csv")
lgbm_pred = pd.read_csv("submission2.csv")

In [92]:
h1n1_vaccine = np.mean([h1n1_vaccine, df_pred.h1n1_vaccine, lgbm_pred.h1n1_vaccine], axis=0)
seasonal_vaccine = np.mean([seasonal_vaccine, df_pred.seasonal_vaccine, lgbm_pred.seasonal_vaccine], axis=0)

In [93]:
df = pd.DataFrame()
df["respondent_id"] = test_ids
df['h1n1_vaccine'] = h1n1_vaccine
df['seasonal_vaccine'] = seasonal_vaccine
df.to_csv("submission3.csv", index_label=False, index=False)