In [1]:
import os
os.chdir('C:\\Users\\Takanori\\Desktop\\Kaggle\\titanic\\input')

# DataSet & Library Loading

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable

In [4]:
df_train = pd.read_csv('../input/titanic/train.csv')
df_test  = pd.read_csv('../input/titanic/test.csv')
df_sub   = pd.read_csv('../input/titanic/gender_submission.csv')

In [5]:
class CFG:
    batch_size = 96
    epochs = 2000
    folds = 10
    seed = 42
    target = 'Survived'
    lr = 0.1
    model_path = "models"
    test_pred = ['pred' + str(i) for i in range(folds)]
    pred = 'pred'
    early_stopping = 100
    lr_factor = 0.5
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
os.makedirs(CFG.model_path, exist_ok=True)

In [7]:
df_all = pd.concat([df_train, df_test])

In [8]:
# 不要データを削除
df_all.drop(['Name','Ticket','Cabin','PassengerId'],axis=1,inplace=True)


In [9]:
# NA埋め
df_all.fillna(df_all.mean(), inplace=True)

  df_all.fillna(df_all.mean(), inplace=True)


In [10]:
# ターゲットを01に変換
df_all[CFG.target].fillna(0, inplace=True)
df_all[CFG.target] = df_all[CFG.target].astype(int)

In [11]:
# カテゴリ変数をone_hot_encoding
sex = pd.get_dummies(df_all['Sex'], drop_first=True)
embark = pd.get_dummies(df_all['Embarked'], drop_first=True)
df_all = pd.concat([df_all, sex, embark], axis=1)

In [12]:
# 不要になったカテゴリ変数を削除
df_all.drop(['Embarked', 'Sex'], axis=1, inplace=True)

In [13]:
# 学習の対象とする特徴量を列挙する
all_features = df_all.columns.tolist()
all_features.remove(CFG.target)

In [14]:
df_train = df_all[:len(df_train)]
df_test = df_all[len(df_train):]

# Pytorch

## Define Pytorch Model

In [15]:
#thank you very much https://www.kaggle.com/mburakergenc/ttianic-minimal-pytorch-mlp
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.dropout(x)
        x = self.fc3(x)  # 後続のCrossEntropyLossでSoftMaxを掛けるので、ここでは掛けない
        return x

# Pytorch Loss Function

In [16]:
criterion = nn.BCEWithLogitsLoss()

In [17]:
# TODO GPUで動くようにする

# Pytorch Training

In [18]:
from torch.optim import lr_scheduler

In [19]:
class EarlyStopping:
    # 指定した回数、lossが改善されていなければ打ち止めする
    def __init__(self, patience=20):
        self.partince = patience
        self.bef_epoch = 0
        self.min_loss = float('inf')

    def step(self, epoch, loss):
        if self.min_loss > loss:
            self.min_loss = loss
            self.bef_epoch = epoch
        if epoch - self.bef_epoch > self.partince:
            return True
        else:
            return False
early_stopping = EarlyStopping(CFG.early_stopping)

In [21]:
batch_size = CFG.batch_size
batch_no = len(df_train) // batch_size

skf = StratifiedKFold(n_splits=CFG.folds, shuffle=True, random_state=CFG.seed)

for fold, (train_index, valid_index) in enumerate(skf.split(df_train, df_train[CFG.target])):
    print('-----', fold, '-----')
    # fold毎に初期化する設定
    # model, optimizer, scheduler, その他変数
    model = Net(len(all_features), 512, 1).to(CFG.device)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=CFG.lr)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True, factor=CFG.lr_factor, min_lr=1e-6, patience=10)

    train_loss = 0
    train_loss_min = np.Inf

    # データを分割する。fold毎に一回だけやる
    X_train = df_train[all_features].iloc[train_index]
    y_train = df_train.iloc[train_index][CFG.target]
    X_valid = df_train[all_features].iloc[valid_index]
    y_valid = df_train.iloc[valid_index][CFG.target]
    X_test = df_test[all_features]

    for epoch in range(CFG.epochs):
        # TODO Variableを使うのは古いらしい
        x0_var = Variable(torch.FloatTensor(X_train.values)).to(CFG.device)
        y0_var = Variable(torch.FloatTensor(y_train.values)).to(CFG.device)
        for i in range(batch_no):
            # ミニバッチ学習
            start = i * batch_size
            end   = start + batch_size
            x_var = x0_var[start:end]
            y_var = y0_var[start:end]
            if len(x_var) == 1:
                raise "len(x_var) == 1, cant use bn layer. please change batch_size"

            optimizer.zero_grad()
            output = model(x_var).squeeze(1)
            loss   = criterion(output, y_var)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()*batch_size

        train_loss = train_loss / len(X_train)
        if train_loss <= train_loss_min:
            print("Validation loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min,train_loss))
            torch.save(model.state_dict(), CFG.model_path + "/model" + str(fold) + ".pt")
            train_loss_min = train_loss

        # lrを引き下げる
        # 暫くlossが変わってなければ、EarlyStoppingする。
        scheduler.step(train_loss)
        if early_stopping.step(epoch, train_loss):
            break

        # log
        if epoch % 200 == 0:
            print('')
            print("Epoch: {} \tTrain Loss: {}".format(epoch+1, train_loss))

    print('STOP train fold=', fold)
    x0_var = Variable(torch.FloatTensor(X_valid.values)).to(CFG.device)
    pred = torch.sigmoid(model(x0_var))
    df_train.loc[valid_index, CFG.pred] = pred.data.cpu().numpy()
    auc = roc_auc_score(df_train.loc[valid_index, CFG.target], df_train.loc[valid_index, CFG.pred])
    print('fold', fold, 'AUC:', auc)

auc = roc_auc_score(df_train[CFG.target], df_train[CFG.pred])
print('-----', 'Training Ended!', '-----')
print('AUC', auc)

----- 0 -----
Net(
  (fc1): Linear(in_features=8, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Validation loss decreased (   inf ===> 4.925098). Saving the model...

Epoch: 1 	Train Loss: 4.925097833411971
Validation loss decreased (4.925098 ===> 1.360217). Saving the model...
Validation loss decreased (1.360217 ===> 0.885273). Saving the model...
Validation loss decreased (0.885273 ===> 0.820040). Saving the model...
Validation loss decreased (0.820040 ===> 0.702759). Saving the model...
Validation loss decreased (0.702759 ===> 0.650033). Saving the model...
Validation loss decreased (0.650033 ===> 0.611978). Saving the model...
Validation loss decreased (0.61

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.loc[valid_index, CFG.pred] = pred.data.cpu().numpy()


Validation loss decreased (1.437813 ===> 0.984470). Saving the model...
Validation loss decreased (0.984470 ===> 0.861342). Saving the model...
Validation loss decreased (0.861342 ===> 0.796001). Saving the model...
Validation loss decreased (0.796001 ===> 0.623915). Saving the model...
Validation loss decreased (0.623915 ===> 0.565800). Saving the model...
Validation loss decreased (0.565800 ===> 0.559829). Saving the model...
Validation loss decreased (0.559829 ===> 0.516211). Saving the model...
Validation loss decreased (0.516211 ===> 0.503796). Saving the model...
Validation loss decreased (0.503796 ===> 0.503445). Saving the model...
Validation loss decreased (0.503445 ===> 0.488926). Saving the model...
Validation loss decreased (0.488926 ===> 0.477426). Saving the model...
Validation loss decreased (0.477426 ===> 0.457263). Saving the model...
Validation loss decreased (0.457263 ===> 0.446923). Saving the model...
Epoch 00041: reducing learning rate of group 0 to 5.0000e-02.
Va

# predictions

In [22]:
import glob

In [25]:
models = np.sort(glob.glob(f"./{CFG.model_path}/*.pt"))
print(models)
# fold別に作った10個のモデルをロードする
with torch.no_grad():
    for i, model_name in enumerate(models):
        model = Net(len(all_features), 512, 1).to(CFG.device)
        model.load_state_dict(torch.load(model_name,))
        X_test = df_test[all_features]
        x0_var = Variable(torch.FloatTensor(X_test.values)).to(CFG.device)
        pred = F.sigmoid(model(x0_var))
        df_test[CFG.test_pred[i]] = pred.data.cpu().numpy()

df_test.head()


['./models\\model0.pt' './models\\model1.pt' './models\\model2.pt'
 './models\\model3.pt' './models\\model4.pt' './models\\model5.pt'
 './models\\model6.pt' './models\\model7.pt' './models\\model8.pt'
 './models\\model9.pt']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[CFG.test_pred[i]] = pred.data.cpu().numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[CFG.test_pred[i]] = pred.data.cpu().numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[CFG.test_pred[i]] = pred.data.cpu().numpy()
A value is trying to be set on a copy of a slice 

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S,pred0,pred1,pred2,pred3,pred4,pred5,pred6,pred7,pred8,pred9
0,0,3,34.5,0,0,7.8292,1,1,0,0.105576,0.091893,0.070121,0.079433,0.19446,0.090482,0.091987,0.046344,0.09854,0.061862
1,0,3,47.0,1,0,7.0,0,0,1,0.283934,0.283783,0.262539,0.335459,0.209208,0.093508,0.089988,0.097999,0.214368,0.3186
2,0,2,62.0,0,0,9.6875,1,1,0,0.180792,0.015352,0.041926,0.086274,0.098147,0.014793,0.036599,0.018911,0.148233,0.081029
3,0,3,27.0,0,0,8.6625,1,0,1,0.06857,0.115416,0.056804,0.160763,0.082783,0.109066,0.097097,0.08643,0.064767,0.076494
4,0,3,22.0,1,1,12.2875,0,0,1,0.214496,0.325254,0.36237,0.065092,0.264526,0.309451,0.177329,0.370248,0.409841,0.743845


In [26]:
df_sub[CFG.target] = df_test[CFG.test_pred].mean(axis=1)

In [27]:
df_sub[CFG.target] = (df_sub[CFG.target] > 0.5).astype(int)

In [28]:
df_sub.to_csv('submission.csv', index=False)