In [None]:
!nvidia-smi

# Titanic - Machine Learning from Disaster
Start here! Predict survival on the Titanic and get familiar with ML basics  
kaggle: https://www.kaggle.com/c/titanic

## 作法說明

### 步驟

1. 連接雲端硬碟、載入套件
2. EDA、預處理
3. 標準化、切割訓練/測試/驗證集
4. 單一模型
 * Logistic
 * KNN
 * SVM
 * Decision Tree
5. 神經網路模型 (PyTorch)
6. 集成模型
 * Voting
 * Stacking

## 連接雲端硬碟、載入套件

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/ColabData/Titanic
#!unzip 'titanic.zip'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ColabData/Titanic


In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
from scipy.stats import norm
import math
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.autograd import Variable
import time
import copy

Random_seed = 57
np.random.seed(Random_seed)
random.seed(Random_seed)
%matplotlib inline

## EDA、預處理
 * 讀檔
 * 特徵工程
 * EDA
 * 處理遺漏值

### 讀檔

In [None]:
df_tnc_train = pd.read_csv('train.csv')
df_tnc_test = pd.read_csv('test.csv')
df_tnc = pd.concat([df_tnc_train, df_tnc_test])

In [None]:
# 資料大小、欄位資訊

print('train data:', df_tnc_train.shape, ', test data:', df_tnc_test.shape)
df_tnc_train.info()

### 特徵工程(1)
 * 把性別變成數值
 * 製造特徵: 姓名長度(單字數)

In [None]:
# 把性別變成數值

df_tnc['Male'] = df_tnc['Sex'].apply(lambda x: 1 if x == 'male' else 0)

In [None]:
# 製造特徵: 姓名長度(單字數)

df_tnc['Name_len'] = df_tnc['Name'].apply(lambda x: len(x.split()))

In [None]:
# 切割出訓練資料

df_tnc_train = df_tnc[df_tnc['Survived'].isna() == False]

### EDA
 * 存活結果分布
 * 相關係數熱圖
 * 遺漏值情形
 * 數值欄位分布

In [None]:
# 存活結果分布

plt.figure(figsize = (18, 6))
plt.subplot(1, 3, 1)
plt.pie(df_tnc_train['Survived'].value_counts().sort_values(),
    autopct = "%1.2f%%", startangle=90, colors=['lightcoral','skyblue']) 
plt.title('All')
plt.legend(['Died', 'Suvived'])

plt.subplot(1, 3, 2)
plt.pie(df_tnc_train[df_tnc_train['Sex'] == 'male']['Survived'].value_counts().sort_values(),
    autopct = "%1.2f%%", startangle=90, colors=['lightcoral','skyblue']) 
plt.title('Male')
plt.legend(['Died', 'Suvived'])

plt.subplot(1, 3, 3)
plt.pie(df_tnc_train[df_tnc_train['Sex'] == 'female']['Survived'].value_counts().sort_values(),
    autopct = "%1.2f%%", startangle=90, colors=['lightcoral','skyblue']) 
plt.title('Female')
plt.legend(['Died', 'Suvived'])
plt.show()

plt.figure(figsize = (10, 7))
sns.violinplot(x = 'Sex', y = 'Age', hue = 'Survived', data = df_tnc_train, split = True)
plt.show()

In [None]:
# 相關係數熱圖

f, ax = plt.subplots(figsize=(20, 3))
mat = df_tnc_train.corr('pearson')
sns.heatmap(mat.sort_values(by=['Survived'], ascending=False).head(1), cmap='coolwarm', linewidths=.5)

In [None]:
# 遺漏值情形

df_na = DataFrame(df_tnc.isna().mean().sort_values(ascending=False)).reset_index()
df_na.columns = ['var', 'missing_values']
df_na = df_na[df_na['missing_values'] > 0]

fig,ax=plt.subplots(figsize=(10, 6))
sns.barplot(x='missing_values',y='var',data=df_na)

In [None]:
# 數值欄位分布

df_num_var = df_tnc_train.iloc[:, :-4].select_dtypes(['int64','float64'])
sns.pairplot(df_num_var, kind="scatter", diag_kind="hist", hue='Survived',
             plot_kws=dict(s=50,edgecolor="w",color="g",alpha=.2))
plt.show()

### 處理遺漏值

In [None]:
# 遺漏值處理

## 移除遺漏值太多的欄位
df_tnc = df_tnc.drop(columns=['Cabin'], axis=1)

## 類別欄位補 none
df_tnc['Embarked'] = df_tnc['Embarked'].fillna('none')

## Fare補 df_train平均數
df_tnc['Fare'] = df_tnc['Fare'].fillna(df_tnc_train['Fare'].mean())

## 年齡欄位補 none
df_tnc['Age'] = df_tnc['Age'].fillna('none')

### 特徵工程(2)
 * 把年齡分成組別
 * One-Hot Encoding
 * 製造特徵: 姓名稱謂字


In [None]:
# 把年齡分成組別

def to_age_group(x):  
  if x == 'none':
    return x
  elif x < 5:
    group = '0-4'
    return group
  elif (x >= 5) & (x < 15):
    group = '5-14'
    return group
  elif (x >= 15) & (x < 20):
    group = '15-19'
    return group
  elif (x >= 20) & (x < 30):
    group = '20-29'
    return group
  elif (x >= 30) & (x < 40):
    group = '30-39'
    return group
  elif (x >= 40) & (x < 50):
    group = '40-49'
    return group
  elif (x >= 50) & (x < 60):
    group = '50-59'
    return group
  elif x >= 60:
    group = 'over60'
    return group

 
df_tnc['Age_group'] = df_tnc['Age'].apply(lambda x: to_age_group(x))

In [None]:
# One-Hot Encoding

cate_var = ['Age_group', 'Pclass', 'Embarked']
df_tnc = pd.get_dummies(df_tnc, columns=cate_var , prefix=cate_var)

In [None]:
# 製造特徵: 姓名稱謂字

for i in ['Mr.', 'Mrs.', 'Miss.', 'Master.'] :
  df_tnc[i] = df_tnc['Name'].apply(lambda x : 1 if i in x else 0)
  #result = df_tnc[df_tnc[i] == 1]['Survived']
  #print(i, result.mean(), result.count())

In [None]:
# 刪除多餘欄位、儲存處理後資料

df_tnc = df_tnc.drop(columns=['Name', 'Sex', 'Ticket', 'Age'], axis=1)
df_tnc.to_csv('tnc_data.csv', encoding = 'utf-8-sig', index=0)

## 讀檔、標準化、切割訓練/測試/驗證集

In [3]:
# 標準化、切割訓練/測試/驗證集

df_tnc = pd.read_csv('tnc_data.csv')

df_train_valid = df_tnc[df_tnc['Survived'] >= 0]
df_train, df_valid = train_test_split(df_train_valid, test_size=0.15, random_state=Random_seed)
df_test = df_tnc[df_tnc['Survived'].isna()].drop(columns=['Survived'], axis=1)

sc = StandardScaler()
sc.fit(df_train.drop(columns=['PassengerId', 'Survived']))

X_train_all = sc.transform(df_train_valid.drop(columns=['PassengerId', 'Survived'], axis=1)) 
y_train_all = df_train_valid['Survived'].values
X_train = sc.transform(df_train.drop(columns=['PassengerId', 'Survived'], axis=1)) 
y_train = df_train['Survived'].values
X_valid = sc.transform(df_valid.drop(columns=['PassengerId', 'Survived'], axis=1)) 
y_valid = df_valid['Survived'].values
X_test = sc.transform(df_test.drop(columns=['PassengerId'], axis=1))

In [4]:
#　各模型的最佳參數與 score

models = {}
preds = {}
scores = {}

## 單一模型 

 * Logistic
 * KNN
 * SVM
 * Decision Tree
 * Random Forest 

### Logistic

In [None]:
# Logistic

Log_reg = LogisticRegression(fit_intercept = True, random_state = Random_seed)
Log_reg.fit(X_train, y_train)
models["Log"] = Log_reg 

In [None]:
# 預測及存檔

y_pred = Log_reg.predict(X_valid).astype(int)
acc_valid = accuracy_score(y_valid, y_pred)
scores["Log"] = acc_valid
print('Valid Acc:', acc_valid)

Log_reg.fit(X_train_all, y_train_all)

pred_Log = Log_reg.predict(X_test).astype(int)
preds['Log'] = pred_Log
df_test['Survived'] = pred_Log
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/LogReg.csv', index = 0)

Valid Acc: 0.7835820895522388


### KNN

In [None]:
# KNN 

param_grid = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
        'weights': ['distance', 'uniform'],
        'metric': ['euclidean', 'minkowski']
        }
KNN_grid = GridSearchCV(KNeighborsClassifier(), param_grid, scoring= 'accuracy')
KNN_grid.fit(X_train, y_train)
KNN = KNeighborsClassifier(**KNN_grid.best_params_)
KNN.fit(X_train, y_train)

models["KNN"] = KNN 
print("best estimator: {}".format(KNN_grid.best_params_))

best estimator: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}


In [None]:
# 預測及存檔

y_pred = KNN.predict(X_valid).astype(int)
acc_valid = accuracy_score(y_valid, y_pred)
scores["KNN"] = acc_valid
print('Valid Acc:', acc_valid)

KNN.fit(X_train_all, y_train_all)

pred_KNN = KNN.predict(X_test).astype(int)
df_test['Survived'] = pred_KNN
preds['KNN'] = pred_KNN
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/KNN.csv', index = 0)

Valid Acc: 0.8059701492537313


### SVM

In [None]:
# SVM

param_grid = {'C': [0.1, 0.2, 0.5, 0.8, 1.0, 1.2, 1.5],
        'kernel': ['linear', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
        'random_state': [Random_seed],
        'probability': [True]
        }
SVM_grid = GridSearchCV(SVC(), param_grid, scoring= 'accuracy')
SVM_grid.fit(X_train, y_train)
SVM = SVC(**SVM_grid.best_params_)
SVM.fit(X_train, y_train)


models["SVM"] = SVM 
print("best estimator: {}".format(SVM_grid.best_params_))

best estimator: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'probability': True, 'random_state': 57}


In [None]:
# 預測及存檔

y_pred = SVM.predict(X_valid).astype(int)
acc_valid = accuracy_score(y_valid, y_pred)
scores["SVM"] = acc_valid
print('Valid Acc:', acc_valid)

SVM.fit(X_train_all, y_train_all)

pred_SVM = SVM.predict(X_test).astype(int)
df_test['Survived'] = pred_SVM
preds['SVM'] = pred_SVM
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/SVM.csv', index = 0)

Valid Acc: 0.8059701492537313


### Decision Tree

In [None]:
# Decision Tree

param_grid = {'criterion': ['gini', 'entropy'], 
        'splitter': ['best'],
        'max_depth': [2, 3, 4, 5, 6],
        'min_samples_leaf': [4, 5, 6, 8],
        'random_state': [Random_seed],
        }
DTree_grid = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring= 'accuracy')
DTree_grid.fit(X_train, y_train)
DTree = DecisionTreeClassifier(**DTree_grid.best_params_)
DTree.fit(X_train, y_train) 

models["DTree"] = DTree 
print("best estimator: {}".format(DTree_grid.best_params_))

best estimator: {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 4, 'random_state': 57, 'splitter': 'best'}


In [None]:
# 預測及存檔

y_pred = DTree.predict(X_valid).astype(int)
acc_valid = accuracy_score(y_valid, y_pred)
scores["DTree"] = acc_valid
print('Valid Acc:', acc_valid)

DTree.fit(X_train_all, y_train_all)

pred_DTree = DTree.predict(X_test).astype(int)
df_test['Survived'] = pred_DTree
preds['DTree'] = pred_DTree
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/DTree.csv', index = 0)

Valid Acc: 0.7985074626865671


### Random Forest

In [None]:
# Random Forest

param_grid = {'n_estimators': [10, 30, 50, 80, 100, 120, 150],
        'criterion': ['gini', 'entropy'], 
        'max_depth': [2, 3, 4, 5, 6],
        'min_samples_leaf': [4, 5, 6, 8],
        'random_state': [Random_seed],
        }
RF_grid = GridSearchCV(RandomForestClassifier(), param_grid, scoring= 'accuracy')
RF_grid.fit(X_train, y_train)
RF = RandomForestClassifier(**RF_grid.best_params_)
RF.fit(X_train, y_train) 

models["RF"] = RF 
print("best estimator: {}".format(RF_grid.best_params_))

best estimator: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 10, 'random_state': 57}


In [None]:
# 預測及存檔

y_pred = RF.predict(X_valid).astype(int)
acc_valid = accuracy_score(y_valid, y_pred)
scores["RF"] = acc_valid
print('Valid Acc:', acc_valid)

RF.fit(X_train_all, y_train_all)

pred_RF = RF.predict(X_test).astype(int)
df_test['Survived'] = pred_RF
preds['RF'] = pred_RF
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/RF.csv', index = 0)

Valid Acc: 0.7910447761194029


## 集成模型

 * Voting
 * Stacking

### Voting
每個模型1票

In [None]:
# Votint(same weight)

model_list = [('Log', models['Log']), ('KNN', models['KNN']), ('SVM', models['SVM']), ('DTree', models['DTree']), ('RF', models['RF'])]

param_grid = {'estimators': [model_list], 
        'voting': ['hard', 'soft']}

Voting_grid = GridSearchCV(VotingClassifier(model_list), param_grid, scoring= 'accuracy')
Voting_grid.fit(X_train, y_train)
Voting_model = VotingClassifier(**Voting_grid.best_params_)
Voting = Voting_model.fit(X_train, y_train)
models["Voting"] = Voting 
print("best estimator: {}".format(Voting_grid.best_params_))

best estimator: {'estimators': [('Log', LogisticRegression(random_state=57)), ('KNN', KNeighborsClassifier(metric='euclidean', n_neighbors=7)), ('SVM', SVC(C=0.1, kernel='linear', probability=True, random_state=57)), ('DTree', DecisionTreeClassifier(max_depth=4, min_samples_leaf=4, random_state=57)), ('RF', RandomForestClassifier(max_depth=5, min_samples_leaf=5, n_estimators=10,
                       random_state=57))], 'voting': 'soft'}


In [None]:
# 預測及存檔

y_pred = Voting.predict(X_valid).astype(int)
acc_valid = accuracy_score(y_valid, y_pred)
scores["Vote"] = acc_valid
print('Valid Acc:', acc_valid)

Voting.fit(X_train_all, y_train_all)

pred_Voting = Voting.predict(X_test).astype(int)
df_test['Survived'] = pred_Voting
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/Voting.csv', index = 0)

Valid Acc: 0.8134328358208955


### Stacking

In [None]:
# Stacking

model_list = [('Log', models['Log']), ('KNN', models['KNN']), ('SVM', models['SVM']), ('DTree', models['DTree']), ('RF', models['RF'])]

param_grid = {'estimators': [model_list], 
        'final_estimator': [LogisticRegression(fit_intercept = True, random_state = Random_seed),
                   SVC(random_state = Random_seed)
                   ]}

Stacking_grid = GridSearchCV(StackingClassifier(model_list), param_grid, scoring= 'accuracy')
Stacking_grid.fit(X_train, y_train)
Stacking_model = StackingClassifier(**Stacking_grid.best_params_)
Stacking = Stacking_model.fit(X_train, y_train)
models["Stacking"] = Stacking 
print("best estimator: {}".format(Stacking_grid.best_params_))

best estimator: {'estimators': [('Log', LogisticRegression(random_state=57)), ('KNN', KNeighborsClassifier(metric='euclidean', n_neighbors=7)), ('SVM', SVC(C=0.1, kernel='linear', probability=True, random_state=57)), ('DTree', DecisionTreeClassifier(max_depth=4, min_samples_leaf=4, random_state=57)), ('RF', RandomForestClassifier(max_depth=5, min_samples_leaf=5, n_estimators=10,
                       random_state=57))], 'final_estimator': LogisticRegression(random_state=57)}


In [None]:
# 預測及存檔

y_pred = Stacking.predict(X_valid).astype(int)
acc_valid = accuracy_score(y_valid, y_pred)
scores["Stacking"] = acc_valid
print('Valid Acc:', acc_valid)

Stacking.fit(X_train_all, y_train_all)

pred_Stacking = Stacking.predict(X_test).astype(int)
df_test['Survived'] = pred_Stacking
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/Stacking.csv', index = 0)

Valid Acc: 0.8134328358208955


## Pytorch

### Neural Network Model

In [5]:
# Model

class NeuralNet(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_dim, 1200),
            nn.Dropout(p=0.1),
            nn.ReLU(),
            nn.Linear(1200, 240),
            nn.ReLU(),
            nn.Linear(240, 2)
        )

    def forward(self, x):
        net_x = self.net(x)
        net_x = torch.sigmoid(net_x)
        #print(net_x)
        return net_x

### Hyper-parameters

In [6]:
# Hyper-parameters

learning_rate = 0.0001
momentum = 0.95
weight_decay = 0.000001
batch_size = 60
n_jobs = 4
n_epochs = 1000                
patience = 200               
save_path = 'models/NN.pth' 
#device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()

### Data
 * Dataset
 * DataLoader


In [7]:
# Dataset 

class DataSet(Dataset):
  def __init__(self, X, Y):

    self.X = X
    self.Y = Y
    self.length = len(X)

  def __len__(self):

    return self.length

  def __getitem__(self, index):

    x = self.X[index]
    y = self.Y[index]

    return x, y

In [8]:
# DataLoader

trainset = DataSet(X_train, y_train)
trainallset = DataSet(X_train_all, y_train_all)

trainloader = DataLoader(trainset, batch_size = batch_size, num_workers = n_jobs, shuffle = True)
trainallloader = DataLoader(trainallset, batch_size = batch_size, num_workers = n_jobs, shuffle = True)

### Train & Predict

In [9]:
# Train

model = NeuralNet(X_train.shape[1])

#optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
#optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, momentum = momentum, weight_decay = weight_decay)

best_acc = 0
best_epochs = n_epochs
early_stop = False
early_stop_cnt = 0
since = time.time()
for epoch in range(n_epochs):
    
  model.train()    
  for data in trainloader:

      x, y = data[0].float(), data[1].long()

      optimizer.zero_grad()
      output = model(x)
      _, pred = torch.max(output, 1)
      #print(pred, y.squeeze(0))
      
      loss = criterion(output, y)
      loss.backward()
      optimizer.step()
      
  train_acc = accuracy_score(y, pred) 


  model.eval()
  with torch.no_grad():
    out = model(torch.tensor(X_valid).float())
    _, pred = torch.max(out, 1)

  valid_acc = accuracy_score(torch.tensor(y_valid), pred)
  
  print('Epoch {}/{}'.format(epoch+ 1, n_epochs))
  print('Train Acc: {:.8f}, Valid Acc: {:.8f}'.format(train_acc, valid_acc))
    
  if valid_acc > best_acc:
    best_acc = valid_acc
    best_epochs = epoch           
    torch.save(model.state_dict(), save_path)

    print('Saving The Model') 
    early_stop_cnt = 0
  else:
    print("Counter {} of {}".format(early_stop_cnt+ 1, patience))
    early_stop_cnt += 1
     
  print('-' * 10)

  if early_stop_cnt >= patience:
    print("Early stop!")
    early_stop = True

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best valid Acc: {:.8f}, epoch {}'.format(best_acc, best_epochs))
    break

if early_stop == False:

  time_elapsed = time.time() - since
  print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
  print('Best valid Acc: {:.8f}'.format(best_acc))

Epoch 1/1000
Train Acc: 0.40540541, Valid Acc: 0.29850746
Saving The Model
----------
Epoch 2/1000
Train Acc: 0.29729730, Valid Acc: 0.29850746
Counter 1 of 200
----------
Epoch 3/1000
Train Acc: 0.24324324, Valid Acc: 0.29850746
Counter 2 of 200
----------
Epoch 4/1000
Train Acc: 0.45945946, Valid Acc: 0.27611940
Counter 3 of 200
----------
Epoch 5/1000
Train Acc: 0.35135135, Valid Acc: 0.31343284
Saving The Model
----------
Epoch 6/1000
Train Acc: 0.29729730, Valid Acc: 0.33582090
Saving The Model
----------
Epoch 7/1000
Train Acc: 0.27027027, Valid Acc: 0.38059701
Saving The Model
----------
Epoch 8/1000
Train Acc: 0.35135135, Valid Acc: 0.39552239
Saving The Model
----------
Epoch 9/1000
Train Acc: 0.45945946, Valid Acc: 0.40298507
Saving The Model
----------
Epoch 10/1000
Train Acc: 0.48648649, Valid Acc: 0.43283582
Saving The Model
----------
Epoch 11/1000
Train Acc: 0.56756757, Valid Acc: 0.50746269
Saving The Model
----------
Epoch 12/1000
Train Acc: 0.48648649, Valid Acc: 0.57

In [10]:
# Valid acc

model = NeuralNet(X_train.shape[1])
model.load_state_dict(torch.load("models/NN.pth"))

model.eval()
with torch.no_grad():
  out = model(torch.tensor(X_valid).float())
  _, pred = torch.max(out, 1)

valid_acc = accuracy_score(torch.tensor(y_valid), pred)
print('Valid_Acc :', valid_acc)

Valid_Acc : 0.8134328358208955


In [11]:
# Train with all data

model = NeuralNet(X_train.shape[1])
model.load_state_dict(torch.load("models/NN.pth"))

#optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
#optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, momentum = momentum, weight_decay = weight_decay)

since = time.time()
for epoch in range(best_epochs):
    
  model.train()    
  for data in trainallloader:

      x, y = data[0].float(), data[1].long()

      optimizer.zero_grad()
      output = model(x)
      _, pred = torch.max(output, 1)
      #print(pred, y.squeeze(0))
      
      loss = criterion(output, y)
      loss.backward()
      optimizer.step()

torch.save(model.state_dict(), "models/NN_all.pth")
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

Training complete in 0m 34s


In [12]:
# Predict

model = NeuralNet(X_train.shape[1])
#model.load_state_dict(torch.load("models/NN.pth"))
model.load_state_dict(torch.load("models/NN_all.pth"))

model.eval()
with torch.no_grad():
  out = model(torch.tensor(X_test).float())
  _, pred = torch.max(out, 1)

df_test['Survived'] = pred.squeeze().detach().numpy().astype(int)
#df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/Torch_AdamW.csv', index = 0)
#df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/Torch_Adam.csv', index = 0)
df_test.loc[:, ['PassengerId', 'Survived']].to_csv('./output/Torch_SGD.csv', index = 0)

In [13]:
# Voting

df_AdamW = pd.read_csv('./output/Torch_AdamW.csv')
df_Adam = pd.read_csv('./output/Torch_Adam.csv')
df_SGD = pd.read_csv('./output/Torch_SGD.csv')
df_Torch = df_AdamW
df_Torch['Survived1'] = df_AdamW['Survived']
df_Torch['Survived2'] = df_Adam['Survived']
df_Torch['Survived3'] = df_SGD['Survived']

df_Torch['Survived1'] = df_Torch.loc[:, ['Survived1', 'Survived2', 'Survived3']].mode(axis = 1)
df_Torch.loc[:, ['PassengerId', 'Survived']].to_csv('./output/Torch_Vote.csv', index = 0)