# Preprocess

In [258]:
import pandas as pd
import zipfile

zip_file = zipfile.ZipFile('titanic.zip')
train_data = zip_file.open('train.csv')
test_data = zip_file.open('test.csv')

train_df = pd.read_csv(train_data)
test_df = pd.read_csv(test_data)

# make apssengerId as index
train_df = train_df.set_index('PassengerId')
test_df = test_df.set_index('PassengerId')

In [259]:
def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)                     
    return df
    
# preprocess
preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

# print(train_df.isnull().sum())  # Cabin has too many Nan values
# drop Cabin column, and you don't need Ticket column
preprocessed_train_df = preprocessed_train_df.drop(columns=['Cabin', 'Ticket', 'Name', 'Ticket_number', 'Ticket_item'])
preprocessed_test_df = preprocessed_test_df.drop(columns=['Cabin', 'Ticket', 'Name', 'Ticket_number', 'Ticket_item'])

preprocessed_train_df.head(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


In [260]:
# age and embarked has Nan values
# fill Nan values with mean value
preprocessed_train_df["Age"] = preprocessed_train_df["Age"].fillna(preprocessed_train_df["Age"].mean())
preprocessed_test_df["Age"] = preprocessed_test_df["Age"].fillna(preprocessed_test_df["Age"].mean())
preprocessed_test_df["Fare"] = preprocessed_test_df["Fare"].fillna(preprocessed_test_df["Fare"].mean())

# fill Nan values with most frequent value
preprocessed_train_df["Embarked"] = preprocessed_train_df["Embarked"].fillna(preprocessed_train_df["Embarked"].mode()[0])
preprocessed_test_df["Embarked"] = preprocessed_test_df["Embarked"].fillna(preprocessed_test_df["Embarked"].mode()[0])

# dummy variables
preprocessed_train_df = pd.get_dummies(preprocessed_train_df)
preprocessed_test_df = pd.get_dummies(preprocessed_test_df)


In [261]:
preprocessed_train_df.head(5)
# delete redundant columns
preprocessed_train_df.drop(columns = ['Sex_male', 'Embarked_Q'])
preprocessed_test_df.drop(columns = ['Sex_male', 'Embarked_Q'])

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_C,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,34.50000,0,0,7.8292,False,False,False
893,3,47.00000,1,0,7.0000,True,False,True
894,2,62.00000,0,0,9.6875,False,False,False
895,3,27.00000,0,0,8.6625,False,False,True
896,3,22.00000,1,1,12.2875,True,False,True
...,...,...,...,...,...,...,...,...
1305,3,30.27259,0,0,8.0500,False,False,True
1306,1,39.00000,0,0,108.9000,True,True,False
1307,3,38.50000,0,0,7.2500,False,False,True
1308,3,30.27259,0,0,8.0500,False,False,True


In [263]:
# normalize
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# split the data into train and test
from sklearn.model_selection import train_test_split

X = preprocessed_train_df.drop(['Survived'], axis=1)
y = preprocessed_train_df['Survived']
X = scaler.fit_transform(X)
X_sub = scaler.transform(preprocessed_test_df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# FULLY CONNECTED

In [265]:
# Use pytorch make a fully connected neural network
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, 2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x
    
net = Net()
print(net)

# define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# convert to tensor
X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()
y_train = torch.squeeze(torch.from_numpy(y_train.to_numpy()).long())
y_test = torch.squeeze(torch.from_numpy(y_test.to_numpy()).long())
X_sub = torch.from_numpy(X_sub).float()

Net(
  (fc1): Linear(in_features=10, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)


In [266]:
# train the network
for epoch in range(1000):
    optimizer.zero_grad()
    outputs = net(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print("Epoch: %d, Loss: %1.5f" % (epoch, loss.item()))


Epoch: 0, Loss: 0.69092


  x = F.softmax(self.fc3(x))


Epoch: 100, Loss: 0.47990
Epoch: 200, Loss: 0.45931
Epoch: 300, Loss: 0.45085
Epoch: 400, Loss: 0.44606
Epoch: 500, Loss: 0.44223
Epoch: 600, Loss: 0.43880
Epoch: 700, Loss: 0.43592
Epoch: 800, Loss: 0.43391
Epoch: 900, Loss: 0.43243


In [267]:
# test the network
outputs = net(X_test)
_, predicted = torch.max(outputs.data, 1)
correct = (predicted == y_test).sum()
print('Accuracy: %.2f %%' % (100 * correct / len(y_test)))


Accuracy: 75.75 %


  x = F.softmax(self.fc3(x))


In [268]:
# predict the test data
outputs = net(X_sub)
_, predicted = torch.max(outputs.data, 1)
predicted = predicted.numpy()
predicted = pd.DataFrame(predicted, index=preprocessed_test_df.index, columns=['Survived'])
predicted.to_csv('fc_nn_submission.csv')

  x = F.softmax(self.fc3(x))


# SVM

In [269]:
from sklearn import svm

# Create a SVM Classifier with a linear kernel
clf = svm.SVC(kernel='linear')

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Evaluate the model
from sklearn import metrics

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7686567164179104


In [270]:
# export solution
y_pred = clf.predict(X_sub)
y_pred = pd.DataFrame(y_pred, index=preprocessed_test_df.index, columns=['Survived'])
y_pred.to_csv('svm_submission.csv')

# RANDOM FOREST

In [271]:
# make a random forest model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7910447761194029


In [274]:
# Same but search for the best params
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
y_pred = grid_search.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 1134 candidates, totalling 5670 fits
{'bootstrap': False, 'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
0.8443225806451613
Accuracy: 0.7723880597014925


: 

In [273]:
# predict the test data
y_pred = grid_search.predict(X_sub)
y_pred = pd.DataFrame(y_pred, index=preprocessed_test_df.index, columns=['Survived'])
y_pred.to_csv('rf_submission.csv')