In [1]:
import pandas as pd 
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import RandomForestClassifier

from collections import Counter, defaultdict
from itertools import combinations 
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing, tree
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import warnings; warnings.simplefilter('ignore')

In [2]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
# Show the raw training frame (the rendering below elides the middle rows).
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
def gender_10(df):
    """Encode the ``Sex`` column of ``df`` as 0/1 integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``Sex`` column.

    Returns
    -------
    list of int
        One entry per row, in row order: ``0`` where ``Sex`` equals
        ``'female'`` and ``1`` for every other value.
    """
    # Iterate over the values themselves rather than looking rows up with
    # ``df['Sex'][i]``: that is a *label* lookup and raises KeyError as soon
    # as the index is not 0..n-1 (e.g. after filtering or splitting).
    return [0 if sex == 'female' else 1 for sex in df['Sex']]

In [5]:
def age_categories(array, bounds=(18, 40, 60)):
    """Bucket ages into ordinal categories.

    Parameters
    ----------
    array : iterable of numbers
        Ages to categorise (list, NumPy array or pandas Series).
    bounds : sequence of numbers, optional
        Ascending, inclusive upper bounds of the categories. The default
        ``(18, 40, 60)`` yields ``0`` for ``age <= 18``, ``1`` for
        ``18 < age <= 40``, ``2`` for ``40 < age <= 60`` and ``3`` otherwise.

    Returns
    -------
    list of int
        Category index for every age, in input order. Values that compare
        below no bound (anything above the last bound, and NaN) get the last
        category, ``len(bounds)``.
    """
    def bucket(age):
        # First bound the age does not exceed; NaN fails every comparison
        # and therefore falls through to the final category.
        return next(
            (idx for idx, upper in enumerate(bounds) if age <= upper),
            len(bounds),
        )

    # Iterating over the values (instead of ``array[i]``) keeps this working
    # for Series whose index is not 0..n-1, where a label lookup would fail.
    return [bucket(age) for age in array]

In [6]:
# Fill the missing fare of passenger 1044 with the mean fare of Pclass 3.
# Only the 'Fare' cell is written: replacing every NaN in the row would also
# overwrite other missing fields of that passenger (e.g. Cabin) with a fare.
missing_fare = (test['PassengerId'] == 1044) & test['Fare'].isna()
# Series.mean() skips NaN, so the row being imputed does not affect the mean.
test.loc[missing_fare, 'Fare'] = test.loc[test['Pclass'] == 3, 'Fare'].mean()

In [7]:
# 0/1 encoding of the Sex column for both splits.
sex_train, sex_test = gender_10(train), gender_10(test)

# Rounded mean age of each split, computed over the non-missing ages.
age_mean_train = np.round(train['Age'].dropna().mean())
age_mean_test = np.round(test['Age'].dropna().mean())

# Fill missing ages with the split's own rounded mean, then bucket them.
age_train = age_categories(train['Age'].replace(np.nan, age_mean_train))
age_test = age_categories(test['Age'].replace(np.nan, age_mean_test))

In [8]:
def build_features(frame, sex, age):
    """Assemble the model feature frame for one data split.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split providing the ``Pclass``, ``Fare``, ``SibSp`` and ``Parch`` columns.
    sex : sequence
        Encoded sex, one entry per row of ``frame``.
    age : sequence
        Age category, one entry per row of ``frame``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``Pclass``, ``Fare``, ``sex``, ``age`` and
        ``relatives`` (``SibSp + Parch``), in that order.
    """
    # ``assign`` returns a new frame, so no column is ever written onto a
    # slice of ``frame`` (which is what triggers SettingWithCopyWarning).
    return frame[['Pclass', 'Fare']].assign(
        sex=sex,
        age=age,
        relatives=frame['SibSp'] + frame['Parch'],
    )


train_feature = build_features(train, sex_train, age_train)
test_feature = build_features(test, sex_test, age_test)

In [9]:
# Show the engineered training features (the rendering below elides the middle rows).
train_feature

Unnamed: 0,Pclass,Fare,sex,age,relatives
0,3,7.2500,1,1,1
1,1,71.2833,0,1,1
2,3,7.9250,0,1,0
3,1,53.1000,0,1,1
4,3,8.0500,1,1,0
...,...,...,...,...,...
886,2,13.0000,1,1,0
887,1,30.0000,0,1,0
888,3,23.4500,0,1,3
889,1,30.0000,1,1,0


In [10]:
# Feature frames as plain NumPy arrays, plus the training labels.
train_set, test_set = (frame.to_numpy() for frame in (train_feature, test_feature))
survival_train = train.loc[:, 'Survived']

In [11]:
# Random Forest: accuracy over 10 cross-validation folds
rf = RandomForestClassifier(random_state=42, max_depth=5, n_estimators=20)
scores = cross_val_score(rf, train_set, survival_train, scoring="accuracy", cv=10)
for label, value in (
    ("Scores:", scores),
    ("Mean:", scores.mean()),
    ("Standard Deviation:", scores.std()),
):
    print(label, value)

Scores: [0.82222222 0.84269663 0.7752809  0.8988764  0.88764045 0.78651685
 0.83146067 0.80898876 0.84269663 0.80898876]
Mean: 0.8305368289637952
Standard Deviation: 0.03770469855267935


In [12]:
# Neural Network: accuracy over 10 cross-validation folds
nn = MLPClassifier(
    hidden_layer_sizes=(20, 15),
    activation='logistic',
    solver='lbfgs',
    max_iter=100,
    verbose=0,
    random_state=1,
)
scores = cross_val_score(nn, train_set, survival_train, scoring="accuracy", cv=10)
for label, value in (
    ("Scores:", scores),
    ("Mean:", scores.mean()),
    ("Standard Deviation:", scores.std()),
):
    print(label, value)

Scores: [0.84444444 0.79775281 0.76404494 0.87640449 0.82022472 0.76404494
 0.80898876 0.78651685 0.83146067 0.82022472]
Mean: 0.8114107365792759
Standard Deviation: 0.0334244430902832


In [13]:
# Logistic Regression (library defaults): accuracy over 10 cross-validation folds
lr = LogisticRegression()
scores = cross_val_score(lr, train_set, survival_train, scoring="accuracy", cv=10)
for label, value in (
    ("Scores:", scores),
    ("Mean:", scores.mean()),
    ("Standard Deviation:", scores.std()),
):
    print(label, value)

Scores: [0.78888889 0.79775281 0.76404494 0.82022472 0.80898876 0.76404494
 0.78651685 0.7752809  0.82022472 0.79775281]
Mean: 0.7923720349563046
Standard Deviation: 0.019567666545699348


In [14]:
# Decision Tree
# Vectorize the feature records; these matrices are what the tree is fitted
# on and applied to further down.
features = train_feature.to_dict('records')
test_features = test_feature.to_dict('records')
vec = DictVectorizer()
features_vectorized = vec.fit_transform(features).toarray()
# Reuse the mapping learned on the training records. Re-fitting on the test
# records could produce a different column layout than the one the tree is
# trained on.
test_vectorized = vec.transform(test_features).toarray()

# Encode the 0/1 survival labels for the tree.
le = preprocessing.LabelEncoder()
le.fit([0, 1])
target = le.transform(train['Survived'])

# Fixed seed so the scores are reproducible, as for the other seeded models.
dt = tree.DecisionTreeClassifier(max_depth=7, random_state=42)
# NOTE(review): cross-validation runs on train_set while the final fit uses
# features_vectorized; presumably the same features in a different column
# order - confirm.
scores = cross_val_score(dt, train_set, survival_train, cv=10, scoring="accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.78888889 0.87640449 0.71910112 0.79775281 0.86516854 0.84269663
 0.83146067 0.79775281 0.87640449 0.84269663]
Mean: 0.823832709113608
Standard Deviation: 0.046428670930584275


In [15]:
# Fit each classifier on the full training data. The decision tree uses the
# vectorized feature matrix and encoded target instead of train_set.
for model in (nn, rf, lr):
    model.fit(train_set, survival_train)
dt.fit(features_vectorized, target);

In [16]:
# Neural-network predictions on the training rows.
pred_train = nn.predict(train_set)

# Test-set predictions, one array per classifier.
rf_array, lr_array, nn_array = (model.predict(test_set) for model in (rf, lr, nn))
dt_array = dt.predict(test_vectorized)

In [17]:
def ensemble(array, threshold=2):
    """Convert per-row vote counts into binary predictions.

    Parameters
    ----------
    array : iterable of numbers
        Number of positive votes each row received (list, NumPy array or
        pandas Series).
    threshold : number, optional
        Minimum number of votes required to predict ``1``. The default of
        ``2`` keeps the original rule.

    Returns
    -------
    list of int
        ``1`` where the vote count is at least ``threshold``, else ``0``,
        in input order.
    """
    # Iterate over the values rather than indexing with ``array[i]``, which
    # is a label lookup on a Series and fails when the index is not 0..n-1.
    return [1 if votes >= threshold else 0 for votes in array]

In [23]:
# Ensemble the predictions of the four classifiers by vote counting
# (threshold = 1/2): a passenger is predicted to survive when at least 2 of
# the 4 models predict survival.
# Work on a copy so the new columns are not assigned onto a slice of `test`.
result = test[['PassengerId']].copy()
# Reuse the prediction arrays computed earlier instead of predicting again.
result['random forest'] = rf_array
result['logistic'] = lr_array
result['decision tree'] = dt_array
result['neural network'] = nn_array
result['Survived'] = ensemble(sum([nn_array, dt_array, lr_array, rf_array]))
result

Unnamed: 0,PassengerId,random forest,logistic,decision tree,neural network,Survived
0,892,0,0,0,0,0
1,893,0,0,0,1,0
2,894,0,0,0,0,0
3,895,0,0,0,0,0
4,896,1,1,0,1,1
...,...,...,...,...,...,...
413,1305,0,0,0,0,0
414,1306,1,1,1,1,1
415,1307,0,0,0,0,0
416,1308,0,0,0,0,0


In [19]:
# Load the saved predictions of our best-performing classifiers: trials 3, 10, 11 and "new".
trial_3, trial_10, trial_11, new = (
    pd.read_csv(path)
    for path in ('trial 3.csv', 'trial 10.csv', 'trial 11.csv', 'new.csv')
)

In [20]:
# Ensemble the saved predictions by vote counting (threshold = 1/2): a
# passenger is predicted to survive when at least 2 of the 4 trials say so.
# Work on a copy so the new columns are not assigned onto a slice of `test`.
result_2 = test[['PassengerId']].copy()
result_2['trial_3'] = trial_3['Survived']
result_2['trial_10'] = trial_10['Survived']
result_2['trial_11'] = trial_11['Survived']
result_2['trial_new'] = new['Survived']
result_2['Survived'] = ensemble(sum([trial_3['Survived'], trial_10['Survived'], trial_11['Survived'], new['Survived']]))
result_2

Unnamed: 0,PassengerId,trial_3,trial_10,trial_11,trial_new,Survived
0,892,0,0,0,0,0
1,893,1,0,0,0,0
2,894,0,0,0,0,0
3,895,0,0,0,0,0
4,896,1,0,1,1,1
...,...,...,...,...,...,...
413,1305,0,0,0,0,0
414,1306,1,1,1,1,1
415,1307,0,0,0,0,0
416,1308,0,0,0,0,0


In [21]:
# Keep only the passenger id and the ensembled prediction for the report.
report_2 = result_2.loc[:, ['PassengerId', 'Survived']]
report_2

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [22]:
# Write the report as a zipped CSV. The CSV stored inside the archive is named
# after the archive itself; previously it was written as 'trial 1.csv' inside
# 'trial 15.zip'.
submission_name = 'trial 15'
compression_opts = dict(method='zip',
                        archive_name=f'{submission_name}.csv')
report_2.to_csv(f'{submission_name}.zip', index=False,
                compression=compression_opts)