In [683]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [684]:
train_dataset = pd.read_csv('./train.csv', index_col = 'PassengerId')
train_dataset.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [685]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [686]:
women = train_dataset.loc[train_dataset.Sex == 'female']["Survived"]
sum(women)/len(women)

0.7420382165605095

In [687]:
men = train_dataset.loc[train_dataset.Sex == 'male']["Survived"]
sum(men)/len(men)

0.18890814558058924

In [688]:
X = train_dataset.drop(labels = ['Survived', 'Name', 'Cabin'], axis = 1)
y = train_dataset.Survived

In [689]:
index_drop = X[X.Embarked.isnull()].index

In [690]:
X.drop(index = index_drop, inplace=True)
y.drop(index = index_drop, inplace=True)

In [691]:
X.Age = X.Age.fillna(X.Age.mean())

In [692]:
X.Sex = LabelEncoder().fit_transform(X.Sex)
X.Embarked = LabelEncoder().fit_transform(X.Embarked)
X.Ticket = LabelEncoder().fit_transform(X.Ticket)
X.Age = X.Age.astype(int)

In [693]:
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [694]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [695]:
rf_clf = RandomForestClassifier()

param = {
    'n_estimators': range(100,140, 9),
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 21, 5),
    'min_samples_split': range(2, 10, 2),
    'min_samples_leaf': range(2, 10, 2)
}

random_search = RandomizedSearchCV(rf_clf, param, cv = 5)
random_search.fit(X_train, y_train)

In [696]:
rf_clf = random_search.best_estimator_
np.average(cross_val_score(rf_clf, X_test, y_test, cv= 5))

0.789937106918239

In [697]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [698]:
np.average(cross_val_score(log_reg, X_test, y_test, cv= 5))

0.8048916841369673

In [699]:
svc = SVC()
param = {
    'C': range(1,11),
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': range(3, 15, 2)
}

random_search = RandomizedSearchCV(svc, param, cv = 5)
random_search.fit(X_train, y_train)

In [700]:
svc = random_search.best_estimator_
np.average(cross_val_score(svc, X_test, y_test, cv= 5))

0.7560447239692522

In [701]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [702]:
np.average(cross_val_score(gnb, X_test, y_test, cv= 5))

0.7450733752620545

In [703]:
voting_clf = VotingClassifier([
    ('rf_clf', rf_clf), ('log_reg', log_reg), ('svc', svc), ('gnb', gnb)
])
voting_clf.fit(X_train, y_train)

In [704]:
np.average(cross_val_score(voting_clf, X_test, y_test, cv= 5))

0.7862334032145352

In [705]:
test_dataset = pd.read_csv('./test.csv')
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [706]:
passenger_id = test_dataset.PassengerId

test_dataset.drop(labels = ['PassengerId','Name', 'Cabin'], axis = 1, inplace = True)

test_dataset.Age = test_dataset.Age.fillna(test_dataset.Age.mean())
test_dataset.Fare = test_dataset.Fare.fillna(test_dataset.Fare.mean())
test_dataset.Sex = LabelEncoder().fit_transform(test_dataset.Sex)
test_dataset.Embarked = LabelEncoder().fit_transform(test_dataset.Embarked)
test_dataset.Ticket = LabelEncoder().fit_transform(test_dataset.Ticket)
test_dataset.Age = test_dataset.Age.astype(int)

scaler = StandardScaler().fit(test_dataset)
test_dataset = scaler.transform(test_dataset)

In [707]:
output = pd.DataFrame({'PassengerId': passenger_id, 'Survived': voting_clf.predict(test_dataset)})
output.to_csv('submission.csv', index = False)