In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

In [4]:
train.shape

(891, 12)

In [5]:
test.shape

(418, 11)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [8]:
train_test_data = [train, test]
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)

In [9]:
train['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Sir           1
Capt          1
Lady          1
Ms            1
Don           1
Jonkheer      1
Mme           1
Countess      1
Name: Title, dtype: int64

In [10]:
test['Title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: Title, dtype: int64

In [11]:
title_mapping = {'Mr': 0, 'Miss': 1, 'Mrs':2,
                'Master':3, 'Dona':3, 'Dr': 3, 'Rev':3, 'Mlle':3, 'Major':3, 'Col':3, 
                'Mme':3, 'Ms':3, 'Sir':3, 'Don':3, 'Countess':3, 'Jonkheer':3, 'Capt':3, 
                'Lady':3}
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)

In [17]:
train.drop('Name', axis = 1, inplace = True)
test.drop('Name', axis = 1, inplace = True)

In [19]:
sex_mapping = {'male':0, 'female':1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

In [20]:
train['Age'].fillna(train.groupby('Title')['Age'].transform('median'), inplace = True)
test['Age'].fillna(test.groupby('Title')['Age'].transform('median'), inplace = True)

In [23]:
for dataset in train_test_data:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0,
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1,
    dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2,
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3,
    dataset.loc[ dataset['Age'] > 62, 'Age'] = 4

In [24]:
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [27]:
embarked_mapping = {'S': 0, 'C': 1, 'Q' : 2}
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)

In [28]:
test['Fare'].fillna(test.groupby('Pclass')['Fare'].transform('median'), inplace = True)

In [30]:
for dataset in train_test_data:
    dataset.loc[ dataset['Fare'] <= 17, 'Fare'] = 0,
    dataset.loc[ (dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1,
    dataset.loc[ (dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2,
    dataset.loc[ dataset['Fare'] > 100, 'Fare'] = 4

In [31]:
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1]

In [33]:
cabin_mapping = {'A': 0, 'B' : .4, 'C' : .8, 'D': 1.2,
                'E' : 1.6, 'F': 2.0, 'G': 2.4, 'H': 2.8}
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)

In [34]:
train['Cabin'].fillna(train.groupby('Pclass')['Cabin'].transform('median'), inplace = True)
test['Cabin'].fillna(test.groupby('Pclass')['Cabin'].transform('median'), inplace = True)

In [37]:
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [40]:
family_mapping = {1:0, 2:.4, 3:.8, 4:1.2, 5:1.6, 6:2, 7:2.4, 8:2.8, 9:3.2, 10:3.6, 11:4}
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)

In [63]:
train_test_data = train.drop('Survived', axis = 1)
train_test_data = train.drop('Ticket', axis = 1)
target = train['Survived']

In [64]:
train_test_data.shape, target.shape

((891, 11), (891,))

In [109]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [110]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits = 10, shuffle = True, random_state = 0)

In [111]:
clf = KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'
score = cross_val_score(clf, train_test_data, target, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

[0.92222222 0.88764045 0.92134831 0.95505618 0.93258427 0.93258427
 0.8988764  0.92134831 0.8988764  0.95505618]


In [112]:
import numpy as np
round(np.mean(score)*100,2)

92.26

In [113]:
clf = DecisionTreeClassifier()
socring = 'accuracy'
score = cross_val_score(clf, train_test_data, target, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [114]:
round(np.mean(score)*100,2)

100.0

In [115]:
clf = RandomForestClassifier(n_estimators = 13)
scoring = 'accuracy'
score = cross_val_score(clf, train_test_data, target, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [116]:
round(np.mean(score)*100,2)

100.0

In [117]:
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf, train_test_data, target, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [118]:
round(np.mean(score)*100,2)

100.0

In [119]:
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(clf, train_test_data, target, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

[1.         1.         0.97752809 1.         0.98876404 1.
 1.         0.98876404 1.         1.        ]




In [120]:
round(np.mean(score)*100,2)

99.55

In [124]:
clf = SVC()
clf.fit(train_test_data, target)

test_data = test.drop('PassengerId', axis = 1).copy()
test_data = test.drop('Ticket',axis = 1)
prediction = clf.predict(test_data)



In [125]:
submission = pd.DataFrame({
        'PassengerId': test["PassengerId"],
        'Survived' : prediction
    })
submission.to_csv('submission.csv', index = False)

In [126]:
submission = pd.read_csv('submission.csv')
submission.head(50)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0
