In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
%matplotlib inline

In [22]:
# Load the Titanic train/test CSVs. skipinitialspace=True strips whitespace
# after delimiters; without it the title column created later came back null.
train, test = (
    pd.read_csv(csv_path, skipinitialspace=True)
    for csv_path in ('train.csv', 'test.csv')
)

In [23]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
# Preview the first rows of the raw test frame (it has no Survived column).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [25]:
# (rows, columns) of the training frame.
train.shape

(891, 12)

In [8]:
# (rows, columns) of the test frame.
test.shape

(418, 11)

In [26]:
# Count missing values per column in the training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
# Count missing values per column in the test frame.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [27]:
# List holding references to both frames, so later loops can apply the same
# transformation to train and test by mutating each frame in place.
train_test = [train,test]

In [28]:
# Display the list of both frames.
# NOTE(review): this prints both full DataFrames; a summary such as the
# frames' shapes would be easier to read.
train_test

[     PassengerId  Survived  Pclass  \
 0              1         0       3   
 1              2         1       1   
 2              3         1       3   
 3              4         1       1   
 4              5         0       3   
 ..           ...       ...     ...   
 886          887         0       2   
 887          888         1       1   
 888          889         0       3   
 889          890         1       1   
 890          891         0       3   
 
                                                   Name     Sex   Age  SibSp  \
 0                              Braund, Mr. Owen Harris    male  22.0      1   
 1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                               Heikkinen, Miss. Laina  female  26.0      0   
 3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                             Allen, Mr. William Henry    male  35.0      0   
 ..                                               

In [29]:
# Extract each passenger's title (the word immediately before a '.') from Name
# into a new `title` column on both frames.
for data in train_test:
    # Raw string: '\.' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    data['title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [30]:
# Frequency of each extracted title in the training frame.
train['title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Mlle          2
Col           2
Jonkheer      1
Sir           1
Lady          1
Don           1
Mme           1
Capt          1
Ms            1
Countess      1
Name: title, dtype: int64

In [31]:
# dtype of the extracted title column (still object, i.e. strings, at this point).
train['title'].dtypes

dtype('O')

In [32]:
# Encode titles as integers: Mr, Miss and Mrs each get their own code; every
# other title (including Master) shares code 3.
_other_titles = ('Master', 'Dr', 'Rev', 'Col', 'Ms', 'Mlle', 'Major', 'Dona', 'Lady',
                 'Don', 'Countess', 'Capt', 'Mme', 'Sir', 'Jonkheer')
title_map = {'Mr': 0, 'Miss': 1, 'Mrs': 2}
title_map.update(dict.fromkeys(_other_titles, 3))
for frame in train_test:
    frame['title'] = frame['title'].map(title_map)

In [33]:
# Confirm the title column in train now holds the numeric codes.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [34]:
# Confirm the title column in test now holds the numeric codes.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2


In [35]:
# Name is no longer needed once the title has been extracted. Drop it in place
# so the frame referenced from train_test is the one that changes.
train.drop(columns=['Name'], inplace=True)

In [36]:
# Same as for train: remove Name from the test frame in place.
test.drop(columns=['Name'], inplace=True)

In [37]:
# Confirm the Name column is gone from the test frame.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,892,3,male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,female,47.0,1,0,363272,7.0,,S,2
2,894,2,male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,male,27.0,0,0,315154,8.6625,,S,0
4,896,3,female,22.0,1,1,3101298,12.2875,,S,2


In [38]:
# Encode Sex numerically on both frames: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
for frame in train_test:
    frame['Sex'] = frame['Sex'].map(sex_mapping)

In [39]:
# Confirm Sex is now encoded as 0/1 in train.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,1,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,0,35.0,0,0,373450,8.05,,S,0


In [40]:
# Re-check missing values per column after the title and Sex encoding.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
title            0
dtype: int64

In [None]:
# Mean passenger age in the training frame.
# NOTE(review): presumably the source of the 29.69 fill value used in the next cell -- confirm.
train.Age.mean()

In [41]:
# Fill missing ages in both frames with a single constant.
# 29.69 is the value this cell has always used; it appears to be the training-set
# mean age from the cell above (not the median) -- TODO confirm.
AGE_FILL_VALUE = 29.69
for frame in (train, test):
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # frame['Age']: the in-place call acts on an intermediate object and no
    # longer updates the frame under pandas Copy-on-Write.
    frame['Age'] = frame['Age'].fillna(AGE_FILL_VALUE)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,1,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,0,35.0,0,0,373450,8.05,,S,0


In [None]:
# Find the most frequent Embarked value, to use when filling missing entries.
# Embarked holds port letters at this point, so a median is not defined; the
# mode is the right statistic (and the method has to actually be called).
train['Embarked'].mode()

In [42]:
# Fill the missing Embarked values in train with 'S' (the port the author
# identified as the most common one). test has no missing Embarked values.
# Assign back rather than using fillna(inplace=True) on the selected column,
# which does not update the frame under pandas Copy-on-Write.
train['Embarked'] = train['Embarked'].fillna('S')

In [43]:
# Encode the port of embarkation on both frames: S -> 0, C -> 1, Q -> 2.
embarked_map = dict(S=0, C=1, Q=2)
for frame in train_test:
    frame['Embarked'] = frame['Embarked'].map(embarked_map)

In [44]:
# Bin Fare into four ordinal codes on both frames:
#   0: fare <= 17,  1: 17 < fare <= 30,  2: 30 < fare <= 100,  3: fare > 100
# pd.cut derives every bin from the raw fare in one pass. The previous chain of
# .loc assignments overwrote Fare while still comparing against it (its final
# "> 17" test only meant "> 100" because lower bins had already been replaced)
# and assigned 1-tuples because of stray trailing commas.
fare_edges = [-np.inf, 17, 30, 100, np.inf]
for frame in train_test:
    # labels=False returns the integer bin index; missing fares stay NaN.
    # Cast to float so the column keeps the float dtype it had before binning.
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(float)

In [None]:
# Distribution of the encoded Embarked values in the training frame.
train.Embarked.value_counts()

In [None]:
# Inspect train after the Embarked encoding and Fare binning.
train.head()

In [45]:
# Reduce Cabin to its leading character on both frames; missing cabins stay NaN.
for frame in train_test:
    frame['Cabin'] = frame['Cabin'].str.slice(stop=1)

In [46]:
# Distinct Cabin letters in train (NaN marks passengers with no cabin recorded).
train.Cabin.unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [47]:
# Map each cabin letter to an integer code, in this fixed order:
# B=0, C=1, E=2, D=3, A=4, T=5, F=6, G=7. Missing cabins remain NaN.
cabin_map = {deck: code for code, deck in enumerate('BCEDATFG')}
for frame in train_test:
    frame['Cabin'] = frame['Cabin'].map(cabin_map)


In [None]:
# Inspect train after mapping Cabin letters to integer codes.
train.head()

In [48]:
# Impute missing Cabin codes with the median code of passengers in the same
# Pclass, computed separately within each frame.
for frame in (train, test):
    class_median = frame.groupby('Pclass')['Cabin'].transform('median')
    # Assign back instead of fillna(inplace=True) on frame.Cabin: the in-place
    # call on a selected column does not update the frame under pandas
    # Copy-on-Write.
    frame['Cabin'] = frame['Cabin'].fillna(class_median)


In [None]:
# Re-check for remaining missing values in train after imputing Cabin.
train.isna().sum()

In [49]:
# Ticket is not kept as a feature; remove it from both frames in place.
for frame in (train, test):
    frame.drop(columns=['Ticket'], inplace=True)

In [50]:
# PassengerId is an identifier, not a feature: drop it from train in place.
train.drop(columns='PassengerId', inplace=True)
# `test` keeps PassengerId because the submission cell reads it. The
# feature-only copy is stored as `test_data`; previously the result of this
# expression was discarded, so `test_data` did not exist yet when the cell
# further down displays it on a top-to-bottom run.
test_data = test.drop(columns='PassengerId').copy()
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title
0,3,0,34.50,0,0,0.0,6.0,2,0
1,3,1,47.00,1,0,0.0,6.0,0,2
2,2,0,62.00,0,0,0.0,6.0,2,0
3,3,0,27.00,0,0,0.0,6.0,0,0
4,3,1,22.00,1,1,0.0,6.0,0,2
...,...,...,...,...,...,...,...,...,...
413,3,0,29.69,0,0,0.0,6.0,0,0
414,1,1,39.00,0,0,3.0,1.0,1,3
415,3,0,38.50,0,0,0.0,6.0,0,0
416,3,0,29.69,0,0,0.0,6.0,0,0


In [51]:
# Keep the Survived labels as the training target before the column is
# dropped from the feature frame.
target = train['Survived']

In [None]:
# Drop the Survived column from the training set.

In [52]:
# Remove the label in place so `train` holds only feature columns.
train.drop(columns = 'Survived', inplace = True)

In [53]:
# The test frame has one missing Fare. Fare is already binned at this point,
# so fill the gap with fare-bin code 2.
# Assign back rather than using fillna(inplace=True) on test.Fare, which does
# not update the frame under pandas Copy-on-Write.
test['Fare'] = test['Fare'].fillna(2)

In [54]:
# Sanity check: the target and the feature frame must have the same number of rows.
target.shape, train.shape

((891,), (891, 9))

In [56]:
# Display the test feature frame.
# NOTE(review): per the execution counts this cell (In [56]) was run after the
# cell below that assigns test_data (In [55]); keep the cells in run order.
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title
0,3,0,34.50,0,0,0.0,6.0,2,0
1,3,1,47.00,1,0,0.0,6.0,0,2
2,2,0,62.00,0,0,0.0,6.0,2,0
3,3,0,27.00,0,0,0.0,6.0,0,0
4,3,1,22.00,1,1,0.0,6.0,0,2
...,...,...,...,...,...,...,...,...,...
413,3,0,29.69,0,0,0.0,6.0,0,0
414,1,1,39.00,0,0,3.0,1.0,1,3
415,3,0,38.50,0,0,0.0,6.0,0,0
416,3,0,29.69,0,0,0.0,6.0,0,0


In [55]:
# train has 9 feature columns while test still has 10 (it keeps PassengerId);
# build a feature-only copy of test so the two column sets match.
test_data = test.drop(columns=['PassengerId']).copy()

In [68]:
# (rows, columns) of the test feature frame; the column count should match train.
test_data.shape

(418, 9)

In [57]:
# 10-fold cross-validation splitter shared by all models below; shuffling with
# a fixed random_state keeps the fold assignment the same on every run.
k_fold = KFold(n_splits = 10, shuffle = True, random_state = 2)

In [58]:
# k-nearest neighbours with k=5: report per-fold CV accuracy, then fit on the
# full training set and predict the test set. cross_val_score works on clones
# of the estimator, so it is independent of the fit below.
Knn = KNeighborsClassifier(n_neighbors=5)
score = cross_val_score(Knn, train, target, cv=k_fold, scoring='accuracy')
y_pred = Knn.fit(train, target).predict(test_data)
print(score)

[0.73333333 0.76404494 0.82022472 0.80898876 0.83146067 0.78651685
 0.78651685 0.78651685 0.7752809  0.78651685]


In [59]:
# Decision tree: fit on the full training set, predict the test set, and
# report per-fold cross-validation accuracy.
# random_state is fixed because tree construction involves randomness;
# without a seed the scores change from run to run.
clf = DecisionTreeClassifier(random_state=2)
clf.fit(train, target)
y_pred = clf.predict(test_data)
score = cross_val_score(clf, train, target, cv=k_fold, scoring='accuracy')
print(score)

[0.81111111 0.78651685 0.7752809  0.82022472 0.82022472 0.84269663
 0.79775281 0.80898876 0.76404494 0.76404494]


In [60]:
# Random forest with 12 trees: fit on the full training set, predict the test
# set, and report per-fold cross-validation accuracy.
# random_state is fixed so the bootstrap samples and feature subsets (and
# therefore the scores) are reproducible across runs.
Rfc = RandomForestClassifier(n_estimators=12, random_state=2)
Rfc.fit(train, target)
y_pred = Rfc.predict(test_data)
score = cross_val_score(Rfc, train, target, cv=k_fold, scoring='accuracy')
print(score)

[0.78888889 0.78651685 0.76404494 0.86516854 0.82022472 0.84269663
 0.80898876 0.79775281 0.80898876 0.75280899]


In [61]:
# Support vector classifier with default settings: fit on the full training
# set, predict the test set, and report per-fold cross-validation accuracy.
# SVC is imported in the notebook's import cell rather than here.
# Note: this rebinds `clf` (previously the decision tree), and `prediction`
# is what the submission cell writes out.
clf = SVC()
clf.fit(train, target)
prediction = clf.predict(test_data)
score = cross_val_score(clf, train, target, cv=k_fold, scoring='accuracy')
print(score)

[0.63333333 0.70786517 0.74157303 0.76404494 0.82022472 0.75280899
 0.7752809  0.73033708 0.73033708 0.73033708]


In [62]:
# Build the submission table (PassengerId from the test frame plus the
# predicted Survived label) and write it without the index column.
# `prediction` is the output of the SVC cell.
csv_file = test[['PassengerId']].assign(Survived=prediction)
csv_file.to_csv('Titanic_submission.csv', index=False)