In [1]:
# Import Libraries for Data Analysis
import numpy as np
import pandas as pd

In [2]:
# Import the training and test sets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# Make a backup copy
train_copy = train.copy()
test_copy = test.copy()

In [4]:
# Look at an overview of the training set
train.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [5]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [6]:
# I already know that I do not need the 'Ticket' column. I will drop it now.
# I want to change to inplace=True to affect the underlying data
# And axis=1 refers to an action on column labels
train.drop(labels='Ticket',inplace=True,axis=1)
test.drop(labels='Ticket',inplace=True,axis=1)

In [7]:
# Check the number missing data in training and test set
print('---Training Set---')
print(pd.isnull(train).sum())
print('---Test Set---')
print(pd.isnull(test).sum())

---Training Set---
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64
---Test Set---
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [8]:
# Count specific null values in the Age column, return True if Null
train.Age.isnull().value_counts()

False    714
True     177
Name: Age, dtype: int64

In [9]:
# Find the max, min, median, and mean for the 'Age' in training and test data
print('---Training Set---')
print (train.Age.min())
print (train.Age.max())
print (train.Age.median())
print (train.Age.mean())
print('---Test Set---')
print (test.Age.min())
print (test.Age.max())
print (test.Age.median())
print (test.Age.mean())

---Training Set---
0.42
80.0
28.0
29.69911764705882
---Test Set---
0.17
76.0
27.0
30.272590361445783


In [10]:
# I will fill the missing 'Age' values with the mean
train['Age']=train['Age'].fillna(np.mean(train['Age'])).astype(float)
test['Age']=test['Age'].fillna(np.mean(test['Age'])).astype(float)

In [11]:
# Count specific null values in the 'Cabin' column, return True if null
train.Cabin.isnull().value_counts()

True     687
False    204
Name: Cabin, dtype: int64

In [12]:
# Create a new column called 'CabinBool' 
# Fill with 1 value if there was a recorded cabin, and 0 value if missing
train["CabinBool"] = (train["Cabin"].notnull().astype('int'))
test["CabinBool"] = (test["Cabin"].notnull().astype('int'))

In [13]:
# Drop the 'Cabin' column
train.drop(labels='Cabin',inplace=True,axis=1)
test.drop(labels='Cabin',inplace=True,axis=1)

In [14]:
train.Embarked.dtype

dtype('O')

In [15]:
# The Test set has no missing date in 'Embarked'
# This step will only apply to training set
# Count the number of each value that is in the 'Embarked' column
print('---Training Set---')
print(train.Embarked.value_counts())


---Training Set---
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [16]:
# Find the number of null values in 'Embarked', and return true if null
print('---Training Set---')
print(train.Embarked.isnull().value_counts())



---Training Set---
False    889
True       2
Name: Embarked, dtype: int64


In [17]:
# Fill the null values of 'Embarked' with S since it is the most frequent
#train['Embarked'] = train['Embarked'].astype(str)
train = train.fillna({"Embarked": "S"})

In [19]:
train_embark=pd.get_dummies(train['Embarked'],drop_first=True,prefix='EmbarkDummy')
test_embark=pd.get_dummies(test['Embarked'],drop_first=True,prefix='EmbarkDummy')

In [21]:
train=pd.concat([train,train_embark],axis=1)
test=pd.concat([test,test_embark],axis=1)

In [22]:
# Drop the 'Embarked' column
train.drop(labels='Embarked',inplace=True,axis=1)
test.drop(labels='Embarked',inplace=True,axis=1)

In [24]:
# I think there is one missing value for 'Fare' in the test set. 
print(test.Fare.dtype)
print(test.Fare.isnull().value_counts())

float64
False    417
True       1
Name: Fare, dtype: int64


In [27]:
# Fill the missing value in test 'Fare' with the mean
test['Fare']=test['Fare'].fillna(np.mean(test['Fare'])).astype(float)

In [28]:
# Get data info to remind what still needs to be processed
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
CabinBool        891 non-null int32
EmbarkDummy_Q    891 non-null uint8
EmbarkDummy_S    891 non-null uint8
dtypes: float64(2), int32(1), int64(5), object(2), uint8(2)
memory usage: 67.9+ KB


In [31]:
train_sex=pd.get_dummies(train['Sex'],drop_first=True,prefix='SexDummy')#male=1 and  female=0
test_sex=pd.get_dummies(test['Sex'],drop_first=True,prefix='SexDummy')#male=1 and  female=0
train=pd.concat([train,train_sex],axis=1).drop(['Sex'],axis=1)
test=pd.concat([test,test_sex],axis=1).drop(['Sex'],axis=1)

In [32]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
CabinBool        891 non-null int32
EmbarkDummy_Q    891 non-null uint8
EmbarkDummy_S    891 non-null uint8
SexDummy_male    891 non-null uint8
dtypes: float64(2), int32(1), int64(5), object(1), uint8(3)
memory usage: 61.9+ KB


In [33]:
# I will use 'SibSp' and 'Parch' to create two new features called 'FamilySize' and 'IsAlone'
for row in train:
    train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
    
for row in test:
    test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
    
for row in train:
    train['IsAlone'] = 0
    train.loc[train['FamilySize'] == 1, 'IsAlone'] = 1
    
for row in test:
    test['IsAlone'] = 0
    test.loc[test['FamilySize'] == 1, 'IsAlone'] = 1

In [38]:
# I will drop the 'SibSp' , 'Parch' , and 'FamilySize' features.
train.drop(labels=['SibSp', 'Parch', 'FamilySize'],inplace=True,axis=1)
test.drop(labels=['SibSp', 'Parch', 'FamilySize'],inplace=True,axis=1)

In [39]:
train.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Fare,CabinBool,EmbarkDummy_Q,EmbarkDummy_S,SexDummy_male,IsAlone
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,30.0708,0,0,0,0,0
390,391,1,1,"Carter, Mr. William Ernest",36.0,120.0,1,0,1,1,0
792,793,0,3,"Sage, Miss. Stella Anna",29.699118,69.55,0,0,1,0,0
849,850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",29.699118,89.1042,1,0,0,0,0
881,882,0,3,"Markun, Mr. Johann",33.0,7.8958,0,0,1,1,1
832,833,0,3,"Saad, Mr. Amin",29.699118,7.2292,0,0,0,1,1
252,253,0,1,"Stead, Mr. William Thomas",62.0,26.55,1,0,1,1,1
539,540,1,1,"Frolicher, Miss. Hedwig Margaritha",22.0,49.5,1,0,0,0,0
719,720,0,3,"Johnson, Mr. Malkolm Joackim",33.0,7.775,0,0,1,1,1
753,754,0,3,"Jonkoff, Mr. Lalio",23.0,7.8958,0,0,1,1,1


In [45]:
for row in train:
    train['Title'] = train.Name.str.extract(', ([A-Za-z]+)\.', expand=False)
    
for row in test:
    test['Title'] = test.Name.str.extract(', ([A-Za-z]+)\.', expand=False)

In [47]:
for row in train:
    train['Title'] = train['Title'].replace(['Lady', 'Capt', 'Col',
    'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')
    
    train['Title'] = train['Title'].replace(['Countess', 'Lady', 'Sir'], 'Royal')
    train['Title'] = train['Title'].replace('Mlle', 'Miss')
    train['Title'] = train['Title'].replace('Ms', 'Miss')
    train['Title'] = train['Title'].replace('Mme', 'Mrs')

for row in test:
    test['Title'] = test['Title'].replace(['Lady', 'Capt', 'Col',
    'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')
    
    test['Title'] = test['Title'].replace(['Countess', 'Lady', 'Sir'], 'Royal')
    test['Title'] = test['Title'].replace('Mlle', 'Miss')
    test['Title'] = test['Title'].replace('Ms', 'Miss')
    test['Title'] = test['Title'].replace('Mme', 'Mrs')

In [48]:
train_title=pd.get_dummies(train['Title'],drop_first=True,prefix='TitleDummy')#male=1 and  female=0
test_title=pd.get_dummies(test['Title'],drop_first=True,prefix='TitleDummy')#male=1 and  female=0
train=pd.concat([train,train_title],axis=1).drop(['Name'],axis=1)
test=pd.concat([test,test_title],axis=1).drop(['Name'],axis=1)

In [50]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Pclass              891 non-null int64
Age                 891 non-null float64
Fare                891 non-null float64
CabinBool           891 non-null int32
EmbarkDummy_Q       891 non-null uint8
EmbarkDummy_S       891 non-null uint8
SexDummy_male       891 non-null uint8
IsAlone             891 non-null int64
Title               890 non-null object
TitleDummy_Miss     891 non-null uint8
TitleDummy_Mr       891 non-null uint8
TitleDummy_Mrs      891 non-null uint8
TitleDummy_Rare     891 non-null uint8
TitleDummy_Royal    891 non-null uint8
dtypes: float64(2), int32(1), int64(4), object(1), uint8(8)
memory usage: 59.2+ KB


In [67]:
train_predictors = train.drop(labels=['PassengerId', 'Survived'],inplace=True,axis=1)
test_predictors = test.drop(labels=['PassengerId'],inplace=True,axis=1)


ValueError: labels ['PassengerId' 'Survived'] not contained in axis

In [73]:
train.drop(labels=['Title'],inplace=True,axis=1)
train.head()

Unnamed: 0,Pclass,Age,Fare,CabinBool,EmbarkDummy_Q,EmbarkDummy_S,SexDummy_male,IsAlone,TitleDummy_Miss,TitleDummy_Mr,TitleDummy_Mrs,TitleDummy_Rare,TitleDummy_Royal
0,3,22.0,7.25,0,0,1,1,0,0,1,0,0,0
1,1,38.0,71.2833,1,0,0,0,0,0,0,1,0,0
2,3,26.0,7.925,0,0,1,0,1,1,0,0,0,0
3,1,35.0,53.1,1,0,1,0,0,0,0,1,0,0
4,3,35.0,8.05,0,0,1,1,1,0,1,0,0,0


In [75]:
test.drop(labels=['Title'],inplace=True,axis=1)
test.head()

Unnamed: 0,Pclass,Age,Fare,CabinBool,EmbarkDummy_Q,EmbarkDummy_S,SexDummy_male,IsAlone,TitleDummy_Miss,TitleDummy_Mr,TitleDummy_Mrs,TitleDummy_Rare
0,3,34.5,7.8292,0,1,0,1,1,0,1,0,0
1,3,47.0,7.0,0,0,1,0,0,0,0,1,0
2,2,62.0,9.6875,0,1,0,1,1,0,1,0,0
3,3,27.0,8.6625,0,0,1,1,1,0,1,0,0
4,3,22.0,12.2875,0,0,1,0,0,0,0,1,0


In [76]:
train_target = train_copy['Survived']
train_target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [79]:
X = train
y = train_target

In [80]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.22, random_state = 0)

In [82]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

decisiontree = DecisionTreeClassifier()
decisiontree.fit(X_train, y_train)
y_pred = decisiontree.predict(X_test)


In [84]:
from sklearn.metrics import confusion_matrix # Functions start with lower case
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[107  17]
 [ 23  50]]


In [85]:
from sklearn.metrics import accuracy_score
acc_decisiontree = round(accuracy_score(y_pred, y_test) * 100, 2)
print(acc_decisiontree)

79.7


In [86]:
# Support Vector Machines
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)


In [87]:
cm_svc = confusion_matrix(y_test, y_pred)
print(cm_svc)

[[102  22]
 [ 26  47]]


In [88]:
acc_svc = round(accuracy_score(y_pred, y_test) * 100, 2)
print(acc_svc)

75.63


In [89]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier()
gbk.fit(X_train, y_train)
y_pred = gbk.predict(X_test)
cm_gbk = confusion_matrix(y_test, y_pred)
print(cm_gbk)
acc_gbk = round(accuracy_score(y_pred, y_test) * 100, 2)
print(acc_gbk)

[[115   9]
 [ 20  53]]
85.28


In [97]:
print (test.info())
print (train.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
Pclass             418 non-null int64
Age                418 non-null float64
Fare               418 non-null float64
CabinBool          418 non-null int32
EmbarkDummy_Q      418 non-null uint8
EmbarkDummy_S      418 non-null uint8
SexDummy_male      418 non-null uint8
IsAlone            418 non-null int64
TitleDummy_Miss    418 non-null uint8
TitleDummy_Mr      418 non-null uint8
TitleDummy_Mrs     418 non-null uint8
TitleDummy_Rare    418 non-null uint8
dtypes: float64(2), int32(1), int64(2), uint8(7)
memory usage: 17.6 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
Pclass              891 non-null int64
Age                 891 non-null float64
Fare                891 non-null float64
CabinBool           891 non-null int32
EmbarkDummy_Q       891 non-null uint8
EmbarkDummy_S       891 non-null uint8
SexDummy_male   

In [95]:
X_test = test

In [96]:
y_pred = decisiontree.predict(X_test)

ValueError: Number of features of the model must match the input. Model n_features is 13 and input n_features is 12 