In [1]:
# Import Libraries
import numpy as np
import pandas as pd

In [2]:
# Read the training and test sets from the CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Keep independent snapshots of both frames as loaded, before any column is changed
train_copy, test_copy = train.copy(), test.copy()

In [4]:
# Get an overview of the training set: column labels, then per-column details
print(train.columns)  # Column titles
print()  # Blank line
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train.info()  # Detailed column info

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [None]:
# Drop 'Ticket' Column
#train.drop(labels='Ticket',inplace=True,axis=1) # inplace=True to overwrite underlying data
#test.drop(labels='Ticket',inplace=True,axis=1)

In [5]:
# Statistical overview of 'Age' (min, max, median, mean) for each data set
age_reports = (('---Training Set---', train), ('---Test Set---', test))
for report_index, (heading, frame) in enumerate(age_reports):
    if report_index:
        print()  # Blank line between the two reports
    print(heading)
    for statistic in ('min', 'max', 'median', 'mean'):
        print(getattr(frame.Age, statistic)())

---Training Set---
0.42
80.0
28.0
29.69911764705882

---Test Set---
0.17
76.0
27.0
30.272590361445783


In [6]:
# Replace missing 'Age' entries with the mean age of the same data set
train_age_mean = np.mean(train['Age'])
test_age_mean = np.mean(test['Age'])
train['Age'] = train['Age'].fillna(train_age_mean).astype(float)
test['Age'] = test['Age'].fillna(test_age_mean).astype(float)

In [None]:
# Count specific null values in the 'Cabin' column, return True if null
#train.Cabin.isnull().value_counts()

In [7]:
# Add a 'CabinBool' indicator column to both frames:
# 1 when a cabin value was recorded, 0 when it is missing
for frame in (train, test):
    cabin_recorded = frame["Cabin"].notnull()
    frame["CabinBool"] = cabin_recorded.astype('int')

In [14]:
# Dropping the 'Cabin' column is currently disabled (both lines below are
# commented out), so 'Cabin' is still present in train and test after this cell
#train.drop(labels='Cabin',inplace=True,axis=1)
#test.drop(labels='Cabin',inplace=True,axis=1)
# Display one randomly chosen row of the training set
train.sample()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CabinBool
727,728,1,3,"Mannion, Miss. Margareth",female,29.699118,0,0,36866,7.7375,,Q,0


In [17]:
# Store a categorical-dtype copy of 'CabinBool' as 'CabinStatus' (training set only)
train["CabinStatus"] = train["CabinBool"].astype('category')
# Show the dtype of every column to confirm the new column's type
train.dtypes

PassengerId       int64
Survived          int64
Pclass            int64
Name             object
Sex              object
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked         object
CabinBool         int32
CabinStatus    category
dtype: object

In [23]:
# Peek at three randomly chosen 'CabinStatus' values
train.CabinStatus.sample(3)

781    1
553    0
146    0
Name: CabinStatus, dtype: category
Categories (2, int64): [0, 1]

In [24]:
# Relabel the cabin indicator: 0 (cabin missing) -> 1, 1 (cabin recorded) -> 2
mapper = {0: 1, 1: 2}
# Map from 'CabinBool' rather than from 'CabinStatus' itself so that re-running
# this cell always gives the same result; mapping the column onto itself would
# turn the already-relabelled 2s into NaN on a second run (2 is not a key).
train['CabinStatus'] = train['CabinBool'].map(mapper)
# Peek at three randomly chosen relabelled values
train.CabinStatus.sample(3)

28     1
340    2
596    1
Name: CabinStatus, dtype: int64

In [None]:
# The test set has no missing data in 'Embarked', so this step only applies
# to the training set: count how often each value occurs in the column
print('---Training Set---', train.Embarked.value_counts(), sep='\n')


In [None]:
# Tally null versus non-null entries of 'Embarked' (True marks a null entry)
embarked_is_null = train.Embarked.isnull()
print('---Training Set---')
print(embarked_is_null.value_counts())



In [None]:
# 'S' is the most frequent value, so use it for the null 'Embarked' entries
train = train.assign(Embarked=train['Embarked'].fillna("S"))

In [None]:
# One-hot encode 'Embarked'; drop_first leaves out the first category's column
embark_dummy_options = dict(drop_first=True, prefix='EmbarkDummy')
train_embark = pd.get_dummies(train['Embarked'], **embark_dummy_options)
test_embark = pd.get_dummies(test['Embarked'], **embark_dummy_options)

In [None]:
# Append the 'Embarked' dummy columns to the right-hand side of each frame
train = pd.concat(objs=[train, train_embark], axis='columns')
test = pd.concat(objs=[test, test_embark], axis='columns')

In [None]:
# Remove the original 'Embarked' column from both frames
train.drop(columns='Embarked', inplace=True)
test.drop(columns='Embarked', inplace=True)

In [None]:
# One missing 'Fare' value is suspected in the test set:
# show the column dtype and the null / non-null tally to check
test_fare = test.Fare
print(test_fare.dtype)
print(test_fare.isnull().value_counts())

In [None]:
# Replace the missing test-set 'Fare' with the mean fare of the test set
test_fare_mean = np.mean(test['Fare'])
test['Fare'] = test['Fare'].fillna(test_fare_mean).astype(float)

In [None]:
# Get data info to remind what still needs to be processed
# (prints the non-null count and dtype of every training-set column)
train.info()


In [None]:
# One-hot encode 'Sex' with the first category dropped, then swap the new
# dummy column in for the original 'Sex' column
train_sex = pd.get_dummies(train['Sex'], prefix='SexDummy', drop_first=True)  # male=1 and female=0
test_sex = pd.get_dummies(test['Sex'], prefix='SexDummy', drop_first=True)  # male=1 and female=0
train = pd.concat([train, train_sex], axis=1)
train = train.drop(columns=['Sex'])
test = pd.concat([test, test_sex], axis=1)
test = test.drop(columns=['Sex'])

In [None]:
# Re-check the training-set columns after replacing 'Sex' with its dummy column
train.info()

In [None]:
# Use 'SibSp' and 'Parch' to create two new features: 'FamilySize' and 'IsAlone'.
# The column expressions below already operate on every row at once, so no
# row loop is needed. (Iterating a DataFrame with `for row in frame:` walks its
# column labels and would only repeat the same assignments once per column.)
for frame in (train, test):
    # FamilySize = SibSp + Parch + 1
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    # IsAlone starts at 0 everywhere and is set to 1 where FamilySize equals 1
    frame['IsAlone'] = 0
    frame.loc[frame['FamilySize'] == 1, 'IsAlone'] = 1

In [None]:
# Remove the 'SibSp', 'Parch' and 'FamilySize' features from both frames
family_columns = ['SibSp', 'Parch', 'FamilySize']
train.drop(columns=family_columns, inplace=True)
test.drop(columns=family_columns, inplace=True)

In [None]:
# Inspect ten randomly chosen training rows after the feature changes so far
train.sample(10)

In [None]:
# Extract a 'Title' from each 'Name': the run of letters that directly follows
# ", " and is terminated by ".". Names that do not match the pattern get NaN.
# The pattern is a raw string so the "\." escape reaches the regex engine
# unchanged rather than being treated as an (invalid) string escape.
# .str.extract handles the whole column in one call, so no loop is required.
title_pattern = r', ([A-Za-z]+)\.'
train['Title'] = train.Name.str.extract(title_pattern, expand=False)
test['Title'] = test.Name.str.extract(title_pattern, expand=False)

In [None]:
# Collapse the extracted titles into a few groups, identically for both frames.
# Series.replace already works on the whole column, so each replacement runs
# exactly once per frame instead of once per column label.
rare_titles = ['Lady', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona']
# NOTE(review): 'Lady' is listed in both groups. The 'Rare' replacement runs
# first, so 'Lady' always ends up as 'Rare' and its 'Royal' entry has no
# effect. That behaviour is kept here unchanged; confirm which was intended.
royal_titles = ['Countess', 'Lady', 'Sir']

for frame in (train, test):
    titles = frame['Title']
    titles = titles.replace(rare_titles, 'Rare')
    titles = titles.replace(royal_titles, 'Royal')
    titles = titles.replace(['Mlle', 'Ms'], 'Miss')
    titles = titles.replace('Mme', 'Mrs')
    frame['Title'] = titles

In [None]:
# One-hot encode 'Title'. The training encoding leaves out its first category
# (drop_first=True) and defines the set of dummy columns the models will see.
train_title = pd.get_dummies(train['Title'], drop_first=True, prefix='TitleDummy')
# The two data sets need not contain the same titles, so encoding them
# independently can produce different columns. Encode every test title, then
# align to the training columns: titles absent from the test set become an
# all-zero column, and columns the training set does not have are discarded.
test_title = (
    pd.get_dummies(test['Title'], prefix='TitleDummy')
    .reindex(columns=train_title.columns, fill_value=0)
)
# Attach the dummy columns and remove 'Name', which has served its purpose
train = pd.concat([train, train_title], axis=1).drop(['Name'], axis=1)
test = pd.concat([test, test_title], axis=1).drop(['Name'], axis=1)

In [None]:
# Re-check the training-set columns after adding the title dummies and dropping 'Name'
train.info()

In [None]:
# Remove the identifier (and, for the training set, the target) from the frames.
# drop(..., inplace=True) modifies the frame and returns None, so its result
# must not be assigned; the predictor names are bound to the frames afterwards.
train.drop(labels=['PassengerId', 'Survived'], inplace=True, axis=1)
test.drop(labels=['PassengerId'], inplace=True, axis=1)
# These names refer to the same objects as train / test, so later in-place
# changes to those frames are visible through them as well.
train_predictors = train
test_predictors = test


In [None]:
# 'Title' has been one-hot encoded, so the raw column is removed
train.drop(columns=['Title'], inplace=True)
# The 'Ticket' and 'Cabin' drops earlier in the notebook are commented out, so
# both raw text columns are still present, and 'CabinStatus' was only ever
# added to the training set. Remove them so the training features are numeric
# and match the test set's columns. errors='ignore' keeps this cell working if
# the earlier drops are re-enabled.
train.drop(columns=['Ticket', 'Cabin', 'CabinStatus'], errors='ignore', inplace=True)
train.head()

In [None]:
# 'Title' has been one-hot encoded, so the raw column is removed
test.drop(columns=['Title'], inplace=True)
# The 'Ticket' and 'Cabin' drops earlier in the notebook are commented out, so
# both raw text columns are still present. Remove them so the test features
# are numeric. errors='ignore' keeps this cell working if the earlier drops
# are re-enabled.
test.drop(columns=['Ticket', 'Cabin'], errors='ignore', inplace=True)
test.head()

In [None]:
# 'Survived' is no longer in `train`, so take the target from the backup copy
train_target = train_copy.loc[:, 'Survived']
train_target.head()

In [None]:
# Feature matrix and target vector used for model fitting
X, y = train, train_target

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 22% of the rows for validation; the fixed random_state makes the
# split the same on every run
split_parts = train_test_split(X, y, test_size=0.22, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state (the same seed as the train/validation split) makes the
# fitted tree, and therefore the reported scores, reproducible between runs.
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(X_train, y_train)
# Predictions for the held-out validation rows
y_pred = decisiontree.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix  # Functions start with lower case

# Confusion matrix of the decision tree on the validation rows
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the decision tree as a percentage with two decimals.
# Arguments follow the scikit-learn signature (y_true first, then y_pred),
# consistent with the confusion_matrix calls in this notebook.
acc_decisiontree = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_decisiontree)

In [None]:
# Support Vector Machines
from sklearn.svm import SVC

# Fit a default-parameter SVC on the training split (fit returns the estimator)
svc = SVC().fit(X_train, y_train)
# Predictions for the held-out validation rows
y_pred = svc.predict(X_test)


In [None]:
# Confusion matrix of the SVC on the validation rows
cm_svc = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm_svc)

In [None]:
# Validation accuracy of the SVC as a percentage with two decimals.
# Arguments follow the scikit-learn signature (y_true first, then y_pred).
acc_svc = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_svc)

In [None]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# A fixed random_state (the same seed as the train/validation split) makes the
# fitted ensemble, and therefore the reported scores, reproducible between runs.
gbk = GradientBoostingClassifier(random_state=0)
gbk.fit(X_train, y_train)
# Predictions for the held-out validation rows
y_pred = gbk.predict(X_test)
cm_gbk = confusion_matrix(y_test, y_pred)
print(cm_gbk)
# Accuracy as a percentage with two decimals; arguments follow the
# scikit-learn signature (y_true first, then y_pred).
acc_gbk = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_gbk)

In [None]:
# Compare the final column layout of the test and training frames.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line after each.
test.info()
train.info()


In [None]:
# Point X_test at the prepared test-set frame for the final predictions.
# NOTE(review): this rebinds the X_test name produced by train_test_split, so
# the earlier evaluation cells would no longer line up with y_test if they
# were re-run after this cell. A distinct name would avoid that.
X_test = test

In [None]:
# Predict with the fitted decision tree on whatever X_test currently refers to
y_pred = decisiontree.predict(X_test)