# Importing Necessary Libraries and Modules

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Load the Titanic Dataset

In [0]:
from google.colab import files
uploaded = files.upload()

Saving titanic.csv to titanic (1).csv


In [0]:
import io
dfTitanic = pd.read_csv(io.BytesIO(uploaded['titanic.csv']))

# Doing Some Exploratory Data Analysis

In [0]:
# Get the shape of dataset using attribut shape
dfTitanic.shape

(891, 12)

In [0]:
# Check the values of each columns
print(dfTitanic.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [0]:
# Explore informations of each observations and check if there are some NaN values. Using a method of dataframe object:info()
dfTitanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


From informations above this dataset is not tidy. So we must tidying this dataset first.

There are NaN values in columns Age,  Cabin and Embarked. Then we can assign new values to that NaN values or we can just simply drop it. For Cabin column there so many NaN values, i thought the best way is to dropped it, because guessing its values is hard. For Age column and Embarked column the NaN values guessed by their distribution and majority of values occurence respectively.

There are non numeric values on Names, Sex, Ticket, Cabin, and Embarked columns. We can convert it  to their numerical values representation or just drop it. I thought Names, Ticket, Cabin will be dropped. Sex will not be dropped because in its Movies children and women get priority of rescue. So does Embarked column will not be dropped.

Drop PassengerId, Name, Ticket, Cabin columns because i thought they dont affect the prediction. 

In [0]:
dfTitanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
dfTitanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Dealing with Missing Data

In [0]:
dfTitanic.isnull().sum().sort_values(ascending=False)[0:5]

Cabin       687
Age         177
Embarked      2
Fare          0
Ticket        0
dtype: int64

Because the cabin column has many NaN values we just gonna drop it.

In [0]:
# Guessing new values for NaN values on column Age then assign that values to it.
# This guessing using distribution of column Age and guess that values fall onto most occurrencing values

meanAge = dfTitanic.Age.mean()
stdAge = dfTitanic.Age.std()
print(meanAge)
print(stdAge)

dfTitanic.Age = dfTitanic.Age.fillna(np.random.randint(meanAge-stdAge, meanAge+stdAge))
print(dfTitanic.Age.isnull().sum()) # chekk if there are any NaN values

29.69911764705882
14.526497332334044
0


In [0]:
# Embarked NaN values guessed using most occurrencing values
print(dfTitanic.Embarked.describe()) # the most occurrencing values is 'S'
dfTitanic.Embarked = dfTitanic.Embarked.fillna('S')
print(dfTitanic.Embarked.isnull().sum())

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object
0


In [0]:
dfTitanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


# Drop Unnecessary Columns

In [0]:
# Drop PassengerId, Name, Ticket, Cabin columns
dfTitanicTidy = dfTitanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [0]:
dfTitanicTidy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


# Convert Object Datatype to Numeric Datatype

In [0]:
dfTitanicTidy.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [0]:
# On the sex column female value converted to 0 and male values converted to 1
jK = {'female':0, 'male':1}
dfTitanicTidy.Sex = dfTitanicTidy.Sex.map(jK)

In [0]:
dfTitanicTidy.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


In [0]:
# Convert object values in Embarked column to Numerical one
print(dfTitanicTidy.Embarked.unique()) # Search for unique valus in this embarked series
embrk = {'S':0, 'C':1, 'Q':2}
dfTitanicTidy.Embarked = dfTitanicTidy.Embarked.map(embrk)

['S' 'C' 'Q']


In [0]:
dfTitanicTidy.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,0
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,0
3,1,1,0,35.0,1,0,53.1,0
4,0,3,1,35.0,0,0,8.05,0


In [0]:
dfTitanicTidy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


# Convert This Dataframe to Array

In [0]:
# Create features dataset (X)
dfTitanicTidy.iloc[:,1:].head() # Select features dataset from dfTitanicTidy 

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,0
1,1,0,38.0,1,0,71.2833,1
2,3,0,26.0,0,0,7.925,0
3,1,0,35.0,1,0,53.1,0
4,3,1,35.0,0,0,8.05,0


In [0]:
X = dfTitanicTidy.iloc[:,1:].values # Convert it using values attribute and assign it to X variable
print(X[0:5])
print(X.shape)

[[ 3.      1.     22.      1.      0.      7.25    0.    ]
 [ 1.      0.     38.      1.      0.     71.2833  1.    ]
 [ 3.      0.     26.      0.      0.      7.925   0.    ]
 [ 1.      0.     35.      1.      0.     53.1     0.    ]
 [ 3.      1.     35.      0.      0.      8.05    0.    ]]
(891, 7)


In [0]:
# Create response dataset (y)
y = dfTitanicTidy['Survived'].values
print(y[0:5])
print(y.shape)

[0 1 1 1 0]
(891,)


# Split the Titanic Dataset using SKLearn

The iris dataset sparated randomly to training dataset and test dataset with ratio of 7:3

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4)
print('Shape of train dataset {} {}'.format(X_train.shape, y_train.shape))
print('Shape of test dataset {} {}'.format(X_test.shape, y_test.shape))

Shape of train dataset (623, 7) (623,)
Shape of test dataset (268, 7) (268,)


# Begin Classification

In [0]:
# Decision Tree Classifier
def decision3SKL(xTrain, yTrain, xTest, yTest,kriteria, depth):
  decisionTree = DecisionTreeClassifier(criterion=kriteria, max_depth=depth)
  decisionTree.fit(xTrain, yTrain)
  yPredD3 = decisionTree.predict(xTest)
  
  #Model Evaluation
  accTrainD3 = decisionTree.score(xTrain, yTrain)
  accTestD3 = metrics.accuracy_score(yPredD3, yTest)
  print('Classification using Decision Tree: ')
  print('Accuracy of training dataset {}'.format(round(accTrainD3*100,2)))
  print('Accuracy of test dataset {}'.format(round(accTestD3*100,2)))

# K Neighbors Classifier
def knnSKL(xTrain, yTrain, xTest,yTest,nNeighbors):
  Knn = KNeighborsClassifier(n_neighbors=nNeighbors)
  Knn.fit(xTrain, yTrain)
  yPredKnn = Knn.predict(xTest)
  
  #Model Evaluation
  accTestKnn = metrics.accuracy_score(yPredKnn, yTest)
  accTrainKnn = Knn.score(xTrain, yTrain)
  print('\nClassification using K Neighbors Classifier: ')
  print('Number of Nearest Neighbors {}'.format(nNeighbors))
  print('Accuracy training dataset {}'.format(round(accTrainKnn*100,2)))
  print('Accuracy test dataset {}'.format(round(accTestKnn*100,2)))
  
# Logistic Regression Classifier
def logistik(xTrain, yTrain, xTest,yTest,reglr, solvr):
  LR = LogisticRegression(C=reglr, solver=solvr)
  LR.fit(xTrain, yTrain)
  yPredLR = LR.predict(xTest)
  
  #Model Evaluation
  accTestLR = metrics.accuracy_score(yPredLR, yTest)
  accTrainLR = LR.score(xTrain, yTrain)
  print('\nClassifiction using Logistic Regression:')
  print('Accuracy of training dataset {}'.format(round(accTrainLR*100,2)))
  print('Accuracy of test dataset {}'.format(round(accTestLR*100,2)))
  
# Support Vector Machine Classifier
def suppVM(xTrain, yTrain, xTest, yTest, kernelSVM):
  supportVM = SVC(kernel=kernelSVM)
  supportVM.fit(xTrain, yTrain)
  yPredSVM = supportVM.predict(xTest)
  
  accTestSVM = metrics.accuracy_score(yPredSVM, yTest)
  #accTrainSVM = SVC.score(xTrain, yTrain)
  print('\nClassifiction using Support Vector Machine:')
  #print('Accuracy of training dataset {}'.format(round(accTrainSVM*100,2)))
  print('Accuracy of test dataset {}'.format(round(accTestSVM*100,2)))

In [0]:
decision3SKL(X_train, y_train, X_test, y_test, 'entropy', None)
knnSKL(X_train, y_train, X_test, y_test, 3)
logistik(X_train, y_train, X_test, y_test, 0.01, 'newton-cg')
suppVM(X_train, y_train, X_test, y_test, 'rbf')

Classification using Decision Tree: 
Accuracy of training dataset 98.56
Accuracy of test dataset 79.48

Classification using K Neighbors Classifier: 
Number of Nearest Neighbors 3
Accuracy training dataset 83.79
Accuracy test dataset 72.76

Classifiction using Logistic Regression:
Accuracy of training dataset 73.03
Accuracy of test dataset 76.49

Classifiction using Support Vector Machine:
Accuracy of test dataset 69.78




FROM RUNNING OF MACHINE LEARNING MODELS ABOVE. THE DECISION TREE CLASSIFIER GETS THE HIGH ACCURACY