# Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

# Data

In [2]:
titanic = pd.read_csv("train_titanic.csv")
test = pd.read_csv("test.csv")

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Drop Pointless Columns and target variable

In [4]:
X = titanic.drop(['Survived',"Name","Ticket"], axis=1)
y = titanic['Survived']

#X_test = titanic.drop(['Survived',"Name","Ticket"], axis=1)

# Split Data into Test and Train

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

# Missing Values

In [6]:
print("*****In the train set*****")
print(X_train.isna().sum())
print("\n")
print("*****In the test set*****")
print(X_test.isna().sum())

*****In the train set*****
PassengerId      0
Pclass           0
Sex              0
Age            136
SibSp            0
Parch            0
Fare             0
Cabin          550
Embarked         2
dtype: int64


*****In the test set*****
PassengerId      0
Pclass           0
Sex              0
Age             41
SibSp            0
Parch            0
Fare             0
Cabin          137
Embarked         0
dtype: int64


In [7]:
# Fill missing values with mean column values in the train set
X_train.fillna(X_train.mean(), inplace=True)
# Fill missing values with mean column values in the test set
X_test.fillna(X_test.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


# Convert non-numeric to numeric through label encoding

In [8]:
from sklearn import preprocessing 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [9]:
X_train.dtypes
X_test.dtypes

PassengerId      int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [10]:
X_train[['Sex', 'Cabin', 'Embarked']] = X_train[['Sex', 'Cabin', 'Embarked']].astype(str)
X_test[['Sex', 'Cabin', 'Embarked']] = X_test[['Sex', 'Cabin', 'Embarked']].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [11]:
# encode variables into numeric labels
le = LabelEncoder()

columns = ['Sex', 'Embarked', 'Cabin']

for col in columns:
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    
for col in columns:
    le.fit(X_test[col])
    X_test[col] = le.transform(X_test[col])
    
X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
523,524,1,0,44.0,0,1,57.9792,13,0
432,433,2,0,42.0,1,0,26.0,127,2
217,218,2,1,42.0,1,0,27.0,127,2
607,608,1,1,27.0,0,0,30.5,127,2
476,477,2,1,34.0,1,0,21.0,127,2


# Run SVM Model

In [12]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
y_pred = svclassifier.predict(X_test)

# Results

In [14]:
#The confusion matrix shows:
#Top left = True Negative
#Bottom Left = False Negative
#Top Right = True Positive
#Bottom Right = False Positive

In [15]:
#The Classification Report shows:

#Precision – Accuracy of positive predictions.
#Precision = TP/(TP + FP)

#Recall: Fraction of positives that were correctly identified.
#Recall = TP/(TP+FN)

#As a rule of thumb the weighted average of F1 should be used to compare classifier alogorithms
#F1 Score = 2*(Recall * Precision) / (Recall + Precision)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[90 19]
 [18 52]]
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       109
           1       0.73      0.74      0.74        70

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

