In [59]:
#Importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

### STEP 1: DATA Operation 

#### Download and Initial Finding

In [98]:
#opening the dataset in the pandas lib
#Using this link to download the data from the kaggle : https://www.kaggle.com/c/titanic/data
titanic_data = pd.read_csv('titanic.csv')

In [99]:
#Showing the first 5 instances of the data
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [100]:
# Information about the columns and its values

'''
Variable	Definition	Key
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
'''

'\nVariable\tDefinition\tKey\nsurvival\tSurvival\t0 = No, 1 = Yes\npclass\tTicket class\t1 = 1st, 2 = 2nd, 3 = 3rd\nsex\tSex\t\nAge\tAge in years\t\nsibsp\t# of siblings / spouses aboard the Titanic\t\nparch\t# of parents / children aboard the Titanic\t\nticket\tTicket number\t\nfare\tPassenger fare\t\ncabin\tCabin number\t\nembarked\tPort of Embarkation\tC = Cherbourg, Q = Queenstown, S = Southampton\n'

In [101]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [102]:
'''
We can see here there are 418 rows and 12 columns in the train data set, lets just quickly checks how many columns have NULL value
'''
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

### STEP 2: PRE-PROCESSING 

In [103]:
#Pre-Processing
'''
As we have seen that 3 of the columns named: Age, Cabin and Embarked are having the NULL values

1. We would simply drop the column cabin- as it is not much of importance
2. For the age column we will replace the null value withe mean value
3. For Fare column we will use mean value .

'''

titanic_data = titanic_data.drop(columns='Cabin')
titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)
titanic_data['Fare'].fillna(titanic_data['Fare'].mean(), inplace=True)

In [104]:
'''
Running the IsNULL method for the verification
'''
titanic_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [105]:
'''
As we have seen that SEX and Embarked are having the string values, lets just quickly changes it into the categorical numerical values
'''
titanic_data.replace({'Sex':{'male':0,'female':1}, 'Embarked':{'S':0,'C':1,'Q':2}}, inplace=True)

In [106]:
titanic_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,0,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,2
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,0
2,894,0,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,2
3,895,0,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,0
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,0


In [107]:
# Removing not so important columns
 titanic_data.drop(columns = ['PassengerId','Name','Ticket'],axis=1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,34.50000,0,0,7.8292,2
1,1,3,1,47.00000,1,0,7.0000,0
2,0,2,0,62.00000,0,0,9.6875,2
3,0,3,0,27.00000,0,0,8.6625,0
4,1,3,1,22.00000,1,1,12.2875,0
...,...,...,...,...,...,...,...,...
413,0,3,0,30.27259,0,0,8.0500,0
414,1,1,1,39.00000,0,0,108.9000,1
415,0,3,0,38.50000,0,0,7.2500,0
416,0,3,0,30.27259,0,0,8.0500,0


### Step 3:  Data Prepration

In [108]:
'''
Splitting the data into features and  target values
'''
X = titanic_data.drop(columns = ['Survived','PassengerId','Name','Ticket'],axis=1)
Y = titanic_data['Survived']

In [109]:
print(X.head())
print(Y.head())

   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0       3    0  34.5      0      0   7.8292         2
1       3    1  47.0      1      0   7.0000         0
2       2    0  62.0      0      0   9.6875         2
3       3    0  27.0      0      0   8.6625         0
4       3    1  22.0      1      1  12.2875         0
0    0
1    1
2    0
3    0
4    1
Name: Survived, dtype: int64


In [110]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=2)

### Step 4 : Model Prepration - Logistic Regression

In [111]:
model = LogisticRegression()

In [134]:
model.fit(X_train, Y_train)

LogisticRegression()

In [135]:
print(model.coef_)

[[-8.85245522e-02  6.30141189e+00 -5.63138059e-03  2.00058275e-02
   1.72017148e-01  2.37046344e-03  2.15517287e-01]]


In [168]:
'''
Here from the coefficient, we gets the magnitude of the influence of the parameter 
'''
cols = X_train.columns
inf = model.coef_
idx = np.argsort(inf[0])
cols = list(cols[idx])
#Order in which it makes influence 
cols[::-1]

['Sex', 'Embarked', 'Parch', 'SibSp', 'Fare', 'Age', 'Pclass']

### Step 5 : Model Evaluation 

In [113]:
'''
Train Data Accuracy
'''
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  1.0


In [114]:
'''
Test Data Accuracy
'''
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  1.0


### Checking for the Random person

In [115]:
input_data = (3,0,35,0,0,8.05,0)  # Note that these datas exclude the Survived data, as it is to be determined from the model itself

In [116]:
input_data_as_numpy_array = np.asarray(input_data)

In [117]:
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

In [118]:
prediction = model.predict(input_data_reshaped)
#print(prediction)
if prediction[0]==0:
    print("Dead")
if prediction[0]==1:
    print("Alive")

Dead


In [169]:
## Putting more parents/children info along with male, so to make that person more favorable for survival
input_data2 = (1,0,33,0,20,500,0) 
input_data_as_numpy_array = np.asarray(input_data2)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
#print(prediction)
if prediction[0]==0:
    print("Dead")
if prediction[0]==1:
    print("Alive")

Alive


In [129]:
# Female info 
input_data3 = (1,1,22,0,0,50,0) 
input_data_as_numpy_array = np.asarray(input_data3)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
#print(prediction)
if prediction[0]==0:
    print("Dead")
if prediction[0]==1:
    print("Alive")

Alive


In [133]:
# if you are a woman or girl your chance of survival would had been high