#### Applying simple logistic regression on Titanic dataset

In [1]:
# importing required libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import sklearn.metrics

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training split of the Titanic data from the local Data folder,
# then preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer="./Data/train_titanic.csv")
train_data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,28.5,7.2292,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,1,0,0
1,1,27.0,10.5,0,1,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,1,29.699118,16.1,0,0,1,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,0,29.699118,0.0,1,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
4,0,17.0,8.6625,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [3]:
# Read the testing split of the Titanic data from the local Data folder,
# then preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer="./Data/test_titanic.csv")
test_data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,35.0,7.125,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
1,0,20.0,7.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
2,0,26.0,7.8958,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,58.0,146.5208,1,0,0,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,1,35.0,83.475,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [4]:
# Report the (rows, columns) shape of each split
train_shape, test_shape = train_data.shape, test_data.shape
print('Shape of training data :', train_shape)
print('Shape of testing data :', test_shape)

Shape of training data : (712, 25)
Shape of testing data : (179, 25)


Next, we train a model to predict the target variable for the test data and compare the predictions with its known labels.
Target variable: Survived

In [5]:
# Separate the target column from the independent (feature) columns in the training data
train_y = train_data['Survived']
train_x = train_data.drop(columns=['Survived'])

In [6]:
# Separate the target column from the independent (feature) columns in the testing data
test_y = test_data['Survived']
test_x = test_data.drop(columns=['Survived'])

'''
Create the Logistic Regression model object.
You can also pass other parameters here and test your code.
Some of the available parameters are: fit_intercept and penalty.
Documentation of sklearn LogisticRegression:

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

'''

In [7]:
### Model creation
# Instantiate the classifier with no arguments, i.e. the library's default
# hyper-parameters are used.
# NOTE(review): warnings are silenced in the import cell
# (warnings.filterwarnings("ignore")), so any solver warning raised while
# fitting this model would not be shown -- confirm the fit converges.
model = LogisticRegression()

In [8]:
# Fit the classifier on the training features and their Survived labels
model.fit(X=train_x, y=train_y)

In [9]:
# Show the weights learned by the fitted model
coefficients = model.coef_
print('Coefficients of the model are: ', coefficients)

Coefficients of the model are:  [[-0.02994753  0.00180934  0.88465128  0.09918644 -0.99937026  1.23813804
  -1.25367058  1.00981822  0.96176804  0.60861592 -1.1139701  -0.76392069
  -0.27898073 -0.4388632   0.15413063  0.60781533 -0.03316805  0.20281203
  -0.45097758 -0.33138694 -0.16475796  0.08570664  0.27028198 -0.37152115]]


In [10]:
## Intercept (bias term) learned by the fitted model
intercepts = model.intercept_
print("Intercepts of the model are: ", intercepts)

Intercepts of the model are:  [0.06352384]


In [29]:
## Predicted class probabilities for the training data

# One row per training sample; the two probability columns are labelled on the
# assumption that predict_proba returns class 0 (not survived) first and
# class 1 (survived) second -- TODO confirm against model.classes_.
train_prob_df = pd.DataFrame(
    data=model.predict_proba(train_x),
    columns=["Probability of Not Survived", "Probability of Survived"],
)
# Attach the true label next to the predicted probabilities for comparison
train_prob_df["Actual"] = train_y.values
train_prob_df

Unnamed: 0,Probability of Not Survived,Probability of Survived,Actual
0,0.676931,0.323069,0
1,0.663014,0.336986,1
2,0.654311,0.345689,1
3,0.700796,0.299204,0
4,0.638748,0.361252,0
...,...,...,...
707,0.464891,0.535109,1
708,0.657954,0.342046,0
709,0.110697,0.889303,0
710,0.615440,0.384560,1


In [31]:
## Predicted class probabilities for the testing data
# Stored under its own name (test_prob_df) so the training table held in
# train_prob_df is not overwritten with test-set values.
# The two probability columns are labelled on the assumption that
# predict_proba returns class 0 (not survived) first and class 1 (survived)
# second -- TODO confirm against model.classes_.
test_prob_df = pd.DataFrame(
    model.predict_proba(test_x),
    columns=["Probability of Not Survived", "Probability of Survived"],
)
# Attach the true label next to the predicted probabilities for comparison
test_prob_df["Actual"] = test_y.values
# Bare last expression gives the notebook's rich table display
test_prob_df

Unnamed: 0,Probability of Not Survived,Probability of Survived,Actual
0,0.695576,0.304424,0
1,0.652628,0.347372,0
2,0.667766,0.332234,0
3,0.328131,0.671869,1
4,0.454131,0.545869,1
...,...,...,...
174,0.723658,0.276342,0
175,0.631525,0.368475,0
176,0.717501,0.282499,0
177,0.757568,0.242432,0


In [11]:
## Predicted Survived labels for every row of the training features
predict_train = model.predict(X=train_x)
predict_train

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,

In [12]:
## Accuracy on the train dataset
# Scale the fraction to a percentage first and round afterwards. Rounding
# first and then multiplying by 100 can reintroduce floating-point noise
# (e.g. round(0.57, 2) * 100 evaluates to 56.99999999999999).
# Rounding to 0 decimals keeps the whole-percent float format, e.g. "81.0 %".
print("Model accuracy for training dataset is", round(accuracy_score(train_y, predict_train) * 100, 0), "%")

Model accuracy for training dataset is 81.0 %


In [13]:
## Prediction on the test data
predict_test = model.predict(test_x)
predict_test

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1], dtype=int64)

In [14]:
## Accuracy score on the test dataset
# Scale the fraction to a percentage first and round afterwards. Rounding
# first and then multiplying by 100 can reintroduce floating-point noise
# (e.g. round(0.57, 2) * 100 evaluates to 56.99999999999999).
# Rounding to 0 decimals keeps the whole-percent float format, e.g. "83.0 %".
print("Model accuracy for Testing dataset is", round(accuracy_score(test_y, predict_test) * 100, 0), "%")

Model accuracy for Testing dataset is 83.0 %
