# Logistic Regression

I will use the logistic regression model for predicting the survival of passengers on the Titanic because the result is binary which means that there can only be two outcomes (live or die).

# Code


In [2]:
# Import everything
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from statsmodels.formula.api import logit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
# Import titanic dataset

titanic_df = pd.read_csv("titanic.csv")

titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Turn the sex column into 1's and 0'set

labels = LabelEncoder()

titanic_df['Sex'] = labels.fit_transform(titanic_df['Sex'].values)

# See the new data

titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [5]:
# Create a new dataframe

new_df = titanic_df[['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex']].copy()

# See the correlation for the new dataframe (essentially a heat map)

new_df.corr()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex
Survived,1.0,-0.077221,-0.035322,0.081629,0.257307,-0.543351
Age,-0.077221,1.0,-0.308247,-0.189119,0.096067,0.093254
SibSp,-0.035322,-0.308247,1.0,0.414838,0.159651,-0.114631
Parch,0.081629,-0.189119,0.414838,1.0,0.216225,-0.245489
Fare,0.257307,0.096067,0.159651,0.216225,1.0,-0.182333
Sex,-0.543351,0.093254,-0.114631,-0.245489,-0.182333,1.0


In [6]:
# X is what we are going to use to predict y

X = titanic_df[['Sex', 'Pclass']]

# y is what is going to be predicted (survival)

y = np.array(titanic_df['Survived']).reshape(-1,1)

In [7]:
# Split the data using train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [8]:
# Fit the training data to the Logistic Regression model

regression = LogisticRegression()

regression.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [9]:
# Test the model

survival = regression.predict(X_test)

survival

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0])

In [10]:
# See the accuracy

accuracy_score = regression.score(X_test, y_test)

accuracy_score

0.7910447761194029