### Applied Machine Learning
# Logistic Regression

<hr>


In [1]:
# importing libraries 
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# load training dataset 
titanic_train = pd.read_csv("titanic_train.csv")

#titanic_train = titanic_train.dropna()
print(titanic_train.columns)
display(titanic_train.isnull().sum()) # display missing values 
print("Total:", titanic_train.shape[0])

# dropping rows with missing Embarked
titanic_train = titanic_train[pd.notnull(titanic_train["Embarked"])]

# setting missing age to mean 
titanic_age = titanic_train[pd.notnull(titanic_train["Age"])]["Age"]
titanic_age_mean = round(sum(titanic_age) / len(titanic_age))
titanic_train["Age"] = titanic_train["Age"].replace(np.nan, titanic_age_mean)

display(titanic_train.isnull().sum())
print("Total:", titanic_train.shape[0])

# cleaning cabin removing numbers and keeping letters only 
#titanic_train["Cabin"] = titanic_train["Cabin"].str.replace("[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "")
#display(titanic_train["Cabin"].value_counts())
#titanic_train["Ticket"] = titanic_train["Ticket"].str.replace("[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]", "")
#display(titanic_train["Ticket"].value_counts())

# encoding categorical columns
encode_columns = ["Sex", "Embarked", "Cabin", "Ticket"]
for column in encode_columns:
    #display(titanic_train[column].value_counts())
    titanic_train[column] = titanic_train[column].astype("category").cat.codes
    #display(titanic_train[column].value_counts())
    
display(titanic_train.head())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Total: 891


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

Total: 889


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,522,7.25,-1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,595,71.2833,80,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,668,7.925,-1,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,48,53.1,54,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,471,8.05,-1,2


In [3]:
# data exploration 
display(titanic_train["Survived"].value_counts())

count_survived = len(titanic_train[titanic_train["Survived"] == 1])
count_no_survived = len(titanic_train[titanic_train["Survived"] == 0])
perc_survived = count_survived/(count_survived + count_no_survived) * 100
perc_no_survived = count_no_survived/(count_survived + count_no_survived) * 100

# we do have a reasonable ratio to train
print("Survived Percentage:" , perc_survived, "\nNot Survived:" , perc_no_survived)
print("Survive Ratio:", round(perc_survived), ":" , round(perc_no_survived))

0    549
1    340
Name: Survived, dtype: int64

Survived Percentage: 38.24521934758155 
Not Survived: 61.754780652418454
Survive Ratio: 38 : 62


In [4]:
# cabin was left out as too many missing values 
feature_columns = ["Pclass", "Sex", "SibSp", "Age" ,"Ticket", "Fare", "Embarked"]
X = titanic_train[feature_columns]
y = titanic_train["Survived"]

# splitting training and test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
 
# ratio of survive/no survive for training
perc_survived_train = round(sum(1 for x in y_train if x == 1) / y_train.count() * 100)
perc_no_survived_train = round(sum(1 for x in y_train if x == 0) / y_train.count() * 100)
print("Survive Ratio (Train):", perc_survived_train, ":" , perc_no_survived_train)

# ratio of survive/no survive for test
perc_survived_test = round(sum(1 for x in y_test if x == 1) / y_test.count() * 100)
perc_no_survived_test = round(sum(1 for x in y_test if x == 0) / y_test.count() * 100)
print("Survive Ratio (Test):", perc_survived_test, ":" , perc_no_survived_test)

Survive Ratio (Train): 39.0 : 61.0
Survive Ratio (Test): 37.0 : 63.0


In [8]:
# logistic regression using scikit 
log_reg = LogisticRegression(max_iter = 1000, solver = "lbfgs")
log_reg.fit(X_train, y_train)

# predict the test data
y_pred = log_reg.predict(X_test)
print("\nAccuracy Logistic Regression (scikit):", log_reg.score(X_test, y_test))
print("\n" + classification_report(y_test, y_pred))


Accuracy Logistic Regression (scikit): 0.797752808988764

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       226
           1       0.74      0.68      0.71       130

   micro avg       0.80      0.80      0.80       356
   macro avg       0.78      0.77      0.78       356
weighted avg       0.80      0.80      0.80       356

