# Logistic Regressions

## Data Import and Cleaning

Getting started with logistic regression.

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

This notebook uses the Kaggle Titanic dataset to practice logistic regression:
https://www.kaggle.com/c/titanic

In [3]:
train = pd.read_csv('titanic_train.csv')

In [4]:
test = pd.read_csv('titanic_test.csv')

In [29]:
test_survive = pd.read_csv('survival_test.csv')

In [49]:
test_survive.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Convert Pclass, Sex, Cabin and Embarked to indicator (dummy) variables.

In [6]:
# Indicator for Sex; with the first category dropped only the `male` column remains.
male = pd.get_dummies(train['Sex'], drop_first=True)
male.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [7]:
# Indicators for port of embarkation; the first category is dropped as the
# baseline, leaving the Q and S columns.
embark = pd.get_dummies(train['Embarked'], drop_first=True)
embark.head()

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [8]:
# Indicators for passenger class; the first class is dropped as the baseline,
# leaving columns labelled 2 and 3.
fare_class = pd.get_dummies(train['Pclass'], drop_first=True)
fare_class.head()

Unnamed: 0,2,3
0,0,1
1,0,0
2,0,1
3,0,0
4,0,1


In [9]:
# Many Cabin values are missing, so reduce the column to a 0/1 flag that
# records whether a cabin value is present at all.
cabin_known = train['Cabin'].notnull() * 1
cabin_known.head()

0    0
1    1
2    0
3    1
4    0
Name: Cabin, dtype: int64

Create the training data frame with our indicator variables

In [10]:
# Start the training frame from the raw data minus the identifier, the
# free-text columns, and the categoricals that get their own indicator columns.
training = train.drop(
    columns=['PassengerId', 'Sex', 'Embarked', 'Name', 'Ticket', 'Cabin'],
)

In [11]:
training.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [12]:
# Append the indicator columns to the remaining features, side by side.
# NOTE(review): Pclass stays in the frame as a number and is also added as the
# 2/3 dummy columns - confirm that keeping both is intended.
training = pd.concat(
    [training, male, embark, cabin_known, fare_class],
    axis=1,
)

In [13]:
training.tail()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Cabin,2,3
886,0,2,27.0,0,0,13.0,1,0,1,0,1,0
887,1,1,19.0,0,0,30.0,0,0,1,1,0,0
888,0,3,,1,2,23.45,0,0,1,0,0,1
889,1,1,26.0,0,0,30.0,1,0,0,1,0,0
890,0,3,32.0,0,0,7.75,1,1,0,0,0,1


Checking for null values

In [14]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
male        891 non-null uint8
Q           891 non-null uint8
S           891 non-null uint8
Cabin       891 non-null int64
2           891 non-null uint8
3           891 non-null uint8
dtypes: float64(2), int64(5), uint8(5)
memory usage: 53.2 KB


Create an indicator for whether Age is missing, and set Age to -1 where it is missing.

<b>TODO (question for Brad): is flagging missing ages with an indicator and filling Age with -1 a good approach?</b>

In [15]:
# 0/1 flag: 1 where the passenger's age is missing, 0 where it is recorded.
age_unknown = train['Age'].isnull() * 1

In [16]:
# Prepend the missing-age flag as the first column. The Series was derived
# from train.Age and still carries the name 'Age', so until it is renamed
# the frame holds two columns with that label.
training = pd.concat([age_unknown,training],axis=1)

In [17]:
# Rename only the first column (the missing-age flag, which arrives labelled
# 'Age' just like the real age column) by assigning a fresh list of labels.
# Writing into `training.columns.values` edits the Index's backing array
# directly; Index objects are meant to be immutable, so that write can leave
# label lookups stale or be rejected outright by newer pandas versions.
training.columns = ["age_unknown"] + list(training.columns[1:])
training.head()

Unnamed: 0,age_unknown,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Cabin,2,3
0,0,0,3,22.0,1,0,7.25,1,0,1,0,0,1
1,0,1,1,38.0,1,0,71.2833,0,0,0,1,0,0
2,0,1,3,26.0,0,0,7.925,0,0,1,0,0,1
3,0,1,1,35.0,1,0,53.1,0,0,1,1,0,0
4,0,0,3,35.0,0,0,8.05,1,0,1,0,0,1


In [18]:
def modify_Age(u):
    """Return ``u`` when it is strictly positive, otherwise ``-1``.

    NaN fails the ``u > 0`` comparison, so a missing age comes back as ``-1``;
    zero and negative inputs are mapped to ``-1`` as well.
    """
    return u if u > 0 else -1

In [19]:
# Replace missing ages with the -1 sentinel, one value at a time.
training['Age'] = training['Age'].apply(modify_Age)

In [20]:
training.head()

Unnamed: 0,age_unknown,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Cabin,2,3
0,0,0,3,22.0,1,0,7.25,1,0,1,0,0,1
1,0,1,1,38.0,1,0,71.2833,0,0,0,1,0,0
2,0,1,3,26.0,0,0,7.925,0,0,1,0,0,1
3,0,1,1,35.0,1,0,53.1,0,0,1,1,0,0
4,0,0,3,35.0,0,0,8.05,1,0,1,0,0,1


In [21]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
age_unknown    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
male           891 non-null uint8
Q              891 non-null uint8
S              891 non-null uint8
Cabin          891 non-null int64
2              891 non-null uint8
3              891 non-null uint8
dtypes: float64(2), int64(6), uint8(5)
memory usage: 60.1 KB


## Preparing Test Data

In [87]:
# Build the same indicator columns for the test set as were built for training.
male2 = pd.get_dummies(test['Sex'], drop_first=True)
embark2 = pd.get_dummies(test['Embarked'], drop_first=True)
fare_class2 = pd.get_dummies(test['Pclass'], drop_first=True)
# 0/1 flag for whether a cabin value is present.
cabin_known2 = test['Cabin'].notnull() * 1

In [88]:
# Same column removal as for the training frame: identifier, free text, and
# the categoricals that are replaced by indicator columns.
testing = test.drop(
    columns=['PassengerId', 'Sex', 'Embarked', 'Name', 'Ticket', 'Cabin'],
)

In [89]:
# Append the test-set indicator columns in the same order used for training.
testing = pd.concat(
    [testing, male2, embark2, cabin_known2, fare_class2],
    axis=1,
)

In [90]:
test.shape[0]

418

In [54]:
testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
Pclass    418 non-null int64
Age       332 non-null float64
SibSp     418 non-null int64
Parch     418 non-null int64
Fare      417 non-null float64
male      418 non-null uint8
Q         418 non-null uint8
S         418 non-null uint8
Cabin     418 non-null int64
2         418 non-null uint8
3         418 non-null uint8
dtypes: float64(2), int64(4), uint8(5)
memory usage: 21.7 KB


In [55]:
# 0/1 flag for passengers whose age is missing. It is named up front so the
# new column never shares the 'Age' label with the real age column; this
# replaces the earlier write into `testing.columns.values`, which edited the
# Index's backing array directly and is not a supported way to rename.
age_unknown2 = (test.Age.isnull()*1).rename("age_unknown")
# Drop any flag left by a previous run so the cell can be re-executed without
# stacking up duplicate `age_unknown` columns.
testing = pd.concat(
    [age_unknown2, testing.drop("age_unknown", axis=1, errors="ignore")],
    axis=1,
)

0      0
1      1
2      0
3      0
4      1
5      0
6      1
7      0
8      1
9      0
10     0
11     0
12     1
13     0
14     1
15     1
16     0
17     0
18     1
19     1
20     0
21     0
22     1
23     0
24     1
25     0
26     1
27     0
28     0
29     0
      ..
388    0
389    0
390    0
391    1
392    0
393    0
394    0
395    1
396    0
397    1
398    0
399    0
400    1
401    0
402    1
403    0
404    0
405    0
406    0
407    0
408    1
409    1
410    1
411    1
412    1
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [56]:
# Replace missing test-set ages with the same -1 sentinel used for training.
testing['Age'] = testing['Age'].apply(modify_Age)

In [57]:
testing.tail()

Unnamed: 0,age_unknown,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Cabin,2,3
413,1,3,-1.0,0,0,8.05,1,0,1,0,0,1
414,0,1,39.0,0,0,108.9,0,0,0,1,0,0
415,0,3,38.5,0,0,7.25,1,0,1,0,0,1
416,1,3,-1.0,0,0,8.05,1,0,1,0,0,1
417,1,3,-1.0,1,1,22.3583,1,0,0,0,0,1


In [95]:
testing.shape[0]


418

In [96]:
# Attach the true labels as the first column so predictions can be scored
# against them later. Any `Survived` column left over from an earlier run of
# this cell is dropped first; otherwise re-running would add a duplicate
# column and `testing.Survived` would no longer be a single Series.
testing = pd.concat(
    [test_survive.Survived, testing.drop('Survived', axis=1, errors='ignore')],
    axis=1,
)

In [67]:
testing.head()

Unnamed: 0,age_unknown,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Cabin,2,3,0
0,0.0,3.0,34.5,0.0,0.0,7.8292,1.0,1.0,0.0,0.0,0.0,1.0,
1,0.0,3.0,47.0,1.0,0.0,7.0,0.0,0.0,1.0,0.0,0.0,1.0,
2,0.0,2.0,62.0,0.0,0.0,9.6875,1.0,1.0,0.0,0.0,1.0,0.0,
3,0.0,3.0,27.0,0.0,0.0,8.6625,1.0,0.0,1.0,0.0,0.0,1.0,
4,0.0,3.0,22.0,1.0,1.0,12.2875,0.0,0.0,1.0,0.0,0.0,1.0,


## Fitting the Model

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Separate the target column from the predictors.
y_train = training['Survived']
X_train = training.drop(columns='Survived')

In [41]:
# Pin the solver explicitly. Left unset on this scikit-learn version it shows
# up as solver='warn' in the fitted model's repr (a deprecation placeholder)
# and newer releases resolve the default to a different optimiser.
# 'liblinear' is the optimiser the placeholder falls back to, so the fit is
# unchanged here and stays the same after an upgrade.
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [42]:
testing.head()

Unnamed: 0,age_unknown,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Cabin,2,3
0,0,3,34.5,0,0,7.8292,1,1,0,0,0,1
1,0,3,47.0,1,0,7.0,0,0,1,0,0,1
2,0,2,62.0,0,0,9.6875,1,1,0,0,1,0
3,0,3,27.0,0,0,8.6625,1,0,1,0,0,1
4,0,3,22.0,1,1,12.2875,0,0,1,0,0,1


In [43]:
training.head()

Unnamed: 0,age_unknown,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Cabin,2,3
0,0,0,3,22.0,1,0,7.25,1,0,1,0,0,1
1,0,1,1,38.0,1,0,71.2833,0,0,0,1,0,0
2,0,1,3,26.0,0,0,7.925,0,0,1,0,0,1
3,0,1,1,35.0,1,0,53.1,0,0,1,1,0,0
4,0,0,3,35.0,0,0,8.05,1,0,1,0,0,1


In [97]:
# Remove, in place, every row that still has a missing value. Fare has one
# null in the test set; rows with a null Age are dropped here as well unless
# the -1 imputation cell has already been run on this frame, so the number of
# scored rows depends on the cells having been executed in order.
testing.dropna(inplace=True)

In [98]:
# Score only the columns the model was fitted on, in the same order.
# `testing` also carries the true `Survived` labels at this point (added for
# evaluation); passing the whole frame hands the answer to the model as if it
# were a feature. Selecting by X_train.columns also fails loudly with a
# KeyError if a feature such as `age_unknown` is missing because the
# preparation cells were run out of order.
predictions = logmodel.predict(testing[X_train.columns])

In [104]:
from sklearn.metrics import confusion_matrix

In [105]:
# True labels first, predictions second: rows are the actual classes and
# columns the predicted classes (0 before 1), so each row sums to that
# class's support in the classification report.
confusion_matrix(testing.Survived,predictions)

array([[190,  14],
       [ 52,  75]])

In [99]:
from sklearn.metrics import classification_report

In [101]:
# The report is a pre-formatted text table, so print it to keep the layout.
print(classification_report(testing.Survived,predictions))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85       204
           1       0.84      0.59      0.69       127

   micro avg       0.80      0.80      0.80       331
   macro avg       0.81      0.76      0.77       331
weighted avg       0.81      0.80      0.79       331

