## Importing Necessary Libraries

In [1]:
# for analysis
import pandas as pd
import numpy as np

# for visulaization
import matplotlib.pyplot as plt
import seaborn as sns

# extra
import warnings
warnings.filterwarnings(action="ignore")

## Data Loading

In [2]:
df=pd.read_csv("data.csv") # Titanic Data
#https://drive.google.com/file/d/1SmKv_XVNcpJVuJkr3xKyRTsNN0_pt1Z6/view?usp=sharing

In [3]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
## We need to build a logistic regression model to predict whether a passenger 
## will survive or not in the titatnic incident

In [4]:
df["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Missing Values

In [7]:
100*df.isnull().mean()

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [8]:
df=df.drop(["Cabin"],axis=1)

In [9]:
100*df.isnull().mean()

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Embarked        0.224467
dtype: float64

In [10]:
med=df["Age"].median()#Numerical
df["Age"]=df["Age"].fillna(med)

In [11]:
mod=df["Embarked"].mode()[0]#Categorical
df["Embarked"]=df["Embarked"].fillna(mod)

In [12]:
100*df.isnull().mean()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Embarked       0.0
dtype: float64

In [None]:
# EDA: We are skipping EDA here, considering the time and agenda limitation otherwise EDA always needs to be done
# without fail in every ML/DS problem

## PreProcessing

In [13]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [14]:
df["Sex"]=df["Sex"].map({"male":0,"female":1})

In [15]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,S


In [16]:
dum_cols=["Pclass","SibSp","Parch","Embarked"]#2+6+6+2
# Dummies to be created for all categorical columns which are having more than 2 unique values

In [17]:
df["Pclass"]=df["Pclass"].astype(object)
df["SibSp"]=df["SibSp"].astype(object)
df["Parch"]=df["Parch"].astype(object)

In [18]:
dum=pd.get_dummies(df[dum_cols],drop_first=True,dtype=int)
df=pd.concat([df,dum],axis=1)

In [19]:
df=df.drop(dum_cols,axis=1)

In [20]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,Ticket,Fare,Pclass_2,Pclass_3,SibSp_1,...,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",0,22.0,A/5 21171,7.25,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,PC 17599,71.2833,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df.shape

(891, 23)

In [22]:
df.columns

Index(['PassengerId', 'Survived', 'Name', 'Sex', 'Age', 'Ticket', 'Fare',
       'Pclass_2', 'Pclass_3', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4',
       'SibSp_5', 'SibSp_8', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4',
       'Parch_5', 'Parch_6', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [23]:
df=df.drop(["PassengerId","Name","Ticket"],axis=1)

In [24]:
df.shape

(891, 20)

In [None]:
11+16->27-4=23-3=20

In [25]:
df.head()

Unnamed: 0,Survived,Sex,Age,Fare,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,0,0,22.0,7.25,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,38.0,71.2833,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,26.0,7.925,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,1,35.0,53.1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,35.0,8.05,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [26]:
df.columns

Index(['Survived', 'Sex', 'Age', 'Fare', 'Pclass_2', 'Pclass_3', 'SibSp_1',
       'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_1',
       'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [27]:
X=df.drop("Survived",axis=1)
y=df["Survived"]

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=100)

In [29]:
print("Train X shape:",X_train.shape)
print("Test X shape:",X_test.shape)
print("Train y shape:",y_train.shape)
print("Test y shape:",y_test.shape)

Train X shape: (712, 19)
Test X shape: (179, 19)
Train y shape: (712,)
Test y shape: (179,)


## Scaling

In [30]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [31]:
X_train_scaled

array([[0.        , 0.27969557, 0.01517579, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.11660777, 0.0915427 , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.38842077, 0.01512699, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [1.        , 0.52432726, 0.16231419, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.37483012, 0.13575256, ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.40201142, 0.18249985, ..., 0.        , 0.        ,
        1.        ]])

In [32]:
X_test_scaled=pd.DataFrame(X_test_scaled,columns=X_test.columns)
X_train_scaled=pd.DataFrame(X_train_scaled,columns=X_train.columns)

In [33]:
X_train_scaled

Unnamed: 0,Sex,Age,Fare,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,0.0,0.279696,0.015176,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.116608,0.091543,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.388421,0.015127,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.537918,0.299539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.374830,0.027058,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.238924,0.018250,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
708,1.0,0.633052,0.102579,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
709,1.0,0.524327,0.162314,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
710,1.0,0.374830,0.135753,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## Feature Selection

In [34]:
from sklearn.feature_selection import RFE#Recursive Feature Elimination
from sklearn.linear_model import LogisticRegression

estimator=LogisticRegression()
selector=RFE(estimator,n_features_to_select=10)# n_features_to_select generally ranges in 10 to 20
selector.fit(X_train_scaled,y_train)
selector.support_

array([ True,  True,  True, False,  True, False, False,  True,  True,
        True,  True, False, False, False,  True,  True, False, False,
       False])

In [35]:
cols_to_keep=X_train_scaled.columns[selector.support_]

In [36]:
cols_to_keep

Index(['Sex', 'Age', 'Fare', 'Pclass_3', 'SibSp_3', 'SibSp_4', 'SibSp_5',
       'SibSp_8', 'Parch_4', 'Parch_5'],
      dtype='object')

In [38]:
X_train=X_train_scaled[cols_to_keep]
X_test=X_test_scaled[cols_to_keep]

In [39]:
print("Train X shape:",X_train.shape)
print("Test X shape:",X_test.shape)
print("Train y shape:",y_train.shape)
print("Test y shape:",y_test.shape)

Train X shape: (712, 10)
Test X shape: (179, 10)
Train y shape: (712,)
Test y shape: (179,)


## First Model

In [41]:
import statsmodels.api as sm

In [42]:
X_train_sm=sm.add_constant(X_train,has_constant="add")
X_test_sm=sm.add_constant(X_test,has_constant="add")

In [43]:
model1=sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
# Binomial --> Binary Classification
result1=model1.fit()
result1.summary()
#Generalised Linear Model

0,1,2,3
Dep. Variable:,y,No. Observations:,712.0
Model:,GLM,Df Residuals:,701.0
Model Family:,Binomial,Df Model:,10.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-308.91
Date:,"Sat, 03 Aug 2024",Deviance:,617.81
Time:,10:26:55,Pearson chi2:,716.0
No. Iterations:,22,Pseudo R-squ. (CS):,0.3658
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0429,0.338,-0.127,0.899,-0.706,0.620
Sex,2.6759,0.217,12.304,0.000,2.250,3.102
Age,-2.2668,0.625,-3.627,0.000,-3.492,-1.042
Fare,3.6919,1.404,2.630,0.009,0.941,6.443
Pclass_3,-1.4225,0.241,-5.896,0.000,-1.895,-0.950
SibSp_3,-1.8872,0.792,-2.383,0.017,-3.439,-0.335
SibSp_4,-1.4832,0.763,-1.945,0.052,-2.978,0.012
SibSp_5,-23.0903,3.16e+04,-0.001,0.999,-6.2e+04,6.2e+04
SibSp_8,-23.6207,3e+04,-0.001,0.999,-5.87e+04,5.87e+04


In [44]:
X_train_sm=X_train_sm.drop(["Parch_5"],axis=1)
X_test_sm=X_test_sm.drop(["Parch_5"],axis=1)

In [45]:
model2=sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result2=model2.fit()
result2.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,712.0
Model:,GLM,Df Residuals:,702.0
Model Family:,Binomial,Df Model:,9.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-310.6
Date:,"Sat, 03 Aug 2024",Deviance:,621.2
Time:,10:28:53,Pearson chi2:,721.0
No. Iterations:,22,Pseudo R-squ. (CS):,0.3628
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0049,0.336,0.014,0.988,-0.654,0.664
Sex,2.6533,0.216,12.257,0.000,2.229,3.078
Age,-2.3448,0.624,-3.757,0.000,-3.568,-1.122
Fare,3.6127,1.388,2.602,0.009,0.892,6.334
Pclass_3,-1.4578,0.240,-6.068,0.000,-1.929,-0.987
SibSp_3,-1.8620,0.791,-2.353,0.019,-3.413,-0.311
SibSp_4,-1.4661,0.762,-1.925,0.054,-2.959,0.027
SibSp_5,-23.0681,3.17e+04,-0.001,0.999,-6.21e+04,6.21e+04
SibSp_8,-23.5752,3e+04,-0.001,0.999,-5.88e+04,5.88e+04


In [47]:
model3=sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result3=model3.fit()
result3.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,712.0
Model:,GLM,Df Residuals:,703.0
Model Family:,Binomial,Df Model:,8.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-314.29
Date:,"Sat, 03 Aug 2024",Deviance:,628.58
Time:,10:38:42,Pearson chi2:,727.0
No. Iterations:,21,Pseudo R-squ. (CS):,0.3562
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0620,0.333,0.186,0.852,-0.591,0.715
Sex,2.6167,0.215,12.189,0.000,2.196,3.038
Age,-2.3608,0.622,-3.796,0.000,-3.580,-1.142
Fare,3.1582,1.297,2.435,0.015,0.617,5.700
Pclass_3,-1.5330,0.237,-6.464,0.000,-1.998,-1.068
SibSp_3,-1.7639,0.783,-2.254,0.024,-3.298,-0.230
SibSp_4,-1.3926,0.759,-1.835,0.066,-2.880,0.095
SibSp_5,-21.9829,1.93e+04,-0.001,0.999,-3.78e+04,3.77e+04
Parch_4,-22.0601,2.13e+04,-0.001,0.999,-4.17e+04,4.16e+04


In [48]:
X_train_sm=X_train_sm.drop(["SibSp_5"],axis=1)
X_test_sm=X_test_sm.drop(["SibSp_5"],axis=1)

In [49]:
model4=sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result4=model4.fit()
result4.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,712.0
Model:,GLM,Df Residuals:,704.0
Model Family:,Binomial,Df Model:,7.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-316.36
Date:,"Sat, 03 Aug 2024",Deviance:,632.72
Time:,10:39:30,Pearson chi2:,733.0
No. Iterations:,20,Pseudo R-squ. (CS):,0.3524
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0294,0.331,0.089,0.929,-0.619,0.678
Sex,2.6173,0.214,12.230,0.000,2.198,3.037
Age,-2.2565,0.615,-3.670,0.000,-3.462,-1.052
Fare,2.9961,1.266,2.367,0.018,0.515,5.477
Pclass_3,-1.5652,0.236,-6.620,0.000,-2.029,-1.102
SibSp_3,-1.7123,0.780,-2.194,0.028,-3.242,-0.183
SibSp_4,-1.3294,0.758,-1.755,0.079,-2.814,0.155
Parch_4,-21.0369,1.29e+04,-0.002,0.999,-2.53e+04,2.52e+04


In [50]:
X_train_sm=X_train_sm.drop(["Parch_4"],axis=1)
X_test_sm=X_test_sm.drop(["Parch_4"],axis=1)

In [51]:
model5=sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result5=model5.fit()
result5.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,712.0
Model:,GLM,Df Residuals:,705.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-318.39
Date:,"Sat, 03 Aug 2024",Deviance:,636.79
Time:,10:39:43,Pearson chi2:,740.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.3487
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.1020,0.325,0.313,0.754,-0.536,0.740
Sex,2.6062,0.213,12.233,0.000,2.189,3.024
Age,-2.3714,0.611,-3.884,0.000,-3.568,-1.175
Fare,2.7248,1.184,2.301,0.021,0.404,5.046
Pclass_3,-1.6104,0.235,-6.867,0.000,-2.070,-1.151
SibSp_3,-1.6723,0.776,-2.156,0.031,-3.192,-0.152
SibSp_4,-1.3213,0.757,-1.745,0.081,-2.805,0.163


In [52]:
X_train_sm=X_train_sm.drop(["SibSp_4"],axis=1)
X_test_sm=X_test_sm.drop(["SibSp_4"],axis=1)

In [53]:
model6=sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result6=model6.fit()
result6.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,712.0
Model:,GLM,Df Residuals:,706.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-320.08
Date:,"Sat, 03 Aug 2024",Deviance:,640.16
Time:,10:40:03,Pearson chi2:,738.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.3456
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0089,0.319,0.028,0.978,-0.617,0.634
Sex,2.5883,0.212,12.223,0.000,2.173,3.003
Age,-2.1288,0.589,-3.615,0.000,-3.283,-0.975
Fare,2.5945,1.162,2.233,0.026,0.317,4.872
Pclass_3,-1.6470,0.233,-7.055,0.000,-2.105,-1.189
SibSp_3,-1.5844,0.773,-2.049,0.040,-3.100,-0.069


In [54]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data=pd.DataFrame()
vif_data["Feature"]=X_train_sm.columns

vif_data["VIF"]=[variance_inflation_factor(X_train_sm.values,i) 
                 for i in range(len(X_train_sm.columns))]
vif_data

Unnamed: 0,Feature,VIF
0,const,12.021107
1,Sex,1.05364
2,Age,1.11815
3,Fare,1.24644
4,Pclass_3,1.319626
5,SibSp_3,1.039456


## Evaluation

In [60]:
y_pred_train=result6.predict(X_train_sm)
y_pred_test=result6.predict(X_test_sm)

In [61]:
train=pd.DataFrame(y_pred_train,columns=["Predicted_prob"])
train["Actual"]=y_train
train["Predicted_Class"]=np.where(train["Predicted_prob"]>0.5,1,0)
train=train.dropna()

In [62]:
test=pd.DataFrame(y_pred_test,columns=["Predicted_prob"])
test["Actual"]=y_test
test["Predicted_Class"]=np.where(test["Predicted_prob"]>0.5,1,0)
test=test.dropna()

In [63]:
from sklearn import metrics
print("Train Accuracy =", metrics.accuracy_score(train["Actual"], train["Predicted_Class"]))
print("Test Accuracy =", metrics.accuracy_score(test["Actual"], test["Predicted_Class"]))

Train Accuracy = 0.5418894830659536
Test Accuracy = 0.43902439024390244


In [None]:
# Recall and Precision -- HW