# steps for modeling
* Importing the necessary libraries
* Data loading
* Data understanding
* Understand the problem statement
* Missing value treatment
* EDA
* Preprocessing before modelling
    * dummy variable creation
    * train test split
    * scaling

## Importing the necessary libraries

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data loading

In [39]:
inp0 = pd.read_csv('data.csv')
inp0.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Understanding the problem statement

- We need to build a logistic regression model to predict whether a passenger will survive in the titanic incident

In [40]:
inp0['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

## Missing values

In [41]:
inp0.isna().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [42]:
inp0 = inp0.drop(columns=['Cabin'])

In [43]:
med = inp0['Age'].median()
inp0['Age'] = inp0['Age'].fillna(med)

In [44]:
mod = inp0['Embarked'].mode()[0]
inp0['Embarked'] = inp0['Embarked'].fillna(mod)

In [45]:
inp0.isnull().mean()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Embarked       0.0
dtype: float64

In [46]:
inp0.shape

(891, 11)

- we will skip EDA because of the time limitation and the agenda of the session

## Preprocessing before modeling

In [47]:
inp1 = inp0.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [48]:
inp1['Sex'] = inp1['Sex'].map({'male': 0, 'female': 1})

In [49]:
inp1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [50]:
print(inp1.shape)
inp1 = inp1.drop_duplicates()
print(inp1.shape)

(891, 8)
(775, 8)


### dummy variable creation

In [51]:
dum_cols = ['Pclass', 'SibSp', 'Parch', 'Embarked']

In [52]:
inp1['Pclass'] = inp1['Pclass'].astype(object)
inp1['SibSp'] = inp1['SibSp'].astype(object)
inp1['Parch'] = inp1['Parch'].astype(object)

In [53]:
dum = pd.get_dummies(inp1[dum_cols], drop_first=True, dtype=int)
inp1 = pd.concat([inp1, dum], axis=1)

In [55]:
inp1 = inp1.drop(columns=dum_cols)

In [56]:
inp1.head()

Unnamed: 0,Survived,Sex,Age,Fare,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,0,0,22.0,7.25,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,38.0,71.2833,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,26.0,7.925,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,1,35.0,53.1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,35.0,8.05,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [57]:
inp1.shape

(775, 20)

In [60]:
inp1.columns

Index(['Survived', 'Sex', 'Age', 'Fare', 'Pclass_2', 'Pclass_3', 'SibSp_1',
       'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_1',
       'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

### train test split

In [61]:
X = inp1.drop(columns=['Survived'])
y = inp1['Survived']

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [63]:
print(f'Train X shape: {X_train.shape}')
print(f'Test X shape: {X_test.shape}')
print(f'Train y shape: {y_train.shape}')
print(f'Test y shape: {y_test.shape}')

Train X shape: (620, 19)
Test X shape: (155, 19)
Train y shape: (620,)
Test y shape: (155,)


### scaling

In [64]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [65]:
X_train_scaled

array([[0.        , 0.6481528 , 0.15546645, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.10781603, 0.06126432, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.34656949, 0.01512699, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [1.        , 0.43453129, 0.10149724, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.34656949, 0.09193308, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.74868057, 0.1545881 , ..., 0.        , 0.        ,
        0.        ]])

In [66]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

In [67]:
X_train_scaled.head()

Unnamed: 0,Sex,Age,Fare,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,0.0,0.648153,0.155466,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.107816,0.061264,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.346569,0.015127,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.346569,0.154588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.145514,0.021942,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
selector=RFE(estimator, n_features_to_select=8)
selector.fit(X_train_scaled, y_train)
selector.support_

array([ True,  True, False,  True,  True, False, False,  True,  True,
        True, False, False, False, False, False,  True, False, False,
       False])

In [70]:
cols_to_keep = X_train_scaled.columns[selector.support_]

In [71]:
cols_to_keep

Index(['Sex', 'Age', 'Pclass_2', 'Pclass_3', 'SibSp_3', 'SibSp_4', 'SibSp_5',
       'Parch_5'],
      dtype='object')

In [72]:
X_train = X_train_scaled[cols_to_keep]
X_test = X_test_scaled[cols_to_keep]

In [73]:
print(f'Train X shape: {X_train.shape}')
print(f'Test X shape: {X_test.shape}')
print(f'Train y shape: {y_train.shape}')
print(f'Test y shape: {y_test.shape}')

Train X shape: (620, 8)
Test X shape: (155, 8)
Train y shape: (620,)
Test y shape: (155,)
