### Model building:

#### Import necessary Libariry and packages:

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score

In [11]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


#### Data cleaning
Imputation strategy applied:
- Imputed missing Cabin data with a separate category, 'Missing'
- Imputed missing Embarked data with mode value
- Imputed missing Age data according to respective Gender's mean value 

In [12]:
#Impute missing Cabin data
train['Cabin'] = np.where(train['Cabin'].isnull()==True, 'Missing', train['Cabin'])
print(train['Cabin'].isnull().sum())

#Impute missing Embarked data 
train['Embarked'] = np.where(train['Embarked'].isnull()==True, train['Embarked'].mode(), train['Embarked'])
print(train['Embarked'].isnull().sum())


#Impute missing Age data
female_avg_age = train[train['Sex']=='female']['Age'].mean()
male_avg_age = train[train['Sex']=='male']['Age'].mean()

train['Age'] = np.where(((train['Age'].isnull()) & (train['Sex']=='female')), female_avg_age, train['Age'])
train['Age'] = np.where(((train['Age'].isnull()) & (train['Sex']=='male')), male_avg_age, train['Age'])
print(train['Age'].isnull().sum())

0
0
0


In [13]:
#test dataset missing values imputation:
test['Embarked'] = np.where(test['Embarked'].isnull()==True, test['Embarked'].mode(), test['Embarked'])
print(test['Embarked'].isnull().sum()) 

test['Fare'] = np.where(test['Fare'].isnull()==True, 0.0, test['Fare'])
print(test['Fare'].isnull().sum()) 

female_avg_age = test[test['Sex']=='female']['Age'].mean()
male_avg_age = test[test['Sex']=='male']['Age'].mean()

test['Age'] = np.where(((test['Age'].isnull()) & (test['Sex']=='female')), female_avg_age, test['Age'])
test['Age'] = np.where(((test['Age'].isnull()) & (test['Sex']=='male')), male_avg_age, test['Age'])
print(test['Age'].isnull().sum())

0
0
0


__Data Pre-processing:__
- For categorical features applied Label encoding technique to convert those into numerical features

In [14]:
# Labelencoding
le = LabelEncoder()
var_mod = train.select_dtypes(include='object').columns
for i in var_mod:
    train[i] = le.fit_transform(train[i])
    
for i in var_mod:
    test[i] = le.fit_transform(test[i])

__Train-validation set split:__

In [15]:
# Seperate Features and Target
y = train['Survived']
X = train.drop(columns = ['Survived','PassengerId','Name','Cabin'], axis=1)
test = test.drop(columns=['PassengerId','Name','Cabin'],axis=1)

In [16]:
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,stratify=y)

In [17]:
algos = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier()]
names = ['Logistic Regression', 'Decision Tree', 'Random Forest']

rocauc_list=[]
rocauc_train=[]

In [18]:
for name in algos:
    model = name
    model.fit(X_train,y_train)
    y_pred = model.predict(X_valid)
    y_pred_train=model.predict(X_train)
    rocauc= metrics.roc_auc_score(y_valid,y_pred)
    rocaucfortrain=metrics.roc_auc_score(y_train,y_pred_train)
    rocauc_list.append(rocauc)
    rocauc_train.append(rocaucfortrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
evaluation = pd.DataFrame({'Model': names,
                           'roc_auc_validation': rocauc_list,'roc_auc_train': rocauc_train})

In [20]:
evaluation

Unnamed: 0,Model,roc_auc_validation,roc_auc_train
0,Logistic Regression,0.841173,0.773015
1,Decision Tree,0.776812,0.998861
2,Random Forest,0.80863,0.998861


**Proceeding with Logistic Regression model**

In [21]:
# Model Building
LR_model = LogisticRegression()
LR_model.fit(X_train,y_train)
y_pred = LR_model.predict(X_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### Make Predictions:

In [36]:
#Check the survival probability for the first 3 test data:
Probabilities = LR_model.predict_proba(X_valid)[:5]
print('Probabilities are:')
print(Probabilities[:3])
print()

print('Predicted labels are:', y_pred[:3])

Probabilities are:
[[0.6740294  0.3259706 ]
 [0.84546037 0.15453963]
 [0.91237302 0.08762698]]

Predicted labels are: [0 0 0]


In [22]:
train_preds = LR_model.predict(X_train)
test_preds = LR_model.predict(X_valid)
train_auc = roc_auc_score(y_train, train_preds)
test_auc = roc_auc_score(y_valid, test_preds)

#### Model Evaluation:

In [40]:
from sklearn.metrics import classification_report
print(classification_report(y_valid, test_preds))

              precision    recall  f1-score   support

           0       0.90      0.83      0.86       110
           1       0.76      0.86      0.80        69

    accuracy                           0.84       179
   macro avg       0.83      0.84      0.83       179
weighted avg       0.85      0.84      0.84       179



#### Save the model output in the test dataset

In [25]:
submission = pd.read_csv('test.csv')
final_predictions = LR_model.predict(test)
submission['Survived'] = final_predictions
submission['Survived'] = submission['Survived'].apply(lambda x: 0 if x<0 else x)
submission.to_csv('Logistic regression Results.csv', index=False)

In [26]:
submission.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


### Final conclusion:
The insights we get from the model report:
- Positive class (class 1) recall 86%: Out of all the actual positives 86% of the data is correctly predicted as positive
- Positive class (class 1) precision 76%: Out of all the predicted positives 76% of the data is actually positive
- Similary for Negative class (class 0), both recall and precision looks good and hence, F1-score as well
- So, with simple imputation technique, encoding techniques the base model, Logistic regression is performing well