### **Akshat Mandloi**
#### 22/07/2020

* **1 Introduction**
* **2 Load raw data**
    * 2.1 Load Raw Data
* **3 Pre Processing Data**
* **4 Feature Scaling**
    * 4.1 Scaling Inputs
* **5 Modeling**
    * 5.1 Simple Logistic Model
    * 5.2 Predicting and Submitting Results
* **6 Evaluating Coefficients*

    

## 1. Introduction

This is my first attempt at any competition on Kaggle. Through this exercise, I would like to explore feature analysis and develop a mindset towards machine learning and feature engineering. 

This will follow approach which follows a structured approach starting with data preprocessing followed by feature analysis and model building and prediction.  

In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

## 2. Importing Raw Data

In [26]:
raw_train_data = pd.read_csv('train.csv')
raw_test_data = pd.read_csv('test.csv')
raw_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 3 Pre Processing Data

In [27]:
# assessing category of columns
raw_train_data.dtypes
numerical_categories = raw_train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
object_categories = raw_train_data.select_dtypes(include=['object']).columns.tolist()
print("numerical categories:", numerical_categories)
print("object categories:", object_categories)

numerical categories: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
object categories: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [28]:
raw_train_data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Toufik, Mr. Nakli",male,,,,CA. 2343,,B96 B98,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [80]:
# combine test and train data after dropping survived from train data
drop_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket']
dropped_data = raw_train_data.drop(drop_columns, axis=1)
dropped_test_data = raw_test_data.drop(drop_columns, axis=1)
targets = raw_train_data['Survived']
dropped_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [81]:
for category in numerical_categories:
    if category in drop_columns or category in ['Survived']:
        continue
    print(dropped_data[category].value_counts())

3    491
1    216
2    184
Name: Pclass, dtype: int64
24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: Age, Length: 88, dtype: int64
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: Fare, Length: 248, dtype: int64


In [82]:
# classifying
def preprocessed(dropped_data):
    dropped_data['Parch'] = np.where(dropped_data['Parch']>1,1,0)
    reason_columns = pd.get_dummies(dropped_data['Pclass'], drop_first=True)
    dropped_data = dropped_data.drop(['Pclass'], axis=1)
    dropped_data['Sex'] = dropped_data['Sex'].map({'male':0, 'female':1})
    dropped_data['Embarked'] = dropped_data['Embarked'].map({'S':0, 'C':1, 'Q':2})
    dropped_data.dropna(subset=['Embarked'], inplace=True)
    dropped_data['Age'].fillna(dropped_data['Age'].mean(), inplace=True)
    preprocessed_data = pd.concat([dropped_data, reason_columns], axis=1)
    return preprocessed_data

preprocessed_data = preprocessed(dropped_data)
preprocessed_test_data = preprocessed(dropped_test_data)
preprocessed_data.head()


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked,2,3
0,0.0,0.0,22.0,1.0,0.0,7.25,0.0,0,1
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0,0
2,1.0,1.0,26.0,0.0,0.0,7.925,0.0,0,1
3,1.0,1.0,35.0,1.0,0.0,53.1,0.0,0,0
4,0.0,0.0,35.0,0.0,0.0,8.05,0.0,0,1


## 4. Feature Scaling

In [89]:
def scale_unscaled(unscaled_data):
    index = unscaled_data.columns.get_loc('Fare')
    unscaled_inputs = unscaled_data.iloc[:,index]
    mean_fare = unscaled_inputs.mean()
    std_fare = unscaled_inputs.std()
    fare_scaled = unscaled_inputs - mean_fare
    fare_scaled = fare_scaled/std_fare
    unscaled_data = unscaled_data.drop(['Fare'], axis=1)
    data_scaled = pd.concat([unscaled_data, fare_scaled], axis=1)
    return data_scaled

preprocessed_test_data = preprocessed_test_data.fillna(preprocessed_test_data['Fare'].median())
preprocessed_data.dropna(inplace=True)
data_scaled = scale_unscaled(preprocessed_data)
data_test_scaled = scale_unscaled(preprocessed_test_data)
data_test_scaled.isnull().sum()

Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
2           0
3           0
Fare        0
dtype: int64

## 5. Modeling

### 5.1 Building Model and Prediction

In [90]:
targets = data_scaled['Survived']
data_scaled = data_scaled.drop(['Survived'], axis=1)
x_train, x_val, y_train, y_val = train_test_split(data_scaled, targets, train_size=0.9, random_state=20)
reg = LogisticRegression()
reg.fit(x_train, y_train)
reg.score(x_train, y_train)

0.7975

### 5.2 Prediction and Submission

In [91]:
Survived = reg.predict(data_test_scaled)
PassengerId = raw_test_data['PassengerId']
submission = pd.DataFrame(PassengerId, columns=['PassengerId'])
submission['Survived'] = Survived
submission

In [93]:
submission.to_csv('my_new_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


## 6. Evaluating Coefficients

In [95]:
print(reg.intercept_)
print(reg.coef_)
feature_name = data_pre.columns.values
summary_table = pd.DataFrame(columns=['feature_name'], data=feature_name)
summary_table['coefficients'] = reg.coef_[0]
summary_table.head()
summary_table.index = summary_table.index+1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

[1.01670742]
[[ 2.48546686 -0.0394135  -0.27201085 -0.34603345  0.24592803 -0.77726483
  -2.01559531  0.16191449]]


Unnamed: 0,feature_name,coefficients
0,Intercept,1.016707
1,Sex,2.485467
2,Age,-0.039414
3,SibSp,-0.272011
4,Parch,-0.346033
5,Embarked,0.245928
6,2,-0.777265
7,3,-2.015595
8,Fare,0.161914


In [96]:
summary_table['Odds_ratio'] = np.exp(summary_table.coefficients)
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,feature_name,coefficients,Odds_ratio
1,Sex,2.485467,12.006724
0,Intercept,1.016707,2.764079
5,Embarked,0.245928,1.278808
8,Fare,0.161914,1.17576
2,Age,-0.039414,0.961353
3,SibSp,-0.272011,0.761846
4,Parch,-0.346033,0.707489
6,2,-0.777265,0.459662
7,3,-2.015595,0.133241
