# Applying Logistic Regression on the Titanic Data Set — Classification!

## Assignment

Run this cell to load the Titanic data:

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
# Fixed seed: without it the split is different on every kernel restart, so the
# counts and accuracies shown further down could never be reproduced.
train, test = train_test_split(
    sns.load_dataset('titanic').drop(columns=['alive']),
    random_state=42,
)
target = 'survived'

Some EDA:


In [0]:
# Peek at the first five rows of the training split.
train.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
70,0,2,male,32.0,0,0,10.5,S,Second,man,True,,Southampton,True
81,1,3,male,29.0,0,0,9.5,S,Third,man,True,,Southampton,True
389,1,2,female,17.0,0,0,12.0,C,Second,woman,False,,Cherbourg,True
810,0,3,male,26.0,0,0,7.8875,S,Third,man,True,,Southampton,True
38,0,3,female,18.0,2,0,18.0,S,Third,woman,False,,Southampton,False


## Replace the missing values in the age column with the average age

In [0]:
# Average age over the training split; displayed as the cell output.
train_mean = train['age'].mean()
train_mean

29.448070500927646

In [0]:
# Average age over the test split; displayed as the cell output.
test_mean = test['age'].mean()
test_mean

30.472342857142856

In [0]:
# Assign the filled column back rather than calling fillna(inplace=True) on
# `train.age`: the in-place form acts on an intermediate Series, which pandas
# warns about and which leaves `train` unchanged under Copy-on-Write.
train['age'] = train['age'].fillna(train_mean)

# Sanity check: number of ages still missing (expected 0).
train['age'].isnull().sum()

0

In [0]:
# Impute the test split with the TRAINING mean: statistics used for
# preprocessing must come from the training data only, otherwise information
# from the evaluation set leaks into the pipeline.
# Assigned back (no chained inplace=True) so the frame is really updated.
test['age'] = test['age'].fillna(train_mean)

# Sanity check: number of ages still missing (expected 0).
test['age'].isnull().sum()

0

## How many survived?

In [0]:
 # Class balance of the target in the training split.
 train['survived'].value_counts()

0    402
1    266
Name: survived, dtype: int64

### Preprocessing data

In [0]:
# Encode sex numerically (female -> 0, male -> 1). The result is assigned back
# instead of using a chained replace(inplace=True), which mutates a temporary
# Series and does not reach `train` under pandas Copy-on-Write.
# replace() leaves already-encoded values alone, so re-running the cell is safe.
train['sex'] = train['sex'].replace({'female': 0, 'male': 1})

In [0]:
# Same encoding as the training split (female -> 0, male -> 1), assigned back
# rather than applied through a chained replace(inplace=True) so that `test`
# is actually updated under pandas Copy-on-Write.
test['sex'] = test['sex'].replace({'female': 0, 'male': 1})


In [0]:
# Store the 0/1 sex codes of the training split with an integer dtype.
train['sex'] = train.sex.astype(int)

In [0]:
# Store the 0/1 sex codes of the test split with an integer dtype.
test['sex'] = test.sex.astype(int)

In [0]:
# Cast training ages to whole numbers (the fractional part is dropped).
train['age'] = train.age.astype(int)

In [0]:
# Cast test ages to whole numbers (the fractional part is dropped).
test['age'] = test.age.astype(int)

In [0]:
# Inspect the last five rows of the test split after preprocessing.
test.tail(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
555,0,1,1,62,0,0,26.55,S,First,man,True,,Southampton,True
660,1,1,1,50,2,0,133.65,S,First,man,True,,Southampton,False
132,0,3,0,47,1,0,14.5,S,Third,woman,False,,Southampton,False
61,1,1,0,38,0,0,80.0,,First,woman,False,B,,True
179,0,3,1,36,0,0,0.0,S,Third,man,True,,Southampton,True


Since we will be using *age* and *sex* as features, let's also do some EDA on them.

## Female passengers

In [0]:
# Rows whose sex code is 0, i.e. 'female' under the encoding applied above.
train.loc[train['sex'] == 0]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
389,1,2,0,17,0,0,12.0000,C,Second,woman,False,,Cherbourg,True
38,0,3,0,18,2,0,18.0000,S,Third,woman,False,,Southampton,False
700,1,1,0,18,1,0,227.5250,C,First,woman,False,C,Cherbourg,False
427,1,2,0,19,0,0,26.0000,S,Second,woman,False,,Southampton,True
11,1,1,0,58,0,0,26.5500,S,First,woman,False,C,Southampton,True
423,0,3,0,28,1,1,14.4000,S,Third,woman,False,,Southampton,False
257,1,1,0,30,0,0,86.5000,S,First,woman,False,B,Southampton,True
106,1,3,0,21,0,0,7.6500,S,Third,woman,False,,Southampton,True
416,1,2,0,34,1,1,32.5000,S,Second,woman,False,,Southampton,False
541,0,3,0,9,4,2,31.2750,S,Third,child,False,,Southampton,False


## Female passengers that survived and otherwise

In [0]:
# Survival outcome counts among passengers with sex code 0 (female).
train.loc[train['sex'] == 0, 'survived'].value_counts()

1    184
0     60
Name: survived, dtype: int64

## Male passengers that survived and otherwise

In [0]:
# Survival outcome counts among passengers with sex code 1 (male).
train.loc[train['sex'] == 1, 'survived'].value_counts()

0    342
1     82
Name: survived, dtype: int64

In [0]:
# Confirm the dtype of the target column.
train['survived'].dtype

dtype('int64')

## Are there missing values under the sex column in the train and test data set?

In [0]:
# (train, test) counts of missing values in the sex column.
train['sex'].isna().sum(), test['sex'].isna().sum()

(0, 0)

## Are there missing values under the age column in the train and test data set?

In [0]:
# (train, test) counts of missing values in the age column.
train['age'].isna().sum(), test['age'].isna().sum()

(0, 0)

## **Preparing the *male* sex indicator for our model**

In [0]:
# Add a boolean "is male" flag to both splits. It is stored under the integer
# column label 1, which is the label the feature lists select later on.
for frame in (train, test):
    frame[1] = frame['sex'].eq(1)

In [0]:
# Compare the numeric sex code with the derived boolean flag (column label 1).
train.loc[:, ['sex', 1]].head()

Unnamed: 0,sex,1
70,1,True
81,1,True
389,0,False
810,1,True
38,0,False


##  Applying Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

features = [1, 'age']
target = 'survived'

# `features` mixes an int column label (1) with a str label ('age').
# scikit-learn >= 1.2 raises TypeError when a DataFrame's column labels are of
# mixed str / non-str types, so every label is cast to str before the frame is
# handed to the estimator (1 -> '1', 'age' stays 'age').
model = LogisticRegression()
model.fit(train[features].rename(columns=str), train[target])

# Score both splits with the same code path instead of two copy-pasted blocks.
# After the loop, y_true / y_pred hold the test-split values, as before.
for split_name, frame in (('Train', train), ('Test', test)):
    y_true = frame[target]
    y_pred = model.predict(frame[features].rename(columns=str))
    print(f'{split_name} Accuracy: {accuracy_score(y_true, y_pred)}')


Train Accuracy: 0.7874251497005988
Test Accuracy: 0.7847533632286996




##  Applying Random Forest

In [0]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score

features = [1, 'age']
target = 'survived'

# The forest is seeded so the fitted trees, and therefore the printed
# accuracies, are the same on every run.
model = RandomForestClassifier(random_state=42)

# `features` mixes an int column label (1) with a str label ('age').
# scikit-learn >= 1.2 raises TypeError on mixed str / non-str column labels,
# so the labels are cast to str before fitting and predicting.
model.fit(train[features].rename(columns=str), train[target])

# The classifier already predicts class labels, so they are compared to the
# truth directly; the `> threshold` step left over from the regressor
# experiment has been removed.
# After the loop, y_true / y_pred hold the test-split values, as before.
for split_name, frame in (('Train', train), ('Test', test)):
    y_true = frame[target]
    y_pred = model.predict(frame[features].rename(columns=str))
    print(f'{split_name} Accuracy:{accuracy_score(y_true, y_pred)}')


Train Accuracy:0.8278443113772455
Test Accuracy:0.7309417040358744


