## Implementation from scratch

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('train.csv')

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_train.drop(['Cabin'], axis = 1, inplace = True)

df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
df_train['Sex'] = [0 if i == 'male' else 1 for i in df_train['Sex']]

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,S


In [5]:
df_train['Age'].fillna(value = df_train['Age'].mean(), inplace = True)

df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,0.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [6]:
X_train = df_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,3,0,35.0,0,0,8.05


In [7]:
Y_train = df_train['Survived']

Y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [8]:
# normalizing the data

def normalise(x):
    m, n = x.shape

    for i in range(n):
        x = (x - x.mean(axis = 0))/x.std(axis = 0)
    
    return x

In [9]:
# sigmoid function

def sigmoid(z):
    return 1.0/(1 + np.exp(-z))

In [10]:
# loss function

def loss(y, y_pred):
    loss = (-y*np.log(y_pred) - (1 - y)*np.log(1 - y_pred)).mean()
    return loss

In [11]:
# gradient function to determine the parameters

def gradient(x, y, y_pred):
    m = x.shape[0] #total training examples
    first = y_pred - y.reshape(x.shape[0], -1)
    dw = (1/m)*np.dot(x.T, first)
    db = (1/m)*np.sum((y_pred - y))
    
    return dw, db

In [12]:
# trianing the model

def train(x, y, epochs, lr):
    m, n = x.shape
    
    # intialising weights and bias to zeros
    w = np.zeros((n,1))
    b = 0
    
    # reshaping y
    y = np.squeeze(y)
    
    x = normalise(x)
    losses = []
    
    # training
    for epoch in range(epochs):
        y_pred = sigmoid(np.dot(x, w) + b)
        
        dw , db = gradient(x, y, y_pred)
        
        w -= lr*dw
        b -= lr*db
        
        losses.append(loss(y, y_pred))
        
        return w, b, losses
        

In [13]:
# predicting the values

def predict(x):
    x = normalise(x)
    
    preds = sigmoid(np.dot(x, w) + b)
    
    y_pred = [1 if i > 0.5 else 0 for i in preds]
    
    
    return np.array(y_pred)
    
    

In [14]:
# trainining model using train dataset

w, b, l = train(X_train, np.array(Y_train), 10000, 0.001)



In [15]:
# Calculating the accuracy of the model

def accuracy(y, y_pred):
    accuracy = np.sum(y == y_pred)/len(y)
    
    return accuracy

In [16]:
accuracy(Y_train, predict(X_train))

0.6161616161616161

## Direct Implementation

In [17]:
from sklearn import linear_model

reg = linear_model.LogisticRegression()

In [18]:
reg.fit(X_train, Y_train)

LogisticRegression()

In [19]:
y_pred = reg.predict(X_train)

In [20]:
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(Y_train, y_pred))

Accuracy :  0.797979797979798


## predicting values for test dataset

In [21]:
df_test = pd.read_csv('test.csv')

df_test['Sex'] = [0 if i == 'male' else 1 for i in df_test['Sex']]
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


In [22]:
df_test['Age'].fillna(value = df_test['Age'].mean(), inplace = True)
df_test['Fare'].fillna(value = df_test['Fare'].mean(), inplace = True)

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    int64  
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 36.0+ KB


In [23]:
x_test = df_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

In [26]:
y_pred_test = reg.predict(x_test)

In [27]:
dict = {'PassengerId': df_test['PassengerId'], 'Survived': y_pred_test}

df = pd.DataFrame(dict)

df.to_csv('submission.csv', index = False)