In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from numpy import random


In [23]:
import copy

In [3]:
df = pd.read_csv("titanicdata.csv")

In [4]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,2,22.0
1,1,1,1,38.0
2,1,3,1,26.0
3,1,1,1,35.0
4,0,3,2,35.0


In [7]:
def normalize_features (feature_list, df):
    
    for feature in feature_list:
        
        feature_min = df[feature].min()
        feature_max = df[feature].max()
        
        df[feature] = (df[feature] - feature_min) / (feature_max - feature_min)
        
    return df

In [8]:
df = normalize_features (['Pclass', 'Sex', 'Age'], df)

In [9]:
def sigmoid(z):
    
    return 1.0/(1 + np.exp(-z))

In [10]:
def loss (y, y_pred):
    
    loss = -np.mean(y*(np.log(y_pred)) - (1-y)*np.log(1-y_pred))
    return loss

In [11]:
def calculate_gradient(X, y, y_hat):
    
    m = X.shape[0]
    
    dw = (1/m)*np.dot(X.T, (y_hat - y))
    db = (1/m)*np.sum((y_hat - y)) 
    
    return dw, db

In [12]:
def gradient_ascent (data, labels, batch_size = 8, lr = 0.01, epochs = 1):
    
    
    m, n = data.shape
    w = random.rand(n, 1)
    b = 0
    y = labels.reshape(m,1)
    
    losses = []
    
    for epoch in range(epochs):
        for i in range((m-1)//batch_size + 1):
            
            start_i = i*batch_size
            end_i = start_i + batch_size
            xb = data[start_i:end_i]
            yb = y[start_i:end_i]
            
            y_hat = sigmoid(np.dot(xb, w) + b)
            dw, db = calculate_gradient(xb, yb, y_hat)
            
            w -= lr*dw
            b -= lr*db
        
        l = loss(y, sigmoid(np.dot(data, w) + b))
        losses.append(l)
        
    return w, b, losses


In [16]:
df = df.sample(frac = 1) #shuffle
df.reset_index(drop = True, inplace = True)

In [17]:
X = df[ ['Pclass', 'Sex', 'Age'] ].to_numpy().copy()
y = df['Survived'].to_numpy()

In [18]:
#train val test split
train_length = int(len(X) * 0.6)
test_length = int(len(X) * 0.2)

X_train = X[:train_length,:]
X_val = X[train_length: train_length + test_length, :]
X_test = X[train_length + test_length:,:]

y_train = y[:train_length]
y_val = y[train_length: train_length + test_length]
y_test = y[train_length + test_length:]


In [28]:
def predict(X,w,b):
    x = X
    preds = sigmoid(np.dot(X, w) + b)
    pred_class = []
    pred_class = [1 if i > 0.5 else 0 for i in preds]
    
    return np.array(pred_class)

In [29]:
def get_accuracy (y, y_pred):
    
    return (y == y_pred).sum() / len(y)

In [41]:
def tune_hyperparams (lr_list, epoch_list, X_train, y_train, X_val, y_val, batch_size = 8, epochs = 1000):
    
    metric_dict = {'lr':[], 'epoch':[],'acc':[],'loss':[]}
    weight_arr = np.zeros( (len(lr_list),3,1))
    bias = []
    losses = []
    max_acc = 0
    
    for i,lr in enumerate(lr_list):
        
        for epoch in epoch_list:
            w, b, l = gradient_ascent(X_train, y_train, batch_size=batch_size, lr=lr, epochs=epochs)
            pred_val = predict(X_val,w,b)
            acc = get_accuracy (y_val, pred_val)
            
            metric_dict['lr'].append(lr)
            metric_dict['epoch'].append(epoch)
            metric_dict['acc'].append(acc)
            metric_dict['loss'].append(l)
            
            if acc > max_acc:
                best_weight = copy.deepcopy(w)
                best_bias = copy.deepcopy(b)
                max_acc = acc
                
        
    return metric_dict, losses, best_weight, best_bias

In [42]:
lr_list = [0.1, 0.01, 0.001, 0.001, 0.0001]
epoch_list = [1000, 2000, 3000, 4000, 5000]

In [43]:
metric_dict,losses, best_weight, best_bias = tune_hyperparams (lr_list, epoch_list, X_train, y_train, X_val, y_val, batch_size = 64, epochs = 2000)

In [44]:
metric_df = pd.DataFrame(metric_dict)

In [56]:
result = metric_df[['lr', 'epoch','acc']].copy()
result.sort_values(by = 'acc', ascending = False, inplace = True)
result.reset_index(drop = True, inplace = True)
result

Unnamed: 0,lr,epoch,acc
0,0.001,4000,0.786517
1,0.1,1000,0.780899
2,0.1,3000,0.780899
3,0.1,4000,0.780899
4,0.1,5000,0.780899
5,0.1,2000,0.780899
6,0.01,4000,0.769663
7,0.01,5000,0.769663
8,0.01,3000,0.769663
9,0.01,2000,0.769663


In [62]:
lr = result.loc[0,'lr']
epochs = result.loc[0,'epoch']

In [66]:
w, b, l = gradient_ascent(np.concatenate((X_train, X_val)),np.concatenate((y_train, y_val)), batch_size=32, lr=lr, epochs=epochs)

In [67]:
pred_test = predict(X_test,w,b)
get_accuracy (y_test, pred_test)

0.8100558659217877

In [68]:
#it can be tuned more, but I think this is enough considering low data size