In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fetching, Scaling & Splitting Data

In [2]:
def get_data():
    
    df=pd.read_csv("titanicdata.csv")
    
    global x_df,y_df
    x_df = df.drop('Survived',axis=1)
    y_df = df['Survived']
    
    if df.isna().any().any()==True:
        print("There are missing values in the dataset!") #to check beforehand
    
    return df.head()

In [3]:
get_data()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,2,22.0
1,1,1,1,38.0
2,1,3,1,26.0
3,1,1,1,35.0
4,0,3,2,35.0


In [4]:
def scale_x(x):
    #does min-max scaling
    
    global scaled_x
    scaled_x = pd.DataFrame()
    
    for i in range(x_df.shape[1]): #x.shape[1] is the number of features (3 for the titanic dataset)
        scaled_feature=[]
        feature=x_df.iloc[:,i]
        min_element=min(feature)
        max_element=max(feature)
        for item in x_df.iloc[:,i]:
            scaled_item=(item-min_element)/(max_element-min_element)
            scaled_feature.append(scaled_item)
        scaled_x[x_df.columns[i]]=scaled_feature

In [5]:
scale_x(x_df)

In [6]:
def split_data(x,y):
    
    global x_train, x_valid, x_test, y_train, y_valid, y_test
    
    x_remain, x_test, y_remain, y_test = train_test_split(x_df,y_df, test_size=0.2, random_state=42) 
    #20% of the original dataset is seperated as test data
    
    x_train, x_valid, y_train, y_valid = train_test_split(x_remain,y_remain,test_size=0.25,random_state=42)
    #80% of the original data remains. 25% of the remaining data, which equals to 20% of the 
    #original data  (because (0.8)*(0.25)=0.2) is seperated as validation data.
    #Remaining data, which is 60% of the original data, is set as training data.
    
    x_train=x_train.reset_index(drop=True) # because indexes got unordered in splitting
    x_valid=x_valid.reset_index(drop=True)
    x_test=x_test.reset_index(drop=True)
    y_train=y_train.reset_index(drop=True)
    y_valid=y_valid.reset_index(drop=True)
    y_test=y_test.reset_index(drop=True)
    
    #check for splitting 
    if not (len(x)*0.6 -1) <= x_train.shape[0] <= (len(x)*0.6 +1): #+1 -1 in case there is odd number of data
        print("Error in splitting!")
    if not (len(x)*0.2 -1) <= x_test.shape[0] <= (len(x)*0.2 +1):
        print("Error in splitting!")
    if not (len(x)*0.2 -1) <= x_valid.shape[0] <= (len(x)*0.2 +1): 
        print("Error in splitting!")

In [7]:
split_data(scaled_x,y_df)

# Logistic Regression Algorithm

In [8]:
def sigmoid(z):
  #defining a sigmoid function helps simplify the rest of the calculations
    return np.exp(z) / (1 + np.exp(z)) #sigmoid formula

In [9]:
def gradients(x, y, y_pred):
  #calculated the gradients
  
    num_of_instances = x.shape[0]

    #by the gradient descent formula
    d_weights = (1/num_of_instances ) * np.dot(x.T, (y_pred-y)) #np.dot gives product of two vectors
    d_bias = (1/num_of_instances ) * np.sum(y_pred - y)

    return d_weights, d_bias #will be used in training and predictions

In [10]:
def loss_function(y, y_pred):
  #calculated the loss for each predicted instance
    loss = - np.mean(y * (np.log(y_pred)) - (1-y) * np.log(1-y_pred)) #by formula
    return loss #will be used in training

In [11]:
def train(x, y, learning_rate=0.01, max_iter=1000): #just arbitrary inital parameter values
  #epsilon here is sort of a stopping condition
  #In detail, if the change in every descent is smaller than the epsilon, the model is not
  #improving much, terminate 
    global weights,bias,y_pred,loss_list

    num_of_instances, num_of_features = x.shape
    weights = np.zeros((num_of_features, 1)) #for the starting value
    bias = 0 #for the starting value
    
    global loss_list, change_per_iteration #made these global to plot later on
    change_per_iteration = []
    loss_list = []

    y = np.array(y_train).reshape(num_of_instances, 1)

    for i in range(max_iter):            

        y_pred = sigmoid(np.dot(x, weights) + bias) #by formula
        
        d_weights, d_bias = gradients(x, y, y_pred) #fetches the parameters again each iteration
            
        weights -= learning_rate * d_weights #updates parameters, again just re-wrote the formula in Python format
      
        bias -= learning_rate * d_bias #updates parameters

        change=learning_rate * d_weights #created a variable for change to plot later on
        change_per_iteration.append(sum(change)) #storing changes to plot later on
                                                #got the sum, because I will create a 2D graph
                                                #seeing convergence in the graph is enough, 
                                                #the sum will be able to show that

        loss = loss_function(y, sigmoid(np.dot(x, weights) + bias)) #gets the loss for the corresponding iter
        
        #for early termination
        termination = 0 
        if len(loss_list) > 0 and loss_list[-1] == loss:
            termination += 1
            if termination== 100:
                return weights, bias, loss_list

        
        loss_list.append(loss)#stores losses to plot later on

  
    return weights, bias, loss_list 

In [12]:
def predict(x, weights, bias):

    predictions = sigmoid(np.dot(x, weights) + bias) #by formula
    
    global pred_class #global so that I can plot them later on, outside the function
    pred_class = [] #stores predicted labels to calculate accuracy and to plot later on
    pred_class = [1 if i > 0.5 else 0 for i in predictions]
    return np.array(pred_class)

# Validation

In [13]:
def valid_function():
  #not cross validation etc, but since the data is low-dimensional, this approach should work
    
    global val_accs, valid_df

    #lists will store the corresponding values for each learning rate-max iteration pair
    val_accs = [] 
    val_losses = [] 
    weights_list = [] 
    bias_list = []

    iterations = [2500,5000,7500, 10000, 12500,15000,17500,20000,22500,25000] #max_iter values to validate
    learning_rates = [0.0001, 0.001, 0.01] # learning_rate values to validate

    valid = []
    for iter in iterations:
        for lr in learning_rates: #nested for loop ensures both parameters are validated
                                  #simultaneously, not sequentially since they may affect each other
            weights, bias, loss = train(x_train, y_train, learning_rate=lr,max_iter=iter )
            y_pred = predict(x_valid, weights, bias)
            acc = accuracy_score(y_valid, y_pred)

            weights_list.append(weights)
            bias_list.append(bias)
            val_accs.append(acc)
            val_losses.append(loss)
            valid.append([acc, iter, lr])
    
    valid_df=pd.DataFrame(valid)
    valid_df.columns =['Validation Accuracy', 'Max Iterations', 'Learning Rate']
    return valid_df
    
           

In [14]:
valid_function()

Unnamed: 0,Validation Accuracy,Max Iterations,Learning Rate
0,0.629213,2500,0.0001
1,0.634831,2500,0.001
2,0.640449,2500,0.01
3,0.629213,5000,0.0001
4,0.662921,5000,0.001
5,0.679775,5000,0.01
6,0.629213,7500,0.0001
7,0.679775,7500,0.001
8,0.735955,7500,0.01
9,0.629213,10000,0.0001


# Testing

In [15]:
#to test on train + validation data
frames = [x_train,x_valid]
x_train=pd.concat(frames).reset_index(drop=True)
y_train=y_train.append(y_valid)

In [16]:
weights, bias, loss= train(x_train, y_train, learning_rate=0.01, max_iter=12500)

In [17]:
test_predictions=predict(x_test,weights,bias)
accuracy_score(y_test, test_predictions)

0.7821229050279329

In [18]:
fp=0
fn=0
tp=0
tn=0

for i in range(len(pred_class)):
    if pred_class[i]==1 and y_test[i]==0:
        fp+=1
    if pred_class[i]==0 and y_test[i]==1:
        fn+=1
    if pred_class[i]==1 and y_test[i]==1:
        tp+=1
    if pred_class[i]==0 and y_test[i]==0:
        tn+=1

recall=tp/(tp+fn)
precision=tp/(fp+tp)
fpr=fp/(fp+tn)
f_meas=(2*recall*precision)/(recall+precision)

print("Recall:", recall)
print("Precision:", precision)
print("False Positive Rate:", fpr)
print("F Measure:", f_meas)

Recall: 0.7027027027027027
Precision: 0.7536231884057971
False Positive Rate: 0.1619047619047619
F Measure: 0.7272727272727273


In [19]:
#just to compare
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=12500)
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
logreg.score(x_test, y_test)

0.8100558659217877