# Assign 1

In [98]:
# import libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [99]:
# load csv and extract the y and X part
def loadcsv(url):
  df = pd.read_csv(url)
  #check for any nan values
  # print(df.head())
  # print(df.describe(include = "all"))

  # set 1 for female and 0 for male
  df = df.replace(to_replace="male", value=0)
  df = df.replace(to_replace="female", value=1)

  # set y to survived column value
  y = np.array(df['Survived'])

  # Replace 'nan' with mean of all age as nan values are only in AGE column
  mean = df['Age'].mean()
  df.fillna(mean, inplace = True)

  # csv to 1D array
  arr = np.array(df)

  # Convert 1D data to matrix form for different columns
  pclass = np.reshape(arr[:,2], (-1,1))
  sex = np.reshape(arr[:,3], (-1,1))
  age = np.reshape(arr[:,4], (-1,1))
  sibsp = np.reshape(arr[:,5], (-1,1))
  parch = np.reshape(arr[:,6], (-1,1))
  fare = np.reshape(arr[:,7], (-1,1))

  X = [pclass, sex, age, sibsp, parch, fare]
  X = np.array(X)

  # create 2D matrix of independent variables
  X = X[:,:,0]
  # Add 0.0001 so that further there is no problem regarding divisible or log of 0 
  X = X.T+0.0001

  return X, y

In [100]:
# To find value of sigmoid function
def sigmoid_func(x):
  sig=1/(1+np.exp(-x))
  return sig


# Modal which returns W based on the training data
def modal(epochs, train_X, train_y, W):
  loss = 0
  error = []
  for i in range(epochs):
    for x, y in zip(train_X, train_y):
      # sigmoid function is 1/1+e^(-WiXi)
      out = sigmoid_func(np.dot(x, W.T))
      dw =  np.dot((out - y).T, x)
      W = W - lr * dw
  return W


In [101]:
# To find Accuracy, Loss, f1 Score, True/False Positive/Negative 
def acc(t_X, t_y, W):
  loss = 0
  y_hat = []
  # print(t_X.shape, t_y.shape)
  for x in t_X:

    y_hat.append(sigmoid_func(np.dot(x, W.T)))
  
  y_hat = np.array(y_hat)

  for y_act,y_pred in zip(t_y, y_hat):
    loss = loss - ((y_act*math.log(y_pred)) - ((1-y_act)*(math.log(1-y_pred))))
    # np.log was giving errors

  y_h = []
  for i in y_hat:
    if(i>=0.5):
      y_h.append(1.0)
    else:
      y_h.append(0.0)

  y_h = np.array(y_h)

  y_h[y_h>=0.5] = 1
  y_h[y_h<0.5] = 0

  # True positive,
  falsen = 0;truep = 0;falsep = 0;truen = 0;accuracy = 0
  for i,j in zip(y_h, t_y):
    if i == j:
      accuracy+=1;
    if i == 1 and j == 1:
      truep+=1
    elif i == 1 and j == 0:
      falsep+=1
    elif i==0 and j == 1:
      falsen+=1
    else:
      truen+=1

  recall = truep/(truep+falsen)
  precision = truep/(truep + falsep)
  f1_score = 2*precision*recall/(precision+recall)

  print('Accuracy = ',accuracy/y_h.shape[0])
  print('Loss = ',loss)
  print('F1_Score = ',f1_score)

In [102]:
trainurl = 'https://raw.githubusercontent.com/vanshbansal1505/ICG-Summer-Program-2021-DS/main/Assignment-1/data/train.csv'
testurl = 'https://raw.githubusercontent.com/vanshbansal1505/ICG-Summer-Program-2021-DS/main/Assignment-1/data/test.csv'

train_X, train_y = loadcsv(trainurl)
test_X, test_y = loadcsv(testurl)

# initial theta values with some random values
W = np.random.rand(6)

# Learning Rate
lr = 0.001
epochs = 1000

W = modal(epochs, train_X, train_y, W)

# print(W)
print("\nTraining Data")
acc(train_X, train_y, W)
print("\nTest Data")
acc(test_X, test_y, W)



Training Data
Accuracy =  0.7951612903225806
Loss =  118.18834434047605
F1_Score =  0.7292110874200427

Test Data
Accuracy =  0.8007380073800738
Loss =  69.61505132408126
F1_Score =  0.6931818181818182
