# INFO 7390 Assignment - 2
## Zihan Wan

In [1]:
# Module imports
import pandas as pd
import numpy as np

In [2]:
# Define activation function - tanh rescaled from (-1, 1) to (0, 1)
def tanh(x):
    """Return tanh(x) rescaled into the open interval (0, 1).

    ``(tanh(x) + 1) / 2`` is mathematically the logistic sigmoid of
    ``2 * x``, so the result can be used as a probability in a log loss.

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Activation values, clipped to ``[1e-15, 1 - 1e-15]``.
    """
    eps = 1e-15
    # np.tanh saturates to +/-1 for large |x|; the explicit
    # (e^x - e^-x) / (e^x + e^-x) quotient overflows to inf/inf = nan there.
    t = (np.tanh(x) + 1) / 2
    # Clip on BOTH sides so that neither log(y_hat) nor log(1 - y_hat)
    # ever receives 0 or a negative argument when the loss is computed.
    return np.clip(t, eps, 1 - eps)

In [3]:
# Initialize parameters - weight/bias
# dims - Number of features
def initialize_params(dims):
    """Create the starting parameters for the model.

    Returns a tuple ``(w, b)``: ``w`` is a ``(dims, 1)`` column vector of
    zeros and ``b`` is the scalar bias ``0``.
    """
    bias = 0
    weights = np.zeros(shape=(dims, 1))
    return weights, bias

In [4]:
# Define logistic regression algorithm model
def logistic(x,y,w,b):
    """Run one forward pass and return predictions, loss and gradients.

    x holds the features, y the labels, w the weights and b the bias.
    Returns the tuple ``(y_hat, loss, dw, db)``.

    NOTE(review): the activation defined in this file is equivalent to
    sigmoid(2z); the exact log-loss gradient for it carries an extra
    factor of 2 that dw/db below do not include. This only rescales the
    effective learning rate - confirm whether that is intended.
    """
    n_samples = x.shape[0]

    # Forward pass: linear score pushed through the activation function
    scores = np.dot(x, w) + b
    y_hat = tanh(scores)

    # Log loss averaged over the samples
    per_sample = y*np.log(y_hat) + (1-y)*np.log(1-y_hat)
    loss = np.squeeze(-1/n_samples * np.sum(per_sample))

    # Gradients of the loss with respect to weight and bias
    residual = y_hat - y
    dw = 1/n_samples * np.dot(x.T, residual)
    db = 1/n_samples * np.sum(residual)
    return y_hat, loss, dw, db

In [5]:
# Define model training process
# Input x (features), y (label), number of iterations and learning rate
def sgb_opt(x,y,epochs,learning_rate,log_every=150):
    """Fit the weights and bias by gradient descent.

    Every iteration passes the whole of ``x`` and ``y`` to ``logistic``,
    i.e. this is full-batch gradient descent (no per-sample or mini-batch
    sampling takes place).

    Parameters
    ----------
    x : features, one row per sample (``x.shape[1]`` is the feature count).
    y : labels matching the rows of ``x``.
    epochs : number of update steps to run.
    learning_rate : step size applied to both gradients.
    log_every : record and print the loss every ``log_every`` iterations
        (default 150, the interval that used to be hard-coded).

    Returns
    -------
    (loss_list, params, grads)
        ``loss_list`` holds the recorded losses, ``params`` is
        ``{'w': w, 'b': b}`` and ``grads`` is ``{'dw': dw, 'db': db}`` from
        the last iteration (both ``None`` when ``epochs`` is 0).

    Raises
    ------
    ValueError
        If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError('log_every must be at least 1')

    # Initialize weight and bias
    w, b = initialize_params(x.shape[1])

    # Losses recorded at the logging interval
    loss_list = []

    # Defined up front so that epochs == 0 returns cleanly instead of
    # failing with UnboundLocalError when the gradients are packed below.
    dw, db = None, None

    # Full-batch gradient descent
    for i in range(epochs):
        _, loss, dw, db = logistic(x, y, w, b)
        w = w - learning_rate*dw
        b = b - learning_rate*db
        # Record and print the iteration number and loss at the interval
        if i % log_every == 0:
            loss_list.append(loss)
            print('epoch %d loss %f '%(i,loss))

    # Save model parameters and the most recent gradients
    params = {'w':w,'b':b}
    grads = {'dw':dw,'db':db}
    return loss_list,params,grads

In [6]:
# Define prediction function
# Input x (features) and model parameters
def predict(x,params):
    """Classify each row of ``x`` as 0 or 1 using trained parameters.

    ``params`` is the dict with keys ``'w'`` and ``'b'``. The activation
    output is thresholded at 0.5: values strictly greater than 0.5 become
    1, everything else (including exactly 0.5) becomes 0. The result is a
    float array with the same shape as the activation output.
    """
    # Calculate predicted probabilities
    probs = tanh(np.dot(x, params['w']) + params['b'])
    # Vectorized threshold instead of an element-by-element loop
    return np.where(probs > 0.5, 1.0, 0.0)

In [7]:
# Data input and info
# Read the passenger records from a CSV file (path is relative to the
# working directory) into a DataFrame
data = pd.read_csv('taitanic_data.csv')
# Print column names, non-null counts and dtypes to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Filter Features
# Remove the columns that are not used as model inputs
data = data.drop(columns=['Cabin','Name','Ticket','PassengerId'])

In [10]:
# Preview the data after the unused columns were dropped
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [11]:
# Handle missing values
# Age gaps are replaced by the column mean; any row that still contains a
# missing value afterwards is removed
data = data.fillna({'Age': data['Age'].mean()}).dropna()

In [12]:
# Confirm that no column has missing values left
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


In [13]:
# Convert data types to numbers
# Embarked: integer code per distinct value, numbered in order of first
# appearance in the column
labels = data['Embarked'].unique().tolist()
data['Embarked'] = data['Embarked'].map(labels.index)
# Sex: 1 for 'male', 0 for everything else
data['Sex'] = data['Sex'].eq('male').astype('int')

In [14]:
# Preview the data after the categorical columns were encoded
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,0
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,0
3,1,1,0,35.0,1,0,53.1,0
4,0,3,1,35.0,0,0,8.05,0


In [15]:
# Select the feature and label columns
# x: every column except 'Survived'; y: 'Survived' kept as a single-column
# 2-D array
x = data.drop(columns='Survived').to_numpy()
y = data[['Survived']].to_numpy()

In [16]:
# Divide training and test sets
# The first 70% of the rows (in their existing order, no shuffling) are
# used for training, the remainder for testing
split = int(len(x) * 0.7)
Xtrain, Xtest = x[:split], x[split:]
Ytrain, Ytest = y[:split], y[split:]

In [17]:
# Call training model and prediction function
# 1500 iterations with a learning rate of 0.001
loss_list, params, grads = sgb_opt(
    Xtrain, Ytrain, epochs=1500, learning_rate=0.001
)
# Output predicted value (0/1 class per test row)
y_pred = predict(Xtest, params)

epoch 0 loss 0.693147 
epoch 150 loss 0.633030 
epoch 300 loss 0.625775 
epoch 450 loss 0.619630 
epoch 600 loss 0.614296 
epoch 750 loss 0.609567 
epoch 900 loss 0.605294 
epoch 1050 loss 0.601375 
epoch 1200 loss 0.597737 
epoch 1350 loss 0.594327 


In [18]:
# Accuracy
# Share of test rows whose predicted class equals the true class
accuracy = (Ytest == y_pred).sum() / Ytest.shape[0]
accuracy

0.7378277153558053

In [19]:
# Combine true and predicted y into one dataframe
# Both arrays are flattened to one value per test row
df_score = pd.DataFrame({'y_true': Ytest.ravel(), 'y_pred': y_pred.ravel()})

In [20]:
# Calculate confusion matrix
# Each cell is the number of rows matching one (true, predicted) pair
tp = int(((df_score['y_true']==1) & (df_score['y_pred']==1)).sum())
tn = int(((df_score['y_true']==0) & (df_score['y_pred']==0)).sum())
fp = int(((df_score['y_true']==0) & (df_score['y_pred']==1)).sum())
fn = int(((df_score['y_true']==1) & (df_score['y_pred']==0)).sum())

In [21]:
# Precision: share of predicted positives that are truly positive.
# Recall: share of actual positives that were predicted positive.
# A zero denominator (no predicted / no actual positives) yields 0.0
# instead of raising ZeroDivisionError.
precision = tp/(tp+fp) if (tp+fp) > 0 else 0.0
recall = tp/(tp+fn) if (tp+fn) > 0 else 0.0

In [22]:
# F1 score
# Harmonic mean of precision and recall; reported as 0.0 when both are 0
# so the division cannot raise ZeroDivisionError.
f1_score = (precision*recall*2)/(precision+recall) if (precision+recall) > 0 else 0.0
f1_score

0.4444444444444444