In [89]:
import numpy as np
import pandas as pd

# Load the pre-computed feature tables (file names indicate TF-IDF features).
train_df = pd.read_csv('./dataset/train_tfidf_features.csv')
test_df = pd.read_csv('./dataset/test_tfidf_features.csv')

# Training split: 'label' is the target, every column except 'id'/'label' is a feature.
y_train = train_df['label']
X_train = train_df.drop(columns=['id', 'label'])

# Test split: keep the ids aside so predictions can be matched back to rows.
test_ids = test_df['id']
X_test = test_df.drop(columns=['id'])

# Quick sanity check of what was loaded.
for preview in (X_train, y_train):
    print(preview.head())

     0    1    2    3    4    5    6    7    8    9  ...  4990  4991  4992  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 5000 columns]
0    1
1    0
2    1
3    0
4    1
Name: label, dtype: int64


In [90]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like of numbers
        Input value(s); converted to a float NumPy array internally.

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise sigmoid with the same shape as ``z`` (a scalar input
        yields a scalar).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow the way
    # exp(-z) does for large negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value,
    # written so that only the non-overflowing exponential is used.
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # real arrays unchanged.
    return out[()]

In [91]:
def loss(y, y_hat, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array-like
        Ground-truth labels (0 or 1).
    y_hat : array-like
        Predicted probabilities, broadcast-compatible with ``y``.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs so
        that a saturated prediction of exactly 0 or 1 cannot produce
        ``log(0)`` (which would make the result ``inf`` or ``nan``).

    Returns
    -------
    float
        The average negative log-likelihood.
    """
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

In [92]:
def gradients(X, y, y_hat):
    """Gradients of the mean log-loss with respect to weights and bias.

    Parameters
    ----------
    X : array-like with ``.shape`` and ``.T``
        Batch of inputs, one row per sample.
    y : array-like
        True labels for the batch; any shape with as many elements as
        ``y_hat``.
    y_hat : array-like
        Predicted probabilities for the batch.

    Returns
    -------
    dw : numpy.ndarray
        Gradient with respect to the weights.
    db : float
        Gradient with respect to the bias.

    Raises
    ------
    ValueError
        If ``y`` and ``y_hat`` do not contain the same number of elements.
    """
    m = X.shape[0]
    y_hat = np.asarray(y_hat)
    # Force the labels into the predictions' shape: subtracting a flat (m,)
    # vector from an (m, 1) column would otherwise broadcast to (m, m) and
    # silently produce wrong gradients.
    residual = y_hat - np.asarray(y).reshape(y_hat.shape)
    dw = (1/m)*np.dot(X.T, residual)
    db = (1/m)*np.sum(residual)
    return dw, db

In [82]:
def train(X, y, bs, epochs, lr):
    """Fit logistic regression with mini-batch gradient descent.

    Batches are taken in row order (no shuffling) and the parameters are
    updated after every batch. After each epoch the loss over the full
    training set is recorded.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix, one row per sample (DataFrame or ndarray).
    y : array-like
        Labels, one per row of ``X`` (Series, list or ndarray).
    bs : int
        Mini-batch size; must be at least 1.
    epochs : int
        Number of passes over the data.
    lr : float
        Learning rate.

    Returns
    -------
    w : numpy.ndarray of shape (n_features, 1)
        Learned weights.
    b : float
        Learned bias.
    losses : list of float
        Full-data loss after each epoch.

    Raises
    ------
    ValueError
        If ``bs`` is smaller than 1.
    """
    if bs < 1:
        raise ValueError("bs must be a positive integer")

    # Convert once up front: slicing and np.dot on a DataFrame would
    # otherwise re-convert the data on every batch, and ``y.values`` only
    # exists on pandas objects.
    X = np.asarray(X, dtype=float)
    m, n = X.shape
    y = np.asarray(y, dtype=float).reshape(m, 1)

    w = np.zeros((n, 1))
    b = 0.0
    losses = []

    for _ in range(epochs):
        for start_i in range(0, m, bs):
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]

            # Forward pass: predicted probabilities for this batch.
            y_hat = sigmoid(np.dot(xb, w) + b)

            # Gradients of the loss with respect to the parameters.
            dw, db = gradients(xb, yb, y_hat)

            # Gradient-descent step.
            w -= lr*dw
            b -= lr*db

        # Track the loss on the whole training set once per epoch.
        losses.append(loss(y, sigmoid(np.dot(X, w) + b)))

    return w, b, losses

In [83]:
def predict(X, w, b, threshold=0.5):
    """Predict binary class labels from a fitted linear model.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix, one row per sample.
    w : numpy.ndarray
        Weight vector as returned by ``train``.
    b : float
        Bias term.
    threshold : float, optional
        A sample is assigned class 1 when its predicted probability is
        strictly greater than this value (default 0.5).

    Returns
    -------
    numpy.ndarray
        Flat integer array of 0/1 labels, one per row of ``X``.
    """
    probs = np.asarray(sigmoid(np.dot(X, w) + b))
    # Vectorised comparison instead of a per-element Python loop; ravel()
    # flattens the (m, 1) column into the flat (m,) vector callers expect.
    return (probs > threshold).astype(int).ravel()

In [84]:
def accuracy(y_true, y_pred):
    """Fraction of predictions that match the true labels.

    Both inputs are flattened to 1-D arrays first, so lists, pandas Series
    and column vectors of shape ``(m, 1)`` are compared element by element
    by position instead of being broadcast against each other.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same number of elements as ``y_true``.

    Returns
    -------
    float
        Share of positions where the two inputs are equal.

    Raises
    ------
    ValueError
        If the inputs contain a different number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same number of elements")
    return np.sum(y_true == y_pred) / len(y_true)

def f1_score(y_true, y_pred, class_label):
    """F1 score of one class, treating ``class_label`` as the positive class.

    Computed as ``tp / (tp + 0.5 * (fp + fn))``. Both inputs are flattened to
    1-D arrays first so that a flat vector and an ``(m, 1)`` column are
    compared position by position rather than broadcast against each other.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same number of elements as ``y_true``.
    class_label : scalar
        The label to score.

    Returns
    -------
    float
        The F1 score, or ``0.0`` when the class appears in neither the true
        nor the predicted labels (the score is undefined in that case).

    Raises
    ------
    ValueError
        If the inputs contain a different number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same number of elements")

    is_true = y_true == class_label
    is_pred = y_pred == class_label

    tp = np.sum(is_true & is_pred)
    fp = np.sum(~is_true & is_pred)
    fn = np.sum(is_true & ~is_pred)

    denominator = tp + 0.5 * (fp + fn)
    if denominator == 0:
        # No true and no predicted instances of this class.
        return 0.0

    return float(tp / denominator)

def macro_f1_score(y_true, y_pred):
    """Unweighted mean of the per-class F1 scores for classes 1 and 0.

    Class 1 is the hateful class and class 0 the non-hateful class; each
    contributes equally regardless of how many samples it has.
    """
    per_class = [f1_score(y_true, y_pred, class_label=c) for c in (1, 0)]
    return sum(per_class) / 2

In [93]:
# Fit the model on the full training set.
w, b, l = train(X_train, y_train, bs=64, epochs=100, lr=0.01)

# Predicted labels for the training rows (used for the metrics below).
y_train_pred = predict(X_train, w, b)

# Predict the test rows and write them out as (id, label) pairs.
y_test_pred = predict(X_test, w, b)
predictions_df = pd.DataFrame({'id': test_df['id'], 'label': y_test_pred})
predictions_df.to_csv('LogRed_Prediction.csv', index=False)

# Report metrics on the training set only -- no held-out data is scored here.
# NOTE(review): the recorded run shows a class-1 F1 of about 0.03, i.e. the
# model almost never predicts class 1 with these hyper-parameters.
train_report = [
    ("Training set accuracy:", accuracy(y_train, y_train_pred)),
    ("F1 Score for Hateful (class 1):", f1_score(y_train, y_train_pred, 1)),
    ("F1 Score for Non-Hateful (class 0):", f1_score(y_train, y_train_pred, 0)),
    ("Macro F1 Score:", macro_f1_score(y_train, y_train_pred)),
]
for caption, value in train_report:
    print(caption, value)

Training set accuracy: 0.6222067039106145
F1 Score for Hateful (class 1): 0.026394721055788842
F1 Score for Non-Hateful (class 0): 0.7656317689530686
Macro F1 Score: 0.39601324500442875
