## Task 1: Logistic Regression Implementation

### 1.1: Plain Logistic Regression. Score = 0.39226

In [1]:
import numpy as np
import pandas as pd

# Load the train/test feature tables from the dataset folder
train_df = pd.read_csv('./dataset/train_tfidf_features.csv')
test_df = pd.read_csv('./dataset/test_tfidf_features.csv')

# Split off the target column; 'id' is dropped from the feature matrices
y_train = train_df['label']
X_train = train_df.drop(columns=['id', 'label'])
X_test = test_df.drop(columns=['id'])

# Quick look at the first rows of the features and labels
print(X_train.head())
print(y_train.head())

     0    1    2    3    4    5    6    7    8    9  ...  4990  4991  4992  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 5000 columns]
0    1
1    0
2    1
3    0
4    1
Name: label, dtype: int64


In [3]:
def sigmoid(z):
    """Numerically stable logistic function, applied element-wise.

    Evaluates 1 / (1 + exp(-z)) without ever exponentiating a large positive
    number, so large-magnitude negative inputs do not trigger NumPy overflow
    warnings.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        Values in [0, 1] with the same shape as ``z``: a NumPy float scalar
        for scalar input, otherwise a NumPy array.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in [0, 1]: it may underflow to 0 but can never overflow.
    e = np.exp(-np.abs(z))
    # z >= 0 -> 1 / (1 + exp(-z));  z < 0 -> exp(z) / (1 + exp(z)).
    # Both are the same function, each evaluated on its stable side.
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return out[()]

def loss(y, y_hat):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y: Array of labels (expected to be 0/1).
        y_hat: Array of predicted probabilities, broadcastable against ``y``.

    Returns:
        The average log loss.
    """
    # Keep probabilities strictly inside (0, 1): a prediction of exactly 0 or 1
    # would otherwise produce log(0) = -inf and turn the mean into NaN/inf.
    y_hat = np.clip(y_hat, 1e-10, 1 - 1e-10)
    return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

def gradients(X, y, y_hat):
    """Return the averaged gradients for the weights and the bias.

    Args:
        X: Batch of features; its first dimension is the number of samples.
        y: Labels for the batch.
        y_hat: Predicted probabilities for the batch.

    Returns:
        Tuple ``(dw, db)``: X^T (y_hat - y) and sum(y_hat - y), each scaled
        by 1 / number of samples.
    """
    n_samples = X.shape[0]
    scale = 1 / n_samples
    residual = y_hat - y
    grad_w = scale * np.dot(X.T, residual)
    grad_b = scale * np.sum(residual)
    return grad_w, grad_b

def train(X, y, bs, epochs, lr):
    """Fit logistic-regression parameters with mini-batch gradient descent.

    Batches are taken in row order (no shuffling), and the loss over the full
    training set is recorded once at the end of every epoch.

    Args:
        X: Feature matrix of shape (m, n); a DataFrame or any 2-D array-like.
        y: Labels of length m; a Series, array or list.
        bs: Mini-batch size.
        epochs: Number of passes over the data.
        lr: Learning rate (step size).

    Returns:
        Tuple ``(w, b, losses)``: the weights with shape (n, 1), the bias, and
        the list of per-epoch losses.
    """
    # Convert once up front: np.asarray accepts pandas objects, arrays and
    # lists alike, and the loop below then slices a plain array instead of
    # turning a DataFrame slice into an array on every mini-batch.
    X = np.asarray(X, dtype=float)
    m, n = X.shape
    y = np.asarray(y).reshape(m, 1)

    w = np.zeros((n, 1))
    b = 0
    losses = []

    for epoch in range(epochs):
        # Ceiling division so a final, smaller batch is still processed.
        for i in range((m - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]

            # Forward pass on the current batch
            y_hat = sigmoid(np.dot(xb, w) + b)

            # Gradients of the loss w.r.t. the parameters
            dw, db = gradients(xb, yb, y_hat)

            # Gradient-descent step
            w -= lr * dw
            b -= lr * db

        # Track the full-dataset loss after each epoch
        l = loss(y, sigmoid(np.dot(X, w) + b))
        losses.append(l)

    return w, b, losses

def predict(X, w, b):
    """Predict hard 0/1 class labels by thresholding probabilities at 0.5.

    Args:
        X: Feature matrix with one row per sample.
        w: Weight vector, shape (n, 1) or (n,).
        b: Bias term.

    Returns:
        1-D integer array with one entry per row of ``X``: 1 where the
        predicted probability is strictly greater than 0.5, otherwise 0.
    """
    probs = np.asarray(sigmoid(np.dot(X, w) + b))
    # Vectorized threshold instead of a per-row Python loop. The reshape
    # flattens an (m, 1) column to (m,) and raises if there is more than one
    # output column, rather than silently mixing columns together.
    return (probs > 0.5).astype(int).reshape(probs.shape[0])

In [8]:
# Fit the plain model with mini-batch gradient descent
w, b, l = train(X_train, y_train, bs=64, epochs=100, lr=0.01)

# Predict the test labels and write them next to their ids as a CSV
y_test_pred = predict(X_test, w, b)
predictions_df = test_df[['id']].assign(label=y_test_pred)
predictions_df.to_csv('./predictions/LogRed_Prediction_Plain.csv', index=False)

### 1.2: Modified Logistic Regression. Score = 0.62412

In [None]:
import numpy as np
import pandas as pd

# Load the train/test feature tables from the dataset folder
train_df = pd.read_csv('./dataset/train_tfidf_features.csv')
test_df = pd.read_csv('./dataset/test_tfidf_features.csv')

# Labels come from the 'label' column; 'id' is dropped from the feature matrices
y_train = train_df['label']
X_train = train_df.drop(columns=['id', 'label'])
X_test = test_df.drop(columns=['id'])

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), computed in an overflow-safe way.

    Only exp(-|z|) is ever evaluated, so very negative inputs return 0 without
    NumPy overflow warnings.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        Values in [0, 1] shaped like ``z``: a NumPy float scalar for scalar
        input, otherwise a NumPy array.
    """
    z = np.asarray(z, dtype=float)
    # exp of a non-positive number is at most 1, so this cannot overflow.
    decay = np.exp(-np.abs(z))
    denom = 1.0 + decay
    # Non-negative z uses 1 / (1 + exp(-z)); negative z uses the equivalent
    # exp(z) / (1 + exp(z)).
    result = np.where(z >= 0, 1.0 / denom, decay / denom)
    # () indexing returns a scalar for 0-d results and the array otherwise.
    return result[()]

# Loss function with regularization
def loss(y, y_hat, w, lambda_):
    """Mean binary cross-entropy plus an L2 penalty on the weights.

    The penalty is ``lambda_ / (2 * m) * sum(w ** 2)`` where ``m`` is the
    number of labels in ``y``. Its derivative, ``(lambda_ / m) * w``, has the
    same form as the regularization term applied in ``gradients``. Note that
    ``gradients`` takes ``m`` from the batch it is given, whereas here ``m``
    is the number of labels passed in.

    Args:
        y: Array of labels (expected to be 0/1).
        y_hat: Array of predicted probabilities, broadcastable against ``y``.
        w: Weight array being penalized.
        lambda_: Regularization strength.

    Returns:
        Log loss plus the L2 penalty.
    """
    # Guard against division by zero when y is empty.
    m = max(np.size(y), 1)
    y_hat = np.clip(y_hat, 1e-10, 1 - 1e-10)  # Avoid log(0)
    log_loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    # Scale by 1/m so the penalty corresponds to the (lambda_ / m) * w term
    # that the weight update applies; without it the reported loss
    # over-weights the penalty relative to what is being descended.
    reg_loss = (lambda_ / (2 * m)) * np.sum(w**2)  # L2 regularization
    return log_loss + reg_loss

# Gradients with an L2 regularization term on the weights
def gradients(X, y, y_hat, w, lambda_):
    """Return the averaged gradients, with an L2 term added to the weights.

    Args:
        X: Batch of features; its first dimension is the number of samples.
        y: Labels for the batch.
        y_hat: Predicted probabilities for the batch.
        w: Current weights.
        lambda_: Regularization strength.

    Returns:
        Tuple ``(dw, db)``. ``dw`` is X^T (y_hat - y) / m plus
        (lambda_ / m) * w; ``db`` is sum(y_hat - y) / m and carries no
        regularization term. ``m`` is the number of rows in ``X``.
    """
    n_rows = X.shape[0]
    inv_n = 1 / n_rows
    residual = y_hat - y
    grad_w = inv_n * np.dot(X.T, residual) + (lambda_ / n_rows) * w
    grad_b = inv_n * np.sum(residual)
    return grad_w, grad_b

def train(X, y, bs, epochs, lr, lambda_):
    """Fit L2-regularized logistic regression with mini-batch gradient descent.

    Batches are taken in row order (no shuffling). After every epoch the loss
    over the full training set is recorded, and training stops early once two
    consecutive epoch losses differ by less than 1e-6.

    Args:
        X: Feature matrix of shape (m, n); a DataFrame or any 2-D array-like.
        y: Labels of length m; a Series, array or list.
        bs: Mini-batch size.
        epochs: Maximum number of passes over the data.
        lr: Learning rate (step size).
        lambda_: Regularization strength passed to ``gradients`` and ``loss``.

    Returns:
        Tuple ``(w, b, losses)``: the weights with shape (n, 1), the bias, and
        the list of per-epoch losses.
    """
    # Convert once up front: np.asarray accepts pandas objects, arrays and
    # lists alike, and the loop below then slices a plain array instead of
    # turning a DataFrame slice into an array on every mini-batch.
    X = np.asarray(X, dtype=float)
    m, n = X.shape
    y = np.asarray(y).reshape(m, 1)

    w = np.zeros((n, 1))
    b = 0
    losses = []

    for epoch in range(epochs):
        # Ceiling division so a final, smaller batch is still processed.
        for i in range((m - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]

            # Forward pass on the current batch
            y_hat = sigmoid(np.dot(xb, w) + b)

            # Regularized gradients for this batch
            dw, db = gradients(xb, yb, y_hat, w, lambda_)

            # Gradient-descent step
            w -= lr * dw
            b -= lr * db

        # Track the full-dataset loss after each epoch
        l = loss(y, sigmoid(np.dot(X, w) + b), w, lambda_)
        losses.append(l)

        # Convergence check: stop when the epoch loss has stopped moving
        if epoch > 0 and np.abs(losses[-1] - losses[-2]) < 1e-6:
            print(f'Converged at epoch {epoch}')
            break

    return w, b, losses

def predict(X, w, b):
    """Turn predicted probabilities into 0/1 labels using a 0.5 threshold.

    Args:
        X: Feature matrix with one row per sample.
        w: Weight vector, shape (n, 1) or (n,).
        b: Bias term.

    Returns:
        1-D integer array with one entry per row of ``X``: 1 where the
        predicted probability is strictly greater than 0.5, otherwise 0.
    """
    probs = np.asarray(sigmoid(np.dot(X, w) + b))
    # Compare the whole array at once rather than looping row by row in
    # Python. The reshape flattens an (m, 1) column to (m,) and raises if
    # there is more than one output column.
    return (probs > 0.5).astype(int).reshape(probs.shape[0])

# Regularization strength for the modified model
lambda_ = 0.01

# Fit with early stopping (up to 1000 epochs)
w, b, l = train(X_train, y_train, bs=64, epochs=1000, lr=0.01, lambda_=lambda_)

# Predict the test labels and write them next to their ids as a CSV
y_test_pred = predict(X_test, w, b)
predictions_df = test_df[['id']].assign(label=y_test_pred)
predictions_df.to_csv('./predictions/LogRed_Prediction.csv', index=False)


### 1.3: SKLearn Logistic Regression for comparison. Score = 0.68446

In [13]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Load the train/test feature tables from the dataset folder
train_df = pd.read_csv('./dataset/train_tfidf_features.csv')
test_df = pd.read_csv('./dataset/test_tfidf_features.csv')

# Labels come from the 'label' column; 'id' is dropped from the feature matrices
y_train = train_df['label']
X_train = train_df.drop(columns=['id', 'label'])
X_test = test_df.drop(columns=['id'])

# Fit scikit-learn's LogisticRegression with its default settings
# (fit returns the estimator itself, so it can be chained)
log_reg = LogisticRegression().fit(X_train, y_train)

# Predictions on the training data (computed here but not used further in this cell)
y_train_pred = log_reg.predict(X_train)

# Predictions on the test data
y_test_pred = log_reg.predict(X_test)

# Write the test predictions next to their ids as a CSV
output = test_df[["id"]].assign(label=y_test_pred)
output.to_csv("./predictions/SK_Learn_LogisticRegression_Predictions.csv", index=False)
