In [161]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show NumPy arrays with 3 decimal places and fixed-point (non-scientific) notation.
np.set_printoptions(precision=3, suppress=True)

In [162]:
# Read the three competition CSVs (train, test, sample) in one pass
train, test, sample = map(pd.read_csv, (
    'iitb-cs-725-1-2024/train.csv',
    'iitb-cs-725-1-2024/test.csv',
    'iitb-cs-725-1-2024/sample.csv',
))

In [163]:
# Create datasets
def create_datasets(data):
    """Split a labelled frame into a feature matrix and a target column vector.

    Args:
        data: DataFrame containing a 'score' column (the target), an 'ID'
            column (dropped) and any number of feature columns.

    Returns:
        (X, y): X is a NumPy array of every column except 'score' and 'ID';
        y is the 'score' column reshaped to (n_rows, 1).
    """
    features = data.drop(columns=['score', 'ID'])
    targets = data['score']
    return np.array(features), np.array(targets).reshape(-1, 1)

# Shuffle the data
# A fixed random_state makes the split reproducible across kernel restarts, and writing
# the shuffled frame to a new name keeps this cell idempotent: re-running it does not
# reshuffle an already shuffled frame.
# NOTE: weights saved under a different shuffle may have been trained on rows that now
# fall in the validation / held-out splits.
train_shuffled = train.sample(frac=1, random_state=42).reset_index(drop=True)

# 80% train / 10% validation / 10% held-out test, all carved out of train.csv
n_rows = train_shuffled.shape[0]
n_train = int(0.8 * n_rows)
n_val = int(0.9 * n_rows)
X_train, y_train = create_datasets(train_shuffled[:n_train])
X_val, y_val = create_datasets(train_shuffled[n_train:n_val])
X_test, y_test = create_datasets(train_shuffled[n_val:])

X_train.shape, X_val.shape, X_test.shape

((27990, 64), (3499, 64), (3499, 64))

In [172]:
def predict(X, W):
    """Linear model output: the matrix product of inputs X and weights W."""
    y_hat = X @ W
    return y_hat

def mse(y_true, y_pred):
    """Mean of the squared element-wise differences between y_true and y_pred.

    The difference is formed with ordinary NumPy broadcasting, so both arguments
    should have the same shape to get a per-sample error.
    """
    residuals = y_true - y_pred
    return np.mean(residuals ** 2)

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of mse(y_true, y_pred)."""
    mean_squared = mse(y_true, y_pred)
    return np.sqrt(mean_squared)

def compute_gradients(X, y, y_pred):
    """Gradient of the mean squared error with respect to the weights.

    Args:
        X: input matrix; its first dimension is the number of samples.
        y: targets.
        y_pred: current model predictions for X.

    Returns:
        -2 * X.T @ (y - y_pred) divided by the number of rows of X.
    """
    n_samples = X.shape[0]
    residuals = y - y_pred
    return -2 * X.T @ residuals / n_samples

def fit(X, y, W, Xval, yval, lr=0.01, epochs=100, print_every=100):
    """Train linear-model weights with full-batch gradient descent.

    Each epoch measures the train and validation MSE of the current weights,
    then takes one gradient step on the training MSE.

    Args:
        X: training inputs.
        y: training targets.
        W: initial weights as a NumPy array. It is updated IN PLACE
            (``W -= lr * dW``), so the caller's array holds the latest weights
            after the call, including when the call is interrupted.
        Xval: validation inputs.
        yval: validation targets.
        lr: learning rate (step size).
        epochs: number of gradient steps.
        print_every: print progress every this many epochs (and on the last
            epoch). A falsy value (0 / None) disables the periodic printing.

    Returns:
        (best_W, errors, val_errors):
            best_W is an independent copy of the weights with the lowest
            validation MSE seen (the initial weights if none ever improved,
            e.g. ``epochs == 0`` or NaN losses);
            errors / val_errors are the per-epoch train / validation MSE,
            each measured before that epoch's update.
    """
    errors = []
    val_errors = []
    best_val_error = float('inf')
    # Start from a copy of the initial weights so a usable array is always returned.
    best_W = np.copy(W)

    for i in range(epochs):
        y_pred = predict(X, W)
        loss = mse(y, y_pred)

        y_val_pred = predict(Xval, W)
        val_loss = mse(yval, y_val_pred)
        val_rounded_loss = mse(yval, y_val_pred.round())

        errors.append(loss)
        val_errors.append(val_loss)

        # Snapshot BEFORE the update, so the saved weights are exactly the ones that
        # produced val_loss. A copy is required because W is modified in place below;
        # storing a reference would make best_W track the latest weights.
        if val_loss < best_val_error:
            best_val_error = val_loss
            best_W = np.copy(W)

        if (print_every and i % print_every == 0) or i == epochs - 1:
            print(f'Epoch {i}, Loss: {loss}, Val Loss: {val_loss}, Val Rounded Loss: {val_rounded_loss}')

        dW = compute_gradients(X, y, y_pred)
        W -= lr * dW

    # The weights produced by the last step have not been scored yet; give them
    # a chance to be selected too.
    if epochs > 0:
        final_val_loss = mse(yval, predict(Xval, W))
        if final_val_loss < best_val_error:
            best_W = np.copy(W)

    return best_W, errors, val_errors

In [165]:
# Closed form solution
# Baseline for the gradient-descent model: fit on the raw (untransformed) training
# features and report RMSE on the validation split.
# NOTE(review): LinearRegressionClosedForm comes from the local closedForm module, which is
# not shown here; presumably it solves least squares directly - confirm.

from closedForm import LinearRegressionClosedForm

model = LinearRegressionClosedForm()

model.fit(X_train, y_train)

# Validation RMSE of the baseline (displayed as the cell output)
y_pred = model.predict(X_val)
rmse(y_val, y_pred)

3.278457375137263

In [166]:
def plot_errors(errors, val_errors):
    """Draw the train and validation loss curves on the current figure and show it.

    Args:
        errors: sequence of training losses, one per epoch.
        val_errors: sequence of validation losses, one per epoch.
    """
    curves = ((errors, 'Train'), (val_errors, 'Validation'))
    for losses, legend_name in curves:
        plt.plot(losses, label=legend_name)

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [179]:
# Feature engineering

def gaussian_features(X, mu=None, sigma=None):
    """Gaussian (RBF-style) transform of each column: exp(-0.5 * z**2), z = (X - mu) / sigma.

    Args:
        X: input array; statistics are taken along axis 0 (per column).
        mu: optional per-column centres. Defaults to the column means of X.
        sigma: optional per-column scales. Defaults to the column standard
            deviations of X.

    Returns:
        Array with the same shape as X, values in (0, 1].

    Notes:
        With the defaults, the centres and scales are computed from whatever
        array is passed in, so two different splits are transformed with
        different statistics. Pass the training-set ``mu`` / ``sigma``
        explicitly to apply an identical transform to other data.
        Columns with zero scale (constant columns) would produce 0/0 = NaN;
        their scale is replaced by 1 so the feature evaluates to exp(0) = 1.
    """
    if mu is None:
        mu = np.mean(X, axis=0)
    if sigma is None:
        sigma = np.std(X, axis=0)
    sigma = np.asarray(sigma, dtype=float)
    # Guard against division by zero for constant columns.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return np.exp(-0.5 * ((X - mu) / safe_sigma)**2)

def transform_input(X):
    """Expand the raw features with non-linear terms.

    Stacks, side by side, the original features, their squares, their sines
    and their gaussian_features() transform.
    """
    feature_groups = (
        X,
        X**2,
        np.sin(X),
        gaussian_features(X),
    )
    return np.hstack(feature_groups)

X_train_poly = transform_input(X_train)
X_val_poly = transform_input(X_val)

# Seeded generator so the random initial weights are identical on every run.
rng = np.random.default_rng(0)
W_poly = rng.standard_normal((X_train_poly.shape[1], 1))

# Show the expanded feature-matrix shape (must be the last expression to be displayed).
X_train_poly.shape

In [182]:
# Warm-start from previously saved weights when available. 'weights3.npy' is written by
# the export cell further down, so on a fresh checkout it may not exist yet; in that
# case keep the current W_poly instead of aborting the run.
try:
    W_poly = np.load('weights3.npy')
except FileNotFoundError:
    print('weights3.npy not found; keeping the current W_poly')

In [186]:
# Continue training the expanded-feature model from the current W_poly.
# fit() updates its W argument in place (W -= lr * dW), so W_poly itself changes as
# training progresses, even if the call is interrupted before W_train_poly is assigned.
W_train_poly, errors, val_errors = fit(X_train_poly, y_train, W_poly, X_val_poly, y_val, lr=0.003, epochs=10000, print_every=100)

# Learning curves: per-epoch train vs. validation MSE.
plot_errors(errors, val_errors)

# Validation RMSE of the weights returned by fit (displayed as the cell output).
y_pred = predict(X_val_poly, W_train_poly)
rmse(y_val, y_pred)

Epoch 0, Loss: 1.1202502412179836, Val Loss: 1.1270387654161573, Val Rounded Loss: 1.2186338953986853
Epoch 100, Loss: 1.104743465379127, Val Loss: 1.1116841756860893, Val Rounded Loss: 1.1974849957130609
Epoch 200, Loss: 1.0904675149786638, Val Loss: 1.0975494865234698, Val Rounded Loss: 1.1906258931123177
Epoch 300, Loss: 1.0773204664012772, Val Loss: 1.0845335414762693, Val Rounded Loss: 1.179765647327808
Epoch 400, Loss: 1.065208987439733, Val Loss: 1.0725437141765821, Val Rounded Loss: 1.1623320948842526
Epoch 500, Loss: 1.0540476001948351, Val Loss: 1.061495176412106, Val Rounded Loss: 1.1437553586739069
Epoch 600, Loss: 1.04375800889961, Val Loss: 1.05131023072061, Val Rounded Loss: 1.138896827665047
Epoch 700, Loss: 1.0342684866845038, Val Loss: 1.0419177015422636, Val Rounded Loss: 1.119748499571306
Epoch 800, Loss: 1.025513315898135, Val Loss: 1.0332523795658368, Val Rounded Loss: 1.1123178050871678
Epoch 900, Loss: 1.017432277127535, Val Loss: 1.0252545144358143, Val Rounded

KeyboardInterrupt: 

In [159]:
# export weights
np.save('weights3.npy', W_train_poly)
# Keep an independent copy: a plain assignment only aliases the array, so any later
# in-place update of the weights would silently change the "backup" as well.
W_backup = W_train_poly.copy()

In [171]:
# Validation MSE after rounding the predictions to integers, using the current W_poly.
X_val_poly = transform_input(X_val)
y_pred = predict(X_val_poly, W_poly)
y_pred_round = y_pred.round()

mse(y_val, y_pred_round)

0.9534152615032867

In [161]:
# RMSE of the unrounded predictions on X_test / y_test (the labelled hold-out slice taken
# from train.csv, not the competition test file), using the current W_poly.
X_test_poly = transform_input(X_test)
y_test_pred = predict(X_test_poly, W_poly)

rmse(y_test, y_test_pred)

np.float64(0.9308557733835064)

In [162]:
# Get output for final test data
X_final_test = np.array(test.drop(columns='ID'))
X_final_test_poly = transform_input(X_final_test)
# Predict with the current W_poly and round to integer scores
y_final_test = predict(X_final_test_poly, W_poly).round()

output = pd.DataFrame({'ID': test['ID'], 'score': y_final_test.flatten()})

# output['score'] = output['score'].clip(0, 5)

# Sanity check: how many predictions fall below 0 or above 5 (clipping is disabled above)
below0 = int((output['score'] < 0).sum())
above5 = int((output['score'] > 5).sum())

print(len(output), below0, above5)

output.to_csv('output.csv', index=False)

14996 0 1
