In [92]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

import pandas as pd
from tqdm.auto import tqdm, trange


In [93]:
class LinearRegressionFFN(nn.Module):
    """Small two-layer feed-forward network that emits one value in (0, 1).

    Pipeline: ``linear1`` -> softmax over the last dimension -> ``linear2``
    -> sigmoid.

    Args:
        input_dim: Number of features in each input vector.
        hidden_dim: Width of the hidden layer (defaults to 5).
    """

    def __init__(self, input_dim, hidden_dim=5):
        super().__init__()
        # Layers are created in this order so that random initialisation
        # consumes the RNG in the same sequence every time.
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 1)

    def forward(self, input):
        """Map a feature tensor ``(..., input_dim)`` to a tensor ``(..., 1)``."""
        hidden_logits = self.linear1(input)
        # NOTE(review): softmax is an unusual choice for a hidden activation
        # (it forces the hidden units to sum to 1) - confirm this is intended.
        hidden_weights = hidden_logits.softmax(dim=-1)
        # The sigmoid bounds the final output to the open interval (0, 1).
        return self.linear2(hidden_weights).sigmoid()


In [94]:
# Locations of the training and validation splits, relative to this notebook.
train_file = "../model_data/branch_train.csv"
val_file = "../model_data/branch_val.csv"

# Read each split into its own DataFrame.
branch_train_data = pd.read_csv(train_file)
branch_val_data = pd.read_csv(val_file)

# Stack the validation rows underneath the training rows so that both splits
# are used for fitting. Each frame keeps its original row index.
branch_training_data = pd.concat([branch_train_data, branch_val_data])

In [95]:
# Column holding the regression target, and a text column that is not a feature.
branch_prob = 'branch_prob'
raw_string = 'raw_string'

# Features are every column except the target and the raw text.
# errors="ignore" keeps the cell working when one of the two columns is absent
# from the frame (the target column is still required by the next line).
clean_train_data = branch_training_data.drop(columns=[branch_prob, raw_string], errors="ignore")
clean_train_labels = branch_training_data[branch_prob].tolist()

# Convert the whole frame to float32 in one step instead of building a pandas
# Series per row with .iloc[i]; iterating a 2-D tensor yields its rows.
train_features = torch.tensor(clean_train_data.to_numpy(dtype="float32"))
# unsqueeze(1) gives every label shape (1,), matching the model's output shape.
train_targets = torch.tensor(clean_train_labels, dtype=torch.float32).unsqueeze(1)

# List of (feature_vector, label) pairs, one per row, in frame order.
training_data = list(zip(train_features, train_targets))

## Training Loop

In [96]:
# Seed PyTorch before the model is built: nn.Linear initialises its weights
# randomly, so without a fixed seed every Restart & Run All trains a different
# model and reports different metrics.
SEED = 0
torch.manual_seed(SEED)

model = LinearRegressionFFN(input_dim=len(clean_train_data.columns))
model.train()

epochs = 10
criterion = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=.001)


for epoch in range(epochs):
    # One optimiser step per example (batch size 1), visiting the examples in
    # their stored order on every epoch.
    for inputs, label in tqdm(training_data):
        optimizer.zero_grad()  # clear gradients left over from the previous step
        prediction = model(inputs)
        loss = criterion(prediction, label)

        loss.backward()
        optimizer.step()

100%|██████████| 1701/1701 [00:00<00:00, 2005.93it/s]
100%|██████████| 1701/1701 [00:00<00:00, 2285.66it/s]
100%|██████████| 1701/1701 [00:01<00:00, 1504.95it/s]
100%|██████████| 1701/1701 [00:00<00:00, 1962.76it/s]
100%|██████████| 1701/1701 [00:00<00:00, 2144.88it/s]
100%|██████████| 1701/1701 [00:00<00:00, 2133.92it/s]
100%|██████████| 1701/1701 [00:01<00:00, 1523.16it/s]
100%|██████████| 1701/1701 [00:01<00:00, 1463.45it/s]
100%|██████████| 1701/1701 [00:00<00:00, 1725.02it/s]
100%|██████████| 1701/1701 [00:00<00:00, 1850.18it/s]


In [97]:
# Held-out test split, relative to this notebook.
test_file = "../model_data/branch_test.csv"
branch_test_data = pd.read_csv(test_file)

# Column holding the regression target, and a text column that is not a feature.
branch_prob = 'branch_prob'
raw_string = 'raw_string'

# Features are every column except the target and the raw text.
# errors="ignore" keeps the cell working when one of the two columns is absent
# from the frame (the target column is still required by the next line).
clean_test_data = branch_test_data.drop(columns=[branch_prob, raw_string], errors="ignore")
clean_test_labels = branch_test_data[branch_prob].tolist()

# Convert the whole frame to float32 in one step instead of building a pandas
# Series per row with .iloc[i]; iterating a 2-D tensor yields its rows.
test_features = torch.tensor(clean_test_data.to_numpy(dtype="float32"))
# unsqueeze(1) gives every label shape (1,), matching the model's output shape.
test_targets = torch.tensor(clean_test_labels, dtype=torch.float32).unsqueeze(1)

# List of (feature_vector, label) pairs, one per row, in frame order.
test_data = list(zip(test_features, test_targets))

In [105]:
# Put the module in evaluation mode before scoring the test examples.
model.eval()

predictions = []

# Gradients are not needed for inference, so autograd tracking is disabled.
with torch.no_grad():
    for inputs, _label in tqdm(test_data):
        # For a 1-D feature vector the model returns a 1-element tensor;
        # extend() iterates it, so a single 0-d tensor is appended per example.
        predictions.extend(model(inputs))

100%|██████████| 264/264 [00:00<00:00, 9989.41it/s]


In [99]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Turn each prediction into a plain Python float so scikit-learn receives
# ordinary numbers rather than tensor objects.
predicted_values = [float(p) for p in predictions]

# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root explicitly gives
# the same RMSE on every version.
rmse = mean_squared_error(clean_test_labels, predicted_values) ** 0.5
mae = mean_absolute_error(clean_test_labels, predicted_values)

print(f"RMSE for Test Set: {rmse}") #RMSE
print(f"MAE for Test Set: {mae}")

RMSE for Test Set: 0.3713781082928041
MAE for Test Set: 0.3057702435078154


In [106]:
# Output CSV, written next to this notebook.
predictions_file = "pytorch_test_predictions.csv"

# .item() unwraps each stored prediction into a plain Python number.
test_predictions = pd.Series([p.item() for p in predictions])

# One prediction per line, with no index column and no header row.
test_predictions.to_csv(predictions_file, index=False, header=False)