In [None]:
import torch
import spacy
import pandas as pd
# !python -m spacy download en_core_web_md
import json
import numpy as np
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [None]:
train_df = pd.read_csv("traintxt.csv")
train_df.head() #y

In [None]:
test_df = pd.read_csv("testtxt.csv")
test_df.head()

In [None]:
SPACY_VEC_SIZE = 300
nlp = spacy.load("en_core_web_md")
def text_processor(text:str):
    tokens = nlp(text)
    return tokens.vector

In [152]:
class TextModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(SPACY_VEC_SIZE, 64)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(64, 16)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(16, 1)
        self.act_output = nn.Sigmoid()

    def forward(self, x):
        x = self.act1(self.hidden1(x))
        x = self.act2(self.hidden2(x))
        x = self.act_output(self.output(x))
        return x

model = TextModel()
model.requires_grad = True
print(model)

TextModel(
  (hidden1): Linear(in_features=300, out_features=64, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=64, out_features=16, bias=True)
  (act2): ReLU()
  (output): Linear(in_features=16, out_features=1, bias=True)
  (act_output): Sigmoid()
)


In [153]:
learning_rate = 0.0003
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,weight_decay=0.00001)

In [154]:
# data loader
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.x_list = df["x"].values
        self.y_list = df["y"].values

    def __getitem__(self, idx):
        txt_vec = text_processor(self.x_list[idx])
        x_item = torch.tensor(txt_vec).float()
        y_item = self.y_list[idx]
        y_item = torch.tensor([y_item]).float()
        return (x_item,y_item)

    def __len__(self):
        return len(self.x_list)

In [169]:
batch_size = 128
train_data = CustomDataset(train_df)
train_dataloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_data = CustomDataset(test_df)
test_dataloader = torch.utils.data.DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

## taining loop

In [170]:
def model_logs(text):
    with open("model.log", "a+") as myfile:
        myfile.write(f"{text}\n")

In [171]:
num_epochs = 3

for epoch in range(num_epochs):
    loss_values = []
    for i,data in enumerate(tqdm(train_dataloader)):
        X, y = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        pred = model(X)
        loss = loss_fn(pred, y)
        loss_values.append(loss.item())
        loss.backward()
        optimizer.step()
    # 
    model_logs(f"epoch:{epoch}  loss:,{np.average(loss_values)}")
    
    

100%|██████████| 1172/1172 [11:52<00:00,  1.64it/s]
100%|██████████| 1172/1172 [11:37<00:00,  1.68it/s]
100%|██████████| 1172/1172 [11:49<00:00,  1.65it/s]
