In [44]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm
from datetime import datetime
from sklearn.metrics import roc_auc_score
from sentence_transformers import SentenceTransformer

Select the GPU if one is available (otherwise fall back to the CPU):

In [45]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.cuda.get_device_name() raises when no CUDA device is usable, so only
# query the name when the GPU was actually selected.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
print(device)

NVIDIA GeForce RTX 3060 Laptop GPU
cuda


Load the data and split it into training and test sets:

In [46]:
# Read the raw CSV and keep only the text column and the binary label column.
data = pd.read_csv(
    r"C:\Users\email\OneDrive\Documents\Python\quora_classifier\data\original.csv"
)[['txt', 'has_template']]

# Stratified, seeded 80/20 split: `data` becomes the training frame, `test` the
# held-out frame, both with the same label balance.
data, test = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    stratify=data['has_template'],
    random_state=999,
)
data

Unnamed: 0,txt,has_template
21016,Theory Z is a name applied to two distinctly d...,0
24880,"Bettystown (), previously known as Betaghstown...",0
15447,Goop has faced severe criticism for promoting ...,1
11240,Track Listing Steve Eaves's Myspace Page,1
14332,Reality-based community is an informal term in...,1
...,...,...
19602,"Jim Bowen (born in Heswall, Cheshire, England ...",0
19076,Bell-bottoms (or flares) are a style of pants ...,0
14487,"Tunceli Province (, , ), formerly Dersim Provi...",1
202,right|thumb|200px|A live frog is magnetically ...,1


In [61]:
class Sequences(Dataset):
    """Dataset yielding ``(sentence_embedding, label)`` pairs from a DataFrame.

    Each row's ``txt`` value is passed to the encoder's ``encode`` method and
    paired with that row's ``has_template`` value.

    Args:
        data: DataFrame with ``txt`` and ``has_template`` columns. It is stored
            re-indexed to 0..n-1 so positional indices from a DataLoader work.
        model: Optional encoder object exposing ``encode(text)``. Pass an
            already-loaded encoder to share it between several datasets; when
            omitted, ``SentenceTransformer('bert-base-nli-mean-tokens')`` is
            loaded (the original behaviour).
        cache: When true (default), every embedding is computed once and reused
            on later accesses, so repeated epochs do not re-run the encoder.
            Set to false to encode on every access and keep memory use minimal.
    """

    def __init__(self, data, model=None, cache=True) -> None:
        self.data = data.reset_index(drop = True)
        if model is None:
            model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.model = model
        # index -> embedding; None disables caching entirely
        self._cache = {} if cache else None

    def __getitem__(self, i):
        # return the ith sample's sentence embedding and label
        label = self.data.has_template[i]
        if self._cache is None:
            return self.model.encode(self.data.loc[i, 'txt']), label
        if i not in self._cache:
            self._cache[i] = self.model.encode(self.data.loc[i, 'txt'])
        return self._cache[i], label

    def __len__(self):
        # return number of samples
        return self.data.shape[0]


In [62]:
dataset = Sequences(data)
# Reshuffle on every epoch so the optimizer does not see the exact same
# batches in the exact same order each time through the data.
train_loader = DataLoader(dataset, batch_size = 512, shuffle = True)

Build model:

In [63]:
class BagOfWordsClassifier(nn.Module):
    """Fully connected binary classifier producing one raw logit per sample.

    Three ReLU-activated hidden layers are followed by a single-unit linear
    output layer; no sigmoid is applied inside the network.

    Args:
        vocab_size: Width of each input feature vector.
        hidden1: Units in the first hidden layer.
        hidden2: Units in the second hidden layer.
        hidden3: Units in the third hidden layer.
    """

    def __init__(self, vocab_size, hidden1, hidden2, hidden3):
        super().__init__()
        # Attribute names and creation order are kept so saved state dicts
        # continue to load.
        self.fc1 = nn.Linear(in_features=vocab_size, out_features=hidden1)
        self.fc2 = nn.Linear(in_features=hidden1, out_features=hidden2)
        self.fc3 = nn.Linear(in_features=hidden2, out_features=hidden3)
        self.fc4 = nn.Linear(in_features=hidden3, out_features=1)

    def forward(self, inputs):
        """Return the logits for ``inputs`` after casting them to float32."""
        # squeeze(1) removes dimension 1 only when it has size 1; otherwise it
        # leaves the tensor unchanged.
        activations = inputs.squeeze(1).float()
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            activations = F.relu(hidden_layer(activations))
        return self.fc4(activations)

In [64]:
# 768 is the input width (presumably the size of the sentence embeddings
# produced by Sequences -- confirm); 128/64/8 are the hidden layer widths.
# nn.Module.to() returns the module itself, so construction and device
# placement can be chained.
model = BagOfWordsClassifier(vocab_size=768, hidden1=128, hidden2=64, hidden3=8).to(device)
# Show whether the parameters ended up on the GPU.
next(model.parameters()).is_cuda

True

Set the loss function and the optimizer:

In [65]:
# Binary cross-entropy that takes raw logits (it applies the sigmoid itself).
criterion = nn.BCEWithLogitsLoss()
# Hand Adam only the parameters that are flagged as trainable.
optimizer = optim.Adam(
    filter(lambda param: param.requires_grad, model.parameters()),
    lr=1e-3,
)

In [100]:
def training_loop(epochs=3):
    """Train the notebook-level ``model`` and report metrics after each epoch.

    Relies on the globals ``model``, ``train_loader``, ``criterion``,
    ``optimizer`` and ``device``. After every epoch one summary line (mean loss
    and accuracy over all samples seen in that epoch) is written via
    ``tqdm.write``.

    Args:
        epochs: Number of passes over ``train_loader`` (default 3).

    Returns:
        list[float]: The mean training loss of each epoch, in order.
    """
    # Created once, outside the epoch loop, so the history survives all epochs.
    train_losses = []
    model.train()

    for epoch in range(epochs):
        progress_bar = tqdm(train_loader, leave=False)
        loss_sum = 0.0
        correct = 0
        seen = 0

        for txt, has_template in progress_bar:
            txt = txt.to(device)
            has_template = has_template.to(device)

            model.zero_grad()

            # Flatten to shape (batch,) explicitly. A bare squeeze() would also
            # drop the batch dimension of a final batch holding one sample and
            # make the loss reject the mismatched shapes.
            logits = model(txt).reshape(-1)
            loss = criterion(logits, has_template.float())
            loss.backward()
            optimizer.step()

            batch_size = has_template.shape[0]
            batch_correct = torch.eq(torch.sigmoid(logits) >= 0.5, has_template).sum().item()
            accuracy = batch_correct / batch_size

            # Accumulate per-sample totals so a short last batch is not
            # over-weighted in the epoch averages.
            loss_sum += loss.item() * batch_size
            correct += batch_correct
            seen += batch_size

            progress_bar.set_description(f'Loss: {loss.item():.3f}, Train Accuracy: {accuracy:.3f}')

        # An empty loader yields NaN instead of a ZeroDivisionError.
        epoch_loss = loss_sum / seen if seen else float('nan')
        epoch_accuracy = correct / seen if seen else float('nan')
        train_losses.append(epoch_loss)

        tqdm.write(f'Epoch #{epoch + 1}\tLoss: {epoch_loss:.3f}\tAccuracy: {epoch_accuracy:.3f}')

    return train_losses

    

In [101]:
# Run training; one summary line (loss and accuracy) is printed per epoch.
training_loop()

  0%|          | 0/49 [00:00<?, ?it/s]

Epoch #1	Loss: 0.668	Accuracy: 0.551


  0%|          | 0/49 [00:00<?, ?it/s]

Epoch #2	Loss: 0.666	Accuracy: 0.552


  0%|          | 0/49 [00:00<?, ?it/s]

Epoch #3	Loss: 0.665	Accuracy: 0.558


In [134]:
def test_loop(probabilities=False):
    """Run the notebook-level ``model`` over the held-out ``test`` frame.

    Relies on the globals ``model``, ``test`` and ``device``. The global
    ``test`` frame is not modified: ``Sequences`` re-indexes its own copy.

    Args:
        probabilities: When true, return the sigmoid outputs themselves
            (suitable for ranking metrics such as ROC AUC). The default,
            false, returns them rounded to 0.0 / 1.0 as before.

    Returns:
        list[float]: One value per test row, in row order.
    """
    test_dataset = Sequences(test)
    test_loader = DataLoader(test_dataset, batch_size = 512)
    progress_bar = tqdm(test_loader, leave=False)

    predictions = []

    # Evaluate in eval mode without building autograd graphs, then put the
    # model back into whatever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for txt, _ in progress_bar:
                # reshape(-1) keeps a 1-D tensor even for a batch of one, so
                # tolist() always returns a list.
                scores = torch.sigmoid(model(txt.to(device))).reshape(-1)
                if not probabilities:
                    scores = torch.round(scores)
                predictions.extend(scores.tolist())
    finally:
        model.train(was_training)

    return predictions
        


In [135]:
# Collect the model's rounded 0/1 predictions for every row of the test split.
predictions = test_loop()

  0%|          | 0/13 [00:00<?, ?it/s]

In [102]:
# Persist only the learned weights (the state dict), not the module object.
torch.save(obj=model.state_dict(), f="BERT_wikipedia_file")

In [None]:
# Rebuild the network with the layer sizes used for training. The previous
# `len(dataset.token2idx)` referred to an attribute that Sequences does not
# have, and the old checkpoint path pointed at a different architecture.
model = BagOfWordsClassifier(768, 128, 64, 8)
# Restore the weights written by the torch.save cell; map_location lets a
# checkpoint saved on the GPU load on a CPU-only machine as well.
model.load_state_dict(torch.load("BERT_wikipedia_file", map_location=device))
model.to(device)
model.eval()

In [128]:
# Classify one hand-written sentence. The encoder already loaded by `dataset`
# is reused; `test_model` only ever existed as a local inside test_loop().
sample_text = "I am gay boy and I love my cousin (boy). He is sexy, but I dont know what to do. He is hot"
with torch.no_grad():
    # Shape (1, embedding_dim): a batch containing a single embedding.
    sample_embedding = torch.tensor(dataset.model.encode(sample_text)).reshape(1, -1).to(device)
    sample_logit = model(sample_embedding)
round(torch.sigmoid(sample_logit).item())

1

In [136]:
# `predictions` comes from test_loop(), which rounds each sigmoid output to
# 0 or 1, so this AUC is computed from hard labels rather than from
# probability scores.
roc_auc_score(test['has_template'], predictions)

0.5389597719971848