In [20]:
import torch
import pandas as pd

from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained("models/RoBERT.pth")
tokenizer = AutoTokenizer.from_pretrained("models/Tokenizer_RoBERT.pth/")

In [6]:
df = pd.read_csv("yerdaulet_annot.csv")

train, val = train_test_split(df, test_size=0.2)

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [7]:
train_encodings = tokenizer(list(train.context), list(train.question), max_length=512, truncation=True, padding='max_length', return_tensors='pt')

val_encodings = tokenizer(list(val.context), list(val.question), max_length=512, truncation=True, padding='max_length', return_tensors='pt')

In [13]:
def add_token_positions(encodings, df):
    start_positions = []
    end_positions = []


    for i in range(len(df['start_position'])):
        target_answer = df.at[i, "answer"]
        start = encodings.char_to_token(i, df['start_position'][i] + 0)
        end = encodings.char_to_token(i, df['end_position'][i] + 0)

        if end is None:
            end = encodings.char_to_token(i, df['end_position'][i] + 1)
        
        answer = tokenizer.decode(encodings['input_ids'][i][start:end])
        
        if target_answer.strip() == answer.strip():
            pass
        else:
            counter = 2
            while end is None:
                end = encodings.char_to_token(i, df['end_position'][i] + counter)
                if counter == 50:
                    break
                    
                counter += 1

            counter = 1
            while end is None:
                end = encodings.char_to_token(i, df['end_position'][i] - counter)
                counter += 1

                
        start_positions.append(start)
        end_positions.append(end)
    
    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })

In [14]:
add_token_positions(train_encodings, train)
add_token_positions(val_encodings, val)

In [27]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __len__(self, ):
        return len(self.encodings.input_ids)
    
    def __getitem__(self, x):
        return {key: torch.tensor(val[x]) for key, val in self.encodings.items()}

In [31]:
vals = next(iter(SquadDataset(train_encodings)))

  return {key: torch.tensor(val[x]) for key, val in self.encodings.items()}


In [33]:
vals['input_ids']

tensor([    0, 10600,  2276,  3895, 26360,  3716,   427, 10723, 13927,   500,
        15039, 45497,  6380, 22389,  5745,    16, 41130, 36625,  6470, 12243,
          908,    77,  3440, 18364, 44254,  6175, 10541,    18,  1755, 18364,
        44254,    77, 51440,  8600,  5926, 22603,    16,   917, 22271, 24182,
        30920,  8180, 16814,    77, 30404,  3405,  2109,  7274, 30404,  8962,
          612, 29332,   418,  1260,  3637, 30710,    16,  8429, 19891,  8640,
          331,  2597,    18,     2,     2,  1569, 18364, 15633,   414,   613,
        23441, 39084, 11125,    35,     2,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

In [19]:
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=True)

In [21]:
model.train()
model.to(device)
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)



In [None]:
for epoch in range(3):
    loop = tqdm(train_loader)
    for batch in loop:
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, 
                        start_positions=start_positions,
                        end_positions=end_positions)
        
        loss = outputs[0]
        loss.backward()
        optim.step()
        
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[x]) for key, val in self.encodings.items()}
Epoch 0: 100%|████████████████████████████████████████████████████████████| 326/326 [00:29<00:00, 10.95it/s, loss=2.94]
Epoch 1: 100%|████████████████████████████████████████████████████████████| 326/326 [00:29<00:00, 10.88it/s, loss=1.99]
Epoch 2:  13%|████████▏                                                    | 44/326 [00:04<00:26, 10.57it/s, loss=1.32]

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Sample text for sentiment analysis
text = "I hate using BERT for sentiment analysis!"

# Tokenize input text and convert to tensor
tokens = tokenizer(text, return_tensors='pt')
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

# Make prediction
with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)

# Get probabilities for each sentiment class
probs = softmax(output.logits, dim=1).squeeze().tolist()

# Map probabilities to sentiment labels
sentiment_labels = ['Negative', 'Neutral', 'Positive']
predicted_sentiment = sentiment_labels[probs.index(max(probs))]

print(f"Predicted sentiment: {predicted_sentiment}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted sentiment: Negative
