In [None]:
import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, Trainer, TrainingArguments


In [None]:

# Load the dataset
df = pd.read_csv('output.csv')

# Rows with a missing paragraph or label cannot be tokenized or encoded, so they
# are filtered out here rather than failing later inside the tokenizer/encoder.
df_clean = df.dropna(subset=['paragraphs', 'label'])

texts = df_clean['paragraphs'].tolist()
labels = df_clean['label'].tolist()
print("1")


# Encode the labels as consecutive integer ids (label_encoder.classes_ keeps the
# id -> original label mapping for decoding predictions later).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the dataset into training and testing sets (80/20, fixed seed)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, encoded_labels, test_size=0.2, random_state=42
)
print("2")

# Tokenize and encode texts for BERT.
# BertTokenizer is paired with a BERT checkpoint here; loading it from a
# DistilBERT checkpoint triggers a tokenizer-class mismatch warning. Switching
# to DistilBERT would require the DistilBert*/Auto* classes instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)
print("3")


class KeywordDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by example
            position (one entry per example).
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, with its label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
print("3")

# Wrap each tokenized split together with its encoded labels so the Trainer
# can consume them as torch datasets.
train_dataset = KeywordDataset(encodings=train_encodings, labels=train_labels)
test_dataset = KeywordDataset(encodings=test_encodings, labels=test_labels)



In [None]:

# BertForSequenceClassification must be initialised from a BERT checkpoint.
# 'bert-base-uncased' is a published hub id; a bare 'distilbert' is not, and
# DistilBERT weights would not map onto the BERT architecture anyway.
# One output logit is created per distinct label seen by the encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=20,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
print("4")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
print("5")

# Train the model
trainer.train()
print("6")


In [None]:
# Run the fine-tuned model on the held-out split and take the arg-max class id.
predictions = trainer.predict(test_dataset)
yhat_classes = predictions.predictions.argmax(axis=1)

# Decode BOTH the ground truth and the predictions back to the original label
# values so the metrics compare like with like (decoded labels vs. raw integer
# class ids would never match).
testy_inverse = label_encoder.inverse_transform(test_labels)
yhat_inverse = label_encoder.inverse_transform(yhat_classes)

# Calculate precision and recall.
# average='weighted' supports any number of classes (the default 'binary'
# raises for multiclass targets); zero_division=0 scores classes that were
# never predicted as 0 instead of emitting a warning.
precision = precision_score(testy_inverse, yhat_inverse, average='weighted', zero_division=0)
recall = recall_score(testy_inverse, yhat_inverse, average='weighted', zero_division=0)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


In [None]:

# Persist the fine-tuned model and its tokenizer so they can be reloaded
# without retraining.
# NOTE(review): these are hard-coded absolute local paths; consider deriving
# them from a configurable output directory so the notebook runs elsewhere.
model_path = r'D:\stuff\college\ai assignment\bert_model'
tokenizer_path = r'D:\stuff\college\ai assignment\bert_model_token'
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

In [None]:
# Reload the saved artefacts to check that the persisted model is usable.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model.eval()  # inference only: make sure dropout is disabled

# Sample transcript used as a smoke test. Written as adjacent string literals
# inside parentheses so the long text stays a single valid Python string.
example_text = (
    "And Ive decided that I will not let them be floats They have to be integers hence the assert over here So inside the init Ive decided Im going to represent my fracture with two numbers one for the numerator and one for the denominator So when I create a fraction object Im going to pass in a numerator and a denominator And a particular instance is going to have self dot numerator and self dot denominator as its data attributes and Im assigning those to be whatevers passed into my init Since I plan on debugging this code maybe possibly sometime in the future Im also including an str method and the str method is going to print a nice looking string thats going to represent the numerator and then a slash and then the denominator And then Ive also implemented some other special methods How do I add two fractions How do I subtract two fractions And how do I convert a fraction to a float The add and subtract are almost the same so lets look at the add for the moment How do we add two fractions Were going to take self which is the instance of an object that I want to do the add operation on and were going to take other which is the other instance of an object that I want to do the operation on so the addition and Im going to figure out the new top So the new top of the resulting fraction So its my numerator multiplied by the other denominator plus my denominator multiplied by the other numerator and then divided by the multiplication of the two denominators So the top is going to be that the bottom is going to be that Notice that were using self dot right Once again were trying to access the data attributes of each different instance right of myself and the other object that Im working with So thats why I have to use self dot here Once I figure out the top and the bottom of the addition Im going to return and here notice Im returning a fraction object Its not a number its not a float its not an integer Its a new object that is of the exact same type as the "
    "class that Im implementing So as its the same type of object then on the return value I can do all of the exact same operations that I can do on a regular fraction object Sub is going to be the same Im returning a fraction object Float is just going to do the division for me so its going to take the numerator and then divide it by the denominator just divide the numbers And then Im defining here my own method called inverse And this is just going to take the inverse of the instance Im calling this method on And so its going to also return a new fraction object that just has the denominator as the top part and the numerator as the bottom part So then we have some code here So thats how I implement my fraction object So now lets use it and see what it gives us A is equal to a fraction 1 4 This is going to be 1 over 4 for a And b is going to be 3 over four When I do C notice Im using the plus operator between two fraction objects right A and b are fraction objects so Pythons going to say OK is there an underscore underscore add underscore underscore method implemented It is and its just going to do whatevers inside here So its going to say self dot numerator plus other dot denominator Its going to calculate the top and the bottom"
)

# Call the tokenizer directly (rather than tokenizer.encode) so the attention
# mask is returned alongside the ids; without it the model would attend to the
# padding tokens added by padding='max_length'.
example_inputs = tokenizer(
    example_text,
    add_special_tokens=True,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
example_ids = example_inputs['input_ids']  # kept for any later cell that reads it

with torch.no_grad():
    logits = model(**example_inputs).logits
    probabilities = torch.softmax(logits, dim=1)
    # Class ids sorted by descending probability; the slice keeps at most five.
    predicted_class_indices = torch.argsort(probabilities, descending=True)[:, :5]

# Translate class ids back to the original labels learned by the encoder.
top_keywords = [label_encoder.classes_[idx.item()] for idx in predicted_class_indices[0]]

print("Top 5 Predicted Keywords:")
for keyword in top_keywords:
    print(keyword)