# BERT: TEXT CLASSIFICATION
## Part 1 - Data 

In [3]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

# Read the semicolon-separated plot table. The file is decoded as UTF-8 and
# bytes that cannot be decoded are dropped (errors="ignore") instead of raising.
with open(
    "data/movie_plots_tc.csv",
    encoding="utf-8",
    errors="ignore",
) as csv_file:
    df = pd.read_csv(csv_file, sep=";")

# Model input (plot text) and target (genre name) columns.
plots, labels = df["Plot"], df["Genre"]

In [4]:

# Genre name -> class index. The reverse table is derived from it so the two
# mappings always agree.
str2id = {"western": 0, "drama": 1, "comedy": 2}
id2str = {class_id: genre for genre, class_id in str2id.items()}

# Missing plots are replaced by the placeholder string "CVxTz".
list_plots = plots.fillna("CVxTz").values
# Raises KeyError for any genre that is not a key of str2id.
indexed_labels = np.array([str2id[genre] for genre in labels])

In [5]:

# Hold out 25% of the examples for validation; the fixed random_state keeps
# the split reproducible across runs.
train_features, val_features, train_labels, val_labels = train_test_split(
    list_plots,
    indexed_labels,
    random_state=2000,
    test_size=0.25,
)

## Model

In [6]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [7]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"
# Upper bound on the number of tokens kept per text (passed as max_length).
max_lenght = 256
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [8]:
# Both splits are tokenized with the same settings: truncation and padding
# enabled, at most max_lenght tokens per plot.
_tokenizer_options = {
    "truncation": True,
    "padding": True,
    "max_length": max_lenght,
}
train_encodings = tokenizer(train_features.tolist(), **_tokenizer_options)
val_encodings = tokenizer(val_features.tolist(), **_tokenizer_options)

In [25]:
class OurTorchDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to a sequence that holds one
            entry per example (anything exposing ``.items()``).
        labels: Sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features and the label of example ``idx`` as tensors.

        The result contains one tensor per key of ``encodings`` plus a
        ``"labels"`` entry holding a scalar ``torch.long`` tensor.
        """
        item = {
            name: torch.tensor(values[idx])
            for name, values in self.encodings.items()
        }
        # Keep the label a 0-d tensor: when examples are stacked into a batch
        # the labels then have shape (batch_size,), which is the shape
        # documented for sequence-classification labels, rather than
        # (batch_size, 1).
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

In [26]:
# Wrap each split's encodings together with its labels.
train_dataset, val_dataset = (
    OurTorchDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [27]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute classification accuracy for a set of model predictions.

    Args:
        pred: Object exposing ``label_ids`` (the reference class ids) and
            ``predictions`` (per-class scores; the class is chosen with an
            argmax over the last axis).

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    reference_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(reference_ids, predicted_ids)}

## Fine-tuning for Text Classification

In [None]:
# Load the pretrained checkpoint with a 3-class classification head (one
# output per genre in str2id). Use the GPU when one is available and fall
# back to the CPU otherwise, instead of failing on a hard-coded "cuda" target.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
# Fine-tuning hyper-parameters; every option not listed here keeps its
# TrainingArguments default.
# NOTE(review): "/Model" is an absolute path at the filesystem root — confirm
# that this is intended rather than a relative "./Model".
training_args = TrainingArguments(
    seed=1895,
    num_train_epochs=3,
    warmup_steps=100,
    weight_decay=0.01,
    output_dir="/Model",
)

In [None]:
# Hand the model, the training configuration, both datasets and the metric
# callback to the Trainer, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

In [31]:
# Evaluate on the eval_dataset given to the Trainer (val_dataset). The call is
# left as a bare expression so the returned metrics dict is displayed as the
# cell output.
trainer.evaluate()

100%|██████████| 351/351 [00:08<00:00, 39.32it/s]


{'eval_loss': 0.9536789655685425,
 'eval_accuracy': 0.7987152034261242,
 'eval_runtime': 8.9502,
 'eval_samples_per_second': 313.065,
 'eval_steps_per_second': 39.217,
 'epoch': 3.0}

## Testing

In [34]:
def get_prediction(text):
    """Predict the genre of one plot, or of every plot in a list.

    Args:
        text: A single plot as ``str``, or a list of plot strings.

    Returns:
        The genre name (a value of ``id2str``) for a ``str`` input, or a list
        with one genre name per plot for a list input.

    Note:
        The global ``model`` is switched to evaluation mode as a side effect.
    """
    # Evaluation mode disables dropout, so the same text always yields the
    # same prediction regardless of what ran before this call.
    model.eval()
    # Tokenize with the same limits used for training and move the tensors to
    # the device the model actually lives on instead of assuming CUDA.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_lenght,
        return_tensors="pt",
    ).to(model.device)
    # Inference only: do not record operations for autograd.
    with torch.no_grad():
        outputs = model(**inputs)
    # The first output holds the per-class scores; softmax over the class
    # axis turns them into probabilities.
    probs = outputs[0].softmax(1)
    # Take the argmax per row. A global argmax would flatten a batch of
    # several texts into one index that may not be a valid class id.
    predicted_ids = probs.argmax(dim=1).tolist()
    if isinstance(text, str):
        return id2str[predicted_ids[0]]
    return [id2str[class_id] for class_id in predicted_ids]

In [35]:
# Example: classify a single plot summary passed as one string. The call is a
# bare expression so the predicted genre is displayed as the cell output.
get_prediction("The duo decide to search for the gold together, but they are apprehended by Union forces shortly after leaving the mission - Tuco yells out Confederate-supportive statements at a group of Union soldiers, as they are covered in dust, obscuring the blue color of their uniforms. The two are brought to a prison camp which Angel infiltrated as a Union sergeant in his search for Bill Carson, getting his attention when Tuco poses as Bill Carson. Tuco reveals the name of the cemetery under torture and is sent away to be killed. Knowing that Blondie would not reveal the location, Angel Eyes recruits him into his search. Tuco escapes his fate by killing Angel Eyes' henchman, and soon finds himself in an evacuated town, where Blondie, Angel Eyes, and his gang have also arrived. ")

'drama'

In [36]:
# Example: keep a film description in a variable, then classify it. The final
# bare expression displays the predicted genre as the cell output.
film ="In the heart of the American frontier, 'The Unforgiving Plains' unfolds as a gripping tale of revenge and redemption. Set against the vast, unforgiving landscape of the Wild West in the late 1800s, the story follows the journey of Jacob Calloway, a stoic former gunslinger who has turned his back on violence to live a peaceful life on his homestead. However, when a ruthless gang terrorizes his town and threatens his family, Jacob is compelled to pick up his guns once more and confront the demons of his past. Directed with a keen eye for tension and dramatic landscapes, the film is a masterclass in storytelling, weaving together themes of loyalty, justice, and the unbreakable human spirit. With stunning cinematography that captures the brutal beauty of the frontier and performances that bring depth and nuance to the archetypal characters, 'The Unforgiving Plains' is a modern Western classic that pays homage to the genre while charting its own unique path."
get_prediction(film)

'western'