In [5]:
import pandas as pd
import numpy as np


In [6]:
def load_data(file_path):
    """Read a ``:::``-delimited text file into a list of field lists.

    Every non-blank line is split on the literal separator ``:::`` and each
    resulting field has its surrounding whitespace removed, so a line such as
    ``"1 ::: Some Title ::: drama ::: Plot text"`` becomes
    ``["1", "Some Title", "drama", "Plot text"]``.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        list[list[str]]: One inner list of stripped fields per non-blank line,
        in file order.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [
            # Strip each field individually: stripping only the whole line
            # would leave the padding around the ":::" separators in place.
            [field.strip() for field in line.split(":::")]
            for line in f
            # Blank lines would otherwise turn into a bogus one-field row.
            if line.strip()
        ]

In [8]:
# Column layouts of the raw files: the training and solution files carry a
# GENRE field, the plain test file does not.
LABELED_COLUMNS = ["ID", "TITLE", "GENRE", "DESCRIPTION"]
UNLABELED_COLUMNS = ["ID", "TITLE", "DESCRIPTION"]

# Training split (with genres).
train_data = load_data("train_data.txt")
train_df = pd.DataFrame(data=train_data, columns=LABELED_COLUMNS)

# Test split (no genres).
test_data = load_data("test_data.txt")
test_df = pd.DataFrame(data=test_data, columns=UNLABELED_COLUMNS)

# Test split with its genres, used later for evaluation.
test_sol = load_data("test_data_solution.txt")
test_sol_df = pd.DataFrame(data=test_sol, columns=LABELED_COLUMNS)

In [9]:
import torch
from transformers import BertTokenizer, BertModel

# Report which accelerator (if any) PyTorch can see before loading the model.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("Device:", "CPU")
print("CUDA version used by PyTorch:", torch.version.cuda)


CUDA available: True
Device: Tesla T4
CUDA version used by PyTorch: 12.4


In [10]:
# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = 'bert-base-uncased'

# Run on CUDA if available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Load the encoder, move it to the chosen device and switch it to eval mode;
# .to() and .eval() both return the module, so the calls chain.
bert_model = BertModel.from_pretrained(MODEL_NAME).to(device).eval()

def get_bert_batch_embeddings(texts, batch_size=16):
    """Embed texts with BERT and return one vector per text.

    Relies on the module-level ``tokenizer``, ``bert_model`` and ``device``.
    Texts are tokenized in batches (padded, truncated to 512 tokens), run
    through the model without gradient tracking, and the last hidden state at
    sequence position 0 (the CLS token) is kept for each text.

    Args:
        texts: A single string, or a collection of strings. Objects exposing
            ``tolist()`` (pandas Series, NumPy arrays) and plain iterables
            (lists, tuples) are both accepted.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        numpy.ndarray: One row per input text, in input order. For empty
        input the result has shape ``(0, bert_model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Normalize to a plain list once, so slicing is always positional and the
    # tokenizer receives a list regardless of the container passed in.
    if isinstance(texts, str):
        texts = [texts]
    elif hasattr(texts, "tolist"):
        texts = texts.tolist()
    else:
        texts = list(texts)

    # torch.cat() rejects an empty list, so short-circuit with an empty matrix
    # that still has the right number of columns.
    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Tokenize and move to device
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Extract CLS token embeddings and move to CPU so GPU memory is not
        # held for the whole dataset.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
        all_embeddings.append(cls_embeddings)

    return torch.cat(all_embeddings).numpy()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# NOTE: despite the `_tfidf` suffix, these arrays hold the BERT embeddings
# returned by get_bert_batch_embeddings; the names are kept because later
# cells refer to them.
X_train_tfidf, X_test_tfidf = (
    get_bert_batch_embeddings(frame['DESCRIPTION'], batch_size=16)
    for frame in (train_df, test_df)
)

######

In [11]:
 from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["GENRE"])
y_test = label_encoder.transform(test_sol_df["GENRE"])

In [12]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training embeddings; fit()
# returns the estimator itself, so construction and training can be chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
lr_model

In [13]:
# Predict encoded class ids for the test feature matrix.
y_pred = lr_model.predict(X_test_tfidf)
# Decode the ids back to the genre strings the encoder was fitted on.
predicted_genres = label_encoder.inverse_transform(y_pred)
# Stored positionally on test_df (mutated in place): row i of the feature
# matrix was built from row i of test_df["DESCRIPTION"].
test_df["Predicted_Genre"] = predicted_genres

In [14]:
# Line up true and predicted genres per movie by joining on the shared ID.
merge_df = test_sol_df[["ID", "GENRE"]].merge(
    test_df[["ID", "Predicted_Genre"]],
    on="ID",
)

In [15]:
from sklearn.metrics import accuracy_score , classification_report
# Ground-truth and predicted genre columns of the joined frame.
true_genres = merge_df["GENRE"]
pred_genres = merge_df["Predicted_Genre"]

accuracy = accuracy_score(true_genres, pred_genres)
print(f"Accuracy: {accuracy:.3f}")
print("Classification Report:\n", classification_report(true_genres, pred_genres))

Accuracy: 0.566
Classification Report:
                precision    recall  f1-score   support

      action        0.36      0.37      0.36       237
       adult        0.38      0.37      0.38       102
   adventure        0.23      0.18      0.20       119
   animation        0.35      0.20      0.26       108
   biography        0.00      0.00      0.00        49
      comedy        0.53      0.55      0.54      1288
       crime        0.18      0.10      0.13        72
 documentary        0.73      0.81      0.77      2331
       drama        0.59      0.65      0.62      2473
      family        0.30      0.23      0.26       148
     fantasy        0.12      0.06      0.08        51
   game-show        0.76      0.66      0.70        29
     history        0.05      0.02      0.03        42
      horror        0.59      0.61      0.60       390
       music        0.63      0.56      0.60       133
     musical        0.17      0.07      0.10        41
     mystery        0.06

In [16]:
# Preview the first five rows of true vs. predicted genres.
merge_df.head(n=5)

Unnamed: 0,ID,GENRE,Predicted_Genre
0,1,thriller,short
1,2,comedy,drama
2,3,documentary,documentary
3,4,drama,drama
4,5,drama,drama
