Data Preprocessing

In [3]:
import re
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from google.colab import drive
nltk.download('all')

# Mount Google Drive
drive.mount('/content/drive')

# File paths
train_file = "/content/drive/MyDrive/Genre Classification Dataset/train_data.txt"
test_file = "/content/drive/MyDrive/Genre Classification Dataset/test_data.txt"
solution_file = "/content/drive/MyDrive/Genre Classification Dataset/test_data_solution.txt"

# Download necessary NLP resources
nltk.download("punkt")
nltk.download("stopwords")

# Function to read and process the dataset
def load_and_process(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(" ::: ")
            if len(parts) == 4:
                movie_id, title, genre, description = parts
                data.append((movie_id, title, genre, description))
    return pd.DataFrame(data, columns=["ID", "Title", "Genre", "Description"])

# Load datasets
train_df = load_and_process(train_file)
test_df = load_and_process(test_file)
solution_df = load_and_process(solution_file)

# Text Preprocessing Function
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r"\W", " ", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words("english"))  # Stopword removal
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

# Apply text cleaning to descriptions
train_df["Cleaned_Description"] = train_df["Description"].apply(clean_text)
test_df["Cleaned_Description"] = test_df["Description"].apply(clean_text)

# Display first few rows
print(train_df.head())


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  ID                             Title     Genre  \
0  1      Oscar et la dame rose (2009)     drama   
1  2                      Cupid (1997)  thriller   
2  3  Young, Wild and Wonderful (1980)     adult   
3  4             The Secret Sin (1915)     drama   
4  5            The Unrecovered (2007)     drama   

                                         Description  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                 Cleaned_Description  
0  listening conversation doctor parents 10 year ...  
1  brother sister past incestuous relationship cu...  
2  bus empties students field trip museum natural...  
3  help unemployed father make ends meet edith tw...  
4  film title refers un recovered bodies ground z...  


Feature Engineering

In [4]:
!pip install scikit-learn




In [7]:
# Install necessary libraries (if not installed)
!pip install pandas scikit-learn numpy nltk

# Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# File Paths
train_path = '/content/drive/MyDrive/Genre Classification Dataset/train_data.txt'
test_path = '/content/drive/MyDrive/Genre Classification Dataset/test_data.txt'

# Load Data Function
def load_data(file_path, is_test=False):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(" ::: ")
            if len(parts) == 4:
                data.append({"ID": int(parts[0]), "Title": parts[1], "Genre": parts[2], "Description": parts[3]})
            elif len(parts) == 3 and is_test:
                # Test data has no genre
                data.append({"ID": int(parts[0]), "Title": parts[1], "Description": parts[2]})
    return pd.DataFrame(data)

# Load Train and Test Data
train_df = load_data(train_path)
test_df = load_data(test_path, is_test=True)

print("Train Data Loaded. Shape:", train_df.shape)
print("Test Data Loaded. Shape:", test_df.shape)

# Verify Column Names
print("Train Data Columns:", train_df.columns)
print("Test Data Columns:", test_df.columns)

# Handle any column name issues (strip spaces)
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

# Handle Missing Data
train_df['Description'].fillna("", inplace=True)
test_df['Description'].fillna("", inplace=True)

print("Missing values handled successfully.")

# Text Cleaning Function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize and remove stopwords
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean Descriptions
train_df['Cleaned_Description'] = train_df['Description'].apply(clean_text)
test_df['Cleaned_Description'] = test_df['Description'].apply(clean_text)

print("Text cleaning completed.")

# Encode Labels
label_encoder = LabelEncoder()
train_df['Genre_Encoded'] = label_encoder.fit_transform(train_df['Genre'])
print("Labels encoded:", label_encoder.classes_)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['Cleaned_Description'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['Cleaned_Description'])

print("TF-IDF Vectorization Completed. Shape:", X_train_tfidf.shape, X_test_tfidf.shape)

# Count Vectorizer
count_vectorizer = CountVectorizer(max_features=5000)
X_train_count = count_vectorizer.fit_transform(train_df['Cleaned_Description'])
X_test_count = count_vectorizer.transform(test_df['Cleaned_Description'])

print("Count Vectorization Completed. Shape:", X_train_count.shape, X_test_count.shape)

# Final Data
y_train = train_df['Genre_Encoded']

print("Data Preparation Completed.")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Train Data Loaded. Shape: (54214, 4)
Test Data Loaded. Shape: (54200, 3)
Train Data Columns: Index(['ID', 'Title', 'Genre', 'Description'], dtype='object')
Test Data Columns: Index(['ID', 'Title', 'Description'], dtype='object')
Missing values handled successfully.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Description'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Description'].fillna("", inplace=True)


Text cleaning completed.
Labels encoded: ['action' 'adult' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'game-show' 'history' 'horror'
 'music' 'musical' 'mystery' 'news' 'reality-tv' 'romance' 'sci-fi'
 'short' 'sport' 'talk-show' 'thriller' 'war' 'western']
TF-IDF Vectorization Completed. Shape: (54214, 5000) (54200, 5000)
Count Vectorization Completed. Shape: (54214, 5000) (54200, 5000)
Data Preparation Completed.


Preprocessing with Transformers

In [8]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Load Data
def load_data(file_path):
    try:
        df = pd.read_csv(file_path, sep=" ::: ", engine='python', header=None, names=["ID", "Title", "Genre", "Description"])
        print(f"✅ Data Loaded Successfully from {file_path}")
        print("Missing Values:\n", df.isnull().sum())
        return df
    except Exception as e:
        print(f"❗ Error Loading Data: {e}")
        return None

train_df = load_data('/content/drive/MyDrive/Genre Classification Dataset/train_data.txt')
test_df = load_data('/content/drive/MyDrive/Genre Classification Dataset/test_data.txt')
solution_df = load_data('/content/drive/MyDrive/Genre Classification Dataset/test_data_solution.txt')

# Downscale Dataset (Optional)
train_df = train_df.sample(n=2000, random_state=42)  # Use only 2000 samples for training
solution_df = solution_df.sample(n=500, random_state=42)  # 500 for evaluation
print(f"✅ Train Data Reduced to {len(train_df)} samples, Test Data Reduced to {len(solution_df)} samples")

# Handle Missing Data
def handle_missing_data(df):
    if df is not None and 'Description' in df.columns:
        df['Description'].fillna('', inplace=True)
        print("✅ Missing Descriptions Handled")
    return df

train_df = handle_missing_data(train_df)
solution_df = handle_missing_data(solution_df)

# Encode Genre Labels
genre_labels = {genre: idx for idx, genre in enumerate(train_df['Genre'].unique())}
train_df['Label'] = train_df['Genre'].map(genre_labels)
solution_df['Label'] = solution_df['Genre'].map(genre_labels)

print("✅ Genre Labels Encoded:", genre_labels)

# Tokenizer Initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset Class
class MovieDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        inputs = self.tokenizer(row['Description'], truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        label = torch.tensor(row['Label'])
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': label
        }

train_dataset = MovieDataset(train_df, tokenizer)
test_dataset = MovieDataset(solution_df, tokenizer)

# Model Initialization
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(genre_labels)).to(device)

# Training Configuration (Reduced EPOCHS and Batch Size)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,  # Reduced Epochs
    per_device_train_batch_size=8,  # Adjust based on memory
    per_device_eval_batch_size=8,
    warmup_steps=200,  # Reduced Warmup
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train Model
print("🚀 Training Started (2 Epochs)...")
trainer.train()

# Evaluate Model
print("📊 Evaluating the Model...")
results = trainer.evaluate()

print("✅ Evaluation Results:", results)

# Predict Test Data
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Map Predicted Labels to Genres
predicted_genres = [list(genre_labels.keys())[label] for label in predicted_labels]

# Output Results
solution_df['Predicted_Genre'] = predicted_genres
solution_df.to_csv('/content/drive/MyDrive/Genre Classification Dataset/predicted_results.csv', index=False)
print("✅ Predictions Saved to predicted_results.csv")


✅ Data Loaded Successfully from /content/drive/MyDrive/Genre Classification Dataset/train_data.txt
Missing Values:
 ID             0
Title          0
Genre          0
Description    0
dtype: int64
✅ Data Loaded Successfully from /content/drive/MyDrive/Genre Classification Dataset/test_data.txt
Missing Values:
 ID                 0
Title              0
Genre              0
Description    54200
dtype: int64
✅ Data Loaded Successfully from /content/drive/MyDrive/Genre Classification Dataset/test_data_solution.txt
Missing Values:
 ID             0
Title          0
Genre          0
Description    0
dtype: int64
✅ Train Data Reduced to 2000 samples, Test Data Reduced to 500 samples
✅ Missing Descriptions Handled
✅ Missing Descriptions Handled
✅ Genre Labels Encoded: {'comedy': 0, 'short': 1, 'horror': 2, 'music': 3, 'documentary': 4, 'thriller': 5, 'sci-fi': 6, 'crime': 7, 'biography': 8, 'drama': 9, 'family': 10, 'history': 11, 'animation': 12, 'romance': 13, 'action': 14, 'western': 15, 'a

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Description'].fillna('', inplace=True)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Training Started (2 Epochs)...


Epoch,Training Loss,Validation Loss
