In [2]:
# Install and download required packages
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')  # Optional, depending on your need


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# File handling and preprocessing libraries
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import os

In [4]:
# Upload .txt file(s)
uploaded = files.upload()
file_path = list(uploaded.keys())[0]  # Grab the first uploaded file name


Saving train_data.txt to train_data (1).txt
Saving test_data.txt to test_data.txt
Saving test_data_solution.txt to test_data_solution.txt
Saving description.txt to description.txt


In [5]:
def load_and_clean_data(file_path):
    # Read .txt file with ::: separator (like a CSV but with custom delimiter)
    data = pd.read_csv(file_path, sep=':::', engine='python', names=['id', 'title', 'genre', 'plot_summary'])

    stop_words = set(stopwords.words('english'))

    def clean_text(text):
        text = re.sub(r'[^a-zA-Z\s]', '', str(text))
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)

    print("Cleaning plot summaries...")
    data['cleaned_plot'] = [clean_text(text) for text in tqdm(data['plot_summary'], desc="Progress")]
    return data['cleaned_plot'], data['genre']


In [6]:
def compute_class_weights(labels):
    unique_labels, counts = np.unique(labels, return_counts=True)
    total_samples = len(labels)
    class_weights = {label: total_samples / (len(unique_labels) * count) for label, count in zip(unique_labels, counts)}
    return class_weights


In [7]:
def train_optimized_model(texts, labels):
    print("Transforming text to TF-IDF features...")
    tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    X = tfidf.fit_transform(tqdm(texts, desc="TF-IDF progress"))

    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    class_weights = compute_class_weights(y_train)

    base_model = LogisticRegression(max_iter=1000, class_weight=class_weights)

    param_grid = {
        'C': [0.1, 1.0, 10.0],
        'solver': ['lbfgs', 'liblinear']
    }

    print("Starting Grid Search...")
    grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")

    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return best_model, tfidf


In [8]:
def predict_genre(model, tfidf, new_plot):
    stop_words = set(stopwords.words('english'))
    cleaned_plot = re.sub(r'[^a-zA-Z\s]', '', new_plot.lower())
    tokens = word_tokenize(cleaned_plot)
    tokens = [word for word in tokens if word not in stop_words]
    plot_tfidf = tfidf.transform([' '.join(tokens)])
    prediction = model.predict(plot_tfidf)
    return prediction[0]


In [13]:
print("Loading and preprocessing data...")
texts, labels = load_and_clean_data(file_path)

print("\nGenre distribution:")
print(labels.value_counts())

print("\nTransforming text into TF-IDF features...")
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 1))
X = tfidf.fit_transform(tqdm(texts, desc="TF-IDF progress"))

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

print("\nTraining Logistic Regression model (no grid search)...")
model = LogisticRegression(max_iter=500, solver='liblinear', class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

print("\nPredicting genre for a new plot...")
new_plot = "Superhero murder mistry"
predicted_genre = predict_genre(model, tfidf, new_plot)
print(f"Predicted Genre: {predicted_genre}")



Loading and preprocessing data...
Cleaning plot summaries...


Progress: 100%|██████████| 54214/54214 [00:22<00:00, 2447.19it/s]



Genre distribution:
genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
adventure         775
music             731
romance           672
sci-fi            647
adult             590
crime             505
animation         498
sport             432
talk-show         391
fantasy           323
mystery           319
musical           277
biography         265
history           243
game-show         194
news              181
war               132
Name: count, dtype: int64

Transforming text into TF-IDF features...


TF-IDF progress: 100%|██████████| 54214/54214 [00:02<00:00, 24155.78it/s]



Training Logistic Regression model (no grid search)...
Accuracy: 0.4987549571151895
Classification Report:
                precision    recall  f1-score   support

      action        0.33      0.41      0.37       263
       adult        0.32      0.64      0.42       112
   adventure        0.14      0.23      0.17       139
   animation        0.17      0.26      0.20       104
   biography        0.04      0.08      0.05        61
      comedy        0.60      0.44      0.51      1443
       crime        0.15      0.36      0.21       107
 documentary        0.77      0.68      0.72      2659
       drama        0.68      0.47      0.56      2697
      family        0.15      0.27      0.19       150
     fantasy        0.09      0.18      0.12        74
   game-show        0.62      0.72      0.67        40
     history        0.06      0.18      0.09        45
      horror        0.52      0.67      0.59       431
       music        0.43      0.72      0.54       144
     music