In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load and parse training data
train_data = []
with open("train_data.txt", "r", encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split(" ::: ")
        if len(parts) == 4:
            ID, title, genre, description = parts
            train_data.append((ID, title, genre, description))

train_df = pd.DataFrame(train_data, columns=["ID", "Title", "Genre", "Description"])
print("Train shape:", train_df.shape)
train_df.head()


Train shape: (54214, 4)


Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [4]:
# Load and parse test data
test_data = []
with open("test_data.txt", "r", encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split(" ::: ")
        if len(parts) == 3:
            ID, title, description = parts
            test_data.append((ID, title, description))

test_df = pd.DataFrame(test_data, columns=["ID", "Title", "Description"])
print("Test shape:", test_df.shape)
test_df.head()


Test shape: (54200, 3)


Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

train_df['Clean_Description'] = train_df['Description'].apply(clean_text)
test_df['Clean_Description'] = test_df['Description'].apply(clean_text)


In [6]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(train_df['Clean_Description'])
X_test_final = vectorizer.transform(test_df['Clean_Description'])

y = train_df['Genre']


In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_val)

print("Naive Bayes Accuracy:", accuracy_score(y_val, nb_preds))
print(classification_report(y_val, nb_preds))


Naive Bayes Accuracy: 0.5214424052384027
              precision    recall  f1-score   support

      action       0.55      0.08      0.14       263
       adult       1.00      0.05      0.10       112
   adventure       0.50      0.07      0.13       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.44      0.47      1443
       crime       0.00      0.00      0.00       107
 documentary       0.58      0.88      0.70      2659
       drama       0.45      0.82      0.58      2697
      family       0.00      0.00      0.00       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.17      0.30        40
     history       0.00      0.00      0.00        45
      horror       0.73      0.36      0.48       431
       music       0.83      0.13      0.23       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0

In [9]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_val)

print("Logistic Regression Accuracy:", accuracy_score(y_val, lr_preds))
print(classification_report(y_val, lr_preds))


Logistic Regression Accuracy: 0.5755787143779397
              precision    recall  f1-score   support

      action       0.55      0.25      0.35       263
       adult       0.74      0.22      0.34       112
   adventure       0.44      0.14      0.22       139
   animation       0.67      0.10      0.17       104
   biography       0.00      0.00      0.00        61
      comedy       0.50      0.58      0.54      1443
       crime       0.33      0.02      0.04       107
 documentary       0.66      0.84      0.74      2659
       drama       0.54      0.77      0.64      2697
      family       0.38      0.07      0.12       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.95      0.45      0.61        40
     history       0.00      0.00      0.00        45
      horror       0.62      0.55      0.59       431
       music       0.65      0.49      0.56       144
     musical       0.50      0.02      0.04        50
     mystery       0.00      0.0

In [10]:
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_val)

print("SVM Accuracy:", accuracy_score(y_val, svm_preds))
print(classification_report(y_val, svm_preds))


SVM Accuracy: 0.5612837775523379
              precision    recall  f1-score   support

      action       0.38      0.30      0.34       263
       adult       0.63      0.38      0.48       112
   adventure       0.27      0.19      0.23       139
   animation       0.34      0.13      0.19       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.55      0.53      1443
       crime       0.16      0.05      0.07       107
 documentary       0.69      0.80      0.74      2659
       drama       0.56      0.70      0.62      2697
      family       0.24      0.15      0.18       150
     fantasy       0.16      0.04      0.06        74
   game-show       0.72      0.65      0.68        40
     history       0.00      0.00      0.00        45
      horror       0.59      0.62      0.61       431
       music       0.54      0.51      0.52       144
     musical       0.08      0.02      0.03        50
     mystery       0.11      0.04      0.05     

In [None]:
# You can use your best model here — let's use LogisticRegression
final_preds = lr_model.predict(X_test_final)

# Save results
test_df['Predicted_Genre'] = final_preds
test_df[['ID', 'Title', 'Predicted_Genre']].to_csv("genre_predictions.csv", index=False)
print(" Predictions saved to genre_predictions.csv")


✅ Predictions saved to genre_predictions.csv


In [12]:
# Load test solution (optional if you want to evaluate)
solution_data = []
with open("test_data_solution.txt", "r", encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split(" ::: ")
        if len(parts) == 4:
            ID, title, genre, description = parts
            solution_data.append((ID, title, genre, description))

solution_df = pd.DataFrame(solution_data, columns=["ID", "Title", "Genre", "Description"])

# Merge with prediction for evaluation
merged = pd.merge(test_df, solution_df[['ID', 'Genre']], on="ID", how="left", suffixes=('_pred', '_true'))

print("🔍 Final Test Accuracy:", accuracy_score(merged['Genre'], merged['Predicted_Genre']))
print(classification_report(merged['Genre'], merged['Predicted_Genre']))


🔍 Final Test Accuracy: 0.5767343173431735
              precision    recall  f1-score   support

      action       0.48      0.27      0.34      1314
       adult       0.62      0.20      0.30       590
   adventure       0.60      0.17      0.27       775
   animation       0.57      0.05      0.09       498
   biography       0.00      0.00      0.00       264
      comedy       0.52      0.58      0.55      7446
       crime       0.36      0.03      0.05       505
 documentary       0.66      0.85      0.74     13096
       drama       0.54      0.77      0.63     13612
      family       0.53      0.08      0.14       783
     fantasy       0.50      0.02      0.04       322
   game-show       0.93      0.48      0.63       193
     history       0.00      0.00      0.00       243
      horror       0.63      0.56      0.60      2204
       music       0.66      0.43      0.52       731
     musical       0.29      0.02      0.03       276
     mystery       0.40      0.01      