## Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


## Importing the Dataset

In [None]:
import pandas as pd

# Each record in the file is one line of the form
#   ID ::: Title ::: Genre ::: Plot
file_path = "train_data.txt"
with open(file_path, "r", encoding="utf-8") as f:
    data = f.readlines()

# Split on the first three separators only, so a plot that itself contains
# " ::: " stays in a single field, and skip blank lines (e.g. a trailing
# newline at end of file).  Either case would otherwise produce a row that
# does not have exactly four fields.
parsed_data = [
    line.strip().split(" ::: ", 3)
    for line in data
    if line.strip()
]

## Converting to DataFrame

In [None]:
# Build the frame from the parsed rows, then discard the identifier column
# in the same expression.
column_names = ["ID", "Title", "Genre", "Plot"]
df = pd.DataFrame(parsed_data, columns=column_names).drop(columns=["ID"])

df.head()

Unnamed: 0,Title,Genre,Plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [None]:
# Keep only the first label when a row lists several comma-separated genres.
# The vectorised .str accessor replaces a per-row Python lambda and passes
# missing values through instead of raising AttributeError on them.
df['Genre'] = df['Genre'].str.split(',').str[0]


## Text cleaning/Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Make sure the stop-word corpus is available locally before it is read.
nltk.download('stopwords')

# Shared by preprocess_text: a Porter stemmer and the English stop-word list,
# held in a set so each token's membership test is a hash lookup.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a raw plot string into a space-separated string of stems.

    Steps, in order: lower-case the text, replace every non-word character
    with a space, collapse whitespace runs, drop tokens found in the
    module-level ``stop_words`` set, and stem the remaining tokens with the
    module-level ``stemmer``.

    Returns the surviving stems joined by single spaces.
    """
    lowered = text.lower()
    without_punct = re.sub(r'\W', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', without_punct)

    stems = []
    for token in collapsed.split():
        # Stop words are matched on the surface form, i.e. before stemming.
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)


# Clean every plot once and keep the result alongside the raw text.
df['Cleaned_Plot'] = df['Plot'].map(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Converting Text to Features (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vocabulary and IDF weights on the training rows only, so that no
# statistics from the held-out plots leak into the features.
#
# The row selection below deliberately uses the same test_size and
# random_state as the train_test_split call in the "Train/Test Splitting"
# section.  With identical arguments and the same number of samples that call
# picks the same rows, so the two must be kept in sync.
train_rows, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=500)
vectorizer.fit(df['Cleaned_Plot'].iloc[train_rows])

# Transform every row with the training-fitted vectorizer; the split into
# train and test matrices happens in the next section.
X = vectorizer.transform(df['Cleaned_Plot']).toarray()
y = df['Genre']


## Train/Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Baseline: multinomial Naive Bayes on the TF-IDF features.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
# Several genres are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0.0 without emitting a warning per metric.
print(classification_report(y_test, y_pred_nb, zero_division=0))


Naïve Bayes Accuracy: 0.4657382643179932
              precision    recall  f1-score   support

      action       0.50      0.00      0.01       263
       adult       0.00      0.00      0.00       112
   adventure       0.00      0.00      0.00       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.53      0.25      0.34      1443
       crime       0.00      0.00      0.00       107
 documentary       0.53      0.86      0.65      2659
       drama       0.40      0.83      0.54      2697
      family       0.00      0.00      0.00       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.12      0.22        40
     history       0.00      0.00      0.00        45
      horror       0.58      0.09      0.15       431
       music       0.62      0.15      0.24       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver; the seed pins the solver's
# internal data shuffling so the reported scores are repeatable.
# NOTE(review): recent scikit-learn releases are phasing out liblinear for
# multiclass targets — confirm against the installed version.
log_reg = LogisticRegression(solver='liblinear', max_iter=200, random_state=42)
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
# Genres with no predicted samples have undefined precision; report them as
# 0.0 instead of emitting a warning for each metric.
print(classification_report(y_test, y_pred_lr, zero_division=0))


Logistic Regression Accuracy: 0.5253158719911464
              precision    recall  f1-score   support

      action       0.35      0.13      0.18       263
       adult       0.67      0.14      0.24       112
   adventure       0.23      0.06      0.09       139
   animation       0.35      0.08      0.13       104
   biography       0.00      0.00      0.00        61
      comedy       0.45      0.47      0.46      1443
       crime       0.29      0.04      0.07       107
 documentary       0.62      0.83      0.71      2659
       drama       0.49      0.74      0.59      2697
      family       0.39      0.05      0.08       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.88      0.35      0.50        40
     history       0.00      0.00      0.00        45
      horror       0.48      0.42      0.44       431
       music       0.60      0.43      0.50       144
     musical       1.00      0.02      0.04        50
     mystery       0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## SVM

In [None]:
# NOTE: this kernel-SVM experiment is commented out and never runs.  The
# SGDClassifier with hinge loss in the next section is the model that
# predict_genre later labels as "SVM".

# from sklearn.svm import SVC

# svm_model = SVC(kernel='rbf',max_iter = 500)
# svm_model.fit(X_train, y_train)
# y_pred_svm = svm_model.predict(X_test)

# print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
# print(classification_report(y_test, y_pred_svm))


## Stochastic Gradient Descent (SGD) Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with hinge loss via stochastic gradient descent.
# SGD shuffles the training data between epochs, so without a fixed seed the
# accuracy printed below changes from run to run.
sgd_svm = SGDClassifier(loss='hinge', random_state=42)
sgd_svm.fit(X_train, y_train)
y_pred_sgd = sgd_svm.predict(X_test)

print("SGD Accuracy:", accuracy_score(y_test, y_pred_sgd))


SGD Accuracy: 0.5030895508623074


## Predict Movie Genre for New Plot

In [None]:
def predict_genre(plot):
    """Print the genre each fitted model predicts for one plot summary.

    The plot is cleaned with ``preprocess_text`` and vectorised with the
    already-fitted module-level ``vectorizer`` before being scored by
    ``nb_model``, ``log_reg`` and ``sgd_svm``.  Nothing is returned; the
    three predictions are written to stdout.
    """
    features = vectorizer.transform([preprocess_text(plot)]).toarray()

    # Score with every model before printing, so no partial output appears
    # if one of the predictions fails.
    genre_nb, genre_lr, genre_svm = (
        model.predict(features)[0]
        for model in (nb_model, log_reg, sgd_svm)
    )

    print(f"Predicted Genre (Naïve Bayes): {genre_nb}")
    print(f"Predicted Genre (Logistic Regression): {genre_lr}")
    print(f"Predicted Genre (SVM): {genre_svm}")


predict_genre("A detective investigates a mysterious murder case in a small town.")


Predicted Genre (Naïve Bayes): horror
Predicted Genre (Logistic Regression): mystery
Predicted Genre (SVM): mystery


# Improving Model Performance

## Improve TF-IDF Features

In [None]:
# Candidate feature extractor: larger vocabulary plus bigrams.
# It is bound to its own name so the fitted `vectorizer` that predict_genre
# relies on is not replaced by an unfitted instance.  To take effect it still
# has to be fitted on the training plots and the models retrained on its
# output.
vectorizer_ngram = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))


In [None]:
# Candidate feature extractor: no vocabulary cap and no built-in stop-word
# filtering.  It is bound to its own name so the fitted `vectorizer` used by
# predict_genre is not overwritten with an unfitted instance.
vectorizer_full_vocab = TfidfVectorizer(stop_words=None)


## Balance the Dataset (If Imbalanced Genres Exist)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority genres in the training split only.  The seed makes
# the synthetic samples repeatable between runs.
# NOTE(review): none of the cells shown fit a model on the resampled arrays,
# so this has no effect on the reported scores until a model is trained on
# X_train_resampled / y_train_resampled.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


## Hyperparameter Tuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# Search the Naive Bayes `alpha` setting with 5-fold cross-validation on the
# training split.
alpha_candidates = [0.1, 0.5, 1.0, 5.0]
param_grid = {'alpha': alpha_candidates}

grid_nb = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, cv=5)
grid_nb.fit(X_train, y_train)

print("Best NB Alpha:", grid_nb.best_params_)


Best NB Alpha: {'alpha': 0.1}


## Evaluate & Interpret Results

In [None]:
from sklearn.metrics import classification_report

# Re-score the held-out split with the tuned Naive Bayes model and the two
# models fitted earlier.
y_pred_nb = grid_nb.best_estimator_.predict(X_test)
y_pred_lr = log_reg.predict(X_test)
y_pred_sgd = sgd_svm.predict(X_test)

for report_title, predictions in (
    ("Naïve Bayes Report:", y_pred_nb),
    ("Logistic Regression Report:", y_pred_lr),
    ("SGD SVM Report:", y_pred_sgd),
):
    # The title goes on its own line: print's default separator would put a
    # space in front of the report and misalign its header row.
    print(report_title)
    # zero_division=0 reports never-predicted genres as 0.0 without emitting
    # a warning for each undefined metric.
    print(classification_report(y_test, predictions, zero_division=0))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Naïve Bayes Report:
               precision    recall  f1-score   support

      action       0.40      0.01      0.01       263
       adult       1.00      0.01      0.02       112
   adventure       0.00      0.00      0.00       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.53      0.26      0.34      1443
       crime       0.00      0.00      0.00       107
 documentary       0.53      0.86      0.65      2659
       drama       0.40      0.83      0.54      2697
      family       0.00      0.00      0.00       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.17      0.30        40
     history       0.00      0.00      0.00        45
      horror       0.59      0.10      0.17       431
       music       0.65      0.23      0.34       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0.00        56
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
