In [1]:
from google.colab import files
uploaded = files.upload()

Saving train_data.txt to train_data (1).txt


**Load and Parse Data**

In [2]:
import pandas as pd

def load_movie_data(file_path):
    rows = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(":::")
            if len(parts) == 4:
                _, title, genre, plot = parts
                rows.append({
                    "title": title.strip(),
                    "genre": genre.strip().lower(),
                    "plot": plot.strip()
                })
    return pd.DataFrame(rows)

df = load_movie_data("train_data.txt")
df.head()


Unnamed: 0,title,genre,plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


**Clean and Preprocess Text (Professional Style)**

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()                                      # Lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text)                  # Remove punctuation and numbers
    tokens = text.split()                                    # Tokenize
    tokens = [t for t in tokens if t not in stop_words]      # Remove stopwords
    tokens = [stemmer.stem(t) for t in tokens]               # Stemming
    return " ".join(tokens)

df["clean_plot"] = df["plot"].apply(clean_text)
df[["genre", "clean_plot"]].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,genre,clean_plot
0,drama,listen convers doctor parent yearold oscar lea...
1,thriller,brother sister past incestu relationship curre...
2,adult,bu empti student field trip museum natur histo...
3,drama,help unemploy father make end meet edith twin ...
4,drama,film titl refer unrecov bodi ground zero also ...


**Feature Extraction using TF-IDF**

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_plot"])
y = df["genre"]


**Split Data**

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


**Train ML Models**

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("🔹 Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


🔹 Logistic Regression Accuracy: 0.5808791884414387
              precision    recall  f1-score   support

      action       0.50      0.27      0.35       395
       adult       0.71      0.28      0.40       177
   adventure       0.61      0.13      0.21       233
   animation       0.48      0.10      0.17       149
   biography       0.00      0.00      0.00        79
      comedy       0.53      0.60      0.56      2234
       crime       0.32      0.05      0.08       152
 documentary       0.67      0.84      0.74      3929
       drama       0.54      0.77      0.63      4084
      family       0.49      0.09      0.14       235
     fantasy       0.33      0.01      0.02        97
   game-show       0.82      0.40      0.53        58
     history       0.00      0.00      0.00        73
      horror       0.67      0.57      0.61       661
       music       0.61      0.41      0.49       219
     musical       0.50      0.01      0.02        83
     mystery       0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Naive Bayes**

In [7]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

print("🔹 Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


🔹 Naive Bayes Accuracy: 0.5122041192745158
              precision    recall  f1-score   support

      action       0.49      0.04      0.08       395
       adult       0.56      0.05      0.09       177
   adventure       0.71      0.05      0.10       233
   animation       0.00      0.00      0.00       149
   biography       0.00      0.00      0.00        79
      comedy       0.52      0.41      0.46      2234
       crime       0.00      0.00      0.00       152
 documentary       0.56      0.88      0.68      3929
       drama       0.45      0.82      0.58      4084
      family       0.00      0.00      0.00       235
     fantasy       0.00      0.00      0.00        97
   game-show       1.00      0.02      0.03        58
     history       0.00      0.00      0.00        73
      horror       0.80      0.31      0.45       661
       music       0.55      0.03      0.05       219
     musical       0.00      0.00      0.00        83
     mystery       0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Support Vector Machine (SVM)**

In [8]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("🔹 SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


🔹 SVM Accuracy: 0.5658776513987089
              precision    recall  f1-score   support

      action       0.36      0.29      0.32       395
       adult       0.58      0.41      0.48       177
   adventure       0.39      0.20      0.27       233
   animation       0.30      0.15      0.20       149
   biography       0.00      0.00      0.00        79
      comedy       0.52      0.57      0.55      2234
       crime       0.25      0.09      0.13       152
 documentary       0.68      0.81      0.74      3929
       drama       0.56      0.68      0.61      4084
      family       0.32      0.16      0.21       235
     fantasy       0.14      0.04      0.06        97
   game-show       0.82      0.55      0.66        58
     history       0.20      0.04      0.07        73
      horror       0.62      0.61      0.61       661
       music       0.54      0.50      0.52       219
     musical       0.19      0.04      0.06        83
     mystery       0.12      0.04      0.06   

**Predict Genre for New Plot**

In [9]:
sample_plot = "A young girl discovers she has magical powers and must learn to use them to defeat an evil wizard."
clean_input = clean_text(sample_plot)
vector_input = vectorizer.transform([clean_input])

predicted_genre = lr.predict(vector_input)  # You can also try svm or nb here
print("📌 Predicted Genre:", predicted_genre[0])


📌 Predicted Genre: fantasy
