In [None]:
# Mount Google Drive inside the Colab runtime. The mount point given here,
# '/content/drive', is the root under which Drive files become reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re

In [None]:
# Locations of the three input files on the mounted Drive:
# training data, test data, and the reference labels for the test data.
train_path, test_path, solution_path = (
    '/content/drive/MyDrive/mgc/train_data.txt',
    '/content/drive/MyDrive/mgc/test_data.txt',
    '/content/drive/MyDrive/mgc/test_data_solution.txt',
)

In [None]:
def load_data(path):
    """Read a ':::'-delimited text file into a DataFrame.

    The separator pattern also consumes any whitespace around ':::' so that
    padded fields such as ' drama ' are loaded as 'drama' instead of keeping
    the padding inside the value.

    Args:
        path: Location of the UTF-8 encoded file (no header row).

    Returns:
        A DataFrame with the columns 'ID', 'Genre' and 'Plot'.

    NOTE(review): this assumes every line carries exactly three fields in
    that order. If a file has a different number of fields, the column names
    will not line up with the content as intended -- confirm against the data.
    """
    return pd.read_csv(
        path,
        # A multi-character separator is treated as a regular expression,
        # which is why the python parsing engine is requested.
        sep=r'\s*:::\s*',
        header=None,
        names=['ID', 'Genre', 'Plot'],
        engine='python',
        encoding='utf-8',
    )

# Load the training split, the test split and the test reference labels,
# in that order, each through load_data.
train_df, test_df, solution_df = (
    load_data(split_path)
    for split_path in (train_path, test_path, solution_path)
)

In [None]:
def clean_text(text):
    """Normalize a plot description before vectorization.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, and the result is lower-cased.

    Args:
        text: The raw value. Non-string values are converted with ``str``.

    Returns:
        The cleaned, lower-cased string. Missing values (``None`` or a float
        NaN) yield ``''`` so that they do not turn into the literal tokens
        ``'none'`` / ``'nan'``.
    """
    # NaN is the only float value that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    return stripped.lower()

# Add a 'clean_plot' column holding the normalized 'Plot' text
# to both the training and the test frame.
for frame in (train_df, test_df):
    frame['clean_plot'] = frame['Plot'].apply(clean_text)

In [None]:
def _split_genres(value):
    """Convert one raw '|'-separated genre field into a list of labels.

    Labels are stripped of surrounding whitespace and empty labels are
    dropped, so a missing genre gives ``[]`` rather than ``['']``. A value
    that is already a list or tuple is passed through, which keeps this cell
    safe to run more than once.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    # NaN is the only float value that compares unequal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return []
    return [label.strip() for label in str(value).split('|') if label.strip()]


# Replace the raw genre strings with lists of labels in both frames.
train_df['Genre'] = train_df['Genre'].apply(_split_genres)
solution_df['Genre'] = solution_df['Genre'].apply(_split_genres)

# Learn the label set from the training genres only and encode them as a
# binary indicator matrix (one column per entry of mlb.classes_).
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df['Genre'])

In [None]:
# NOTE(review): clean_text is also defined in an earlier cell. This later
# definition is the one in effect from here on; consider keeping only one.
def clean_text(text):
    """Normalize a plot description before vectorization.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, and the result is lower-cased.

    Args:
        text: The raw value. Non-string values are converted with ``str``.

    Returns:
        The cleaned, lower-cased string. Missing values (``None`` or a float
        NaN) yield ``''`` so that they do not turn into the literal tokens
        ``'none'`` / ``'nan'``.
    """
    # NaN is the only float value that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    return stripped.lower()

# (Re)build the 'clean_plot' column of both frames from 'Plot'.
# NOTE(review): an earlier cell computes the same columns; one of the two
# cells looks redundant.
for split_frame in (train_df, test_df):
    split_frame['clean_plot'] = split_frame['Plot'].apply(clean_text)

In [None]:
# TF-IDF features with max_features=5000. The vectorizer is fitted on the
# training plots only; the test plots are transformed with that fitted state.
tfidf = TfidfVectorizer(max_features=5000)
train_corpus, test_corpus = train_df['clean_plot'], test_df['clean_plot']
X_train = tfidf.fit_transform(train_corpus)
X_test = tfidf.transform(test_corpus)

In [None]:
# One-vs-rest wrapper around a logistic regression (max_iter=1000),
# trained on the TF-IDF features and the binarized genre labels.
base_estimator = LogisticRegression(max_iter=1000)
model = OneVsRestClassifier(base_estimator)
model.fit(X_train, y_train)


In [None]:
# Predict labels for the vectorized test plots with the fitted model.
y_pred = model.predict(X_test)

In [None]:
# Encode the reference genres with the binarizer that was fitted on the
# training labels, so that the columns match those of y_pred.
y_true = mlb.transform(solution_df['Genre'])

# For multi-label indicator input, accuracy_score is subset accuracy: a row
# only counts as correct when its whole label set matches.
print("Accuracy:", accuracy_score(y_true, y_pred))

# zero_division=0 reports 0.0 for labels that are never predicted without
# emitting an undefined-metric warning for each of them.
print(
    "\nClassification Report:\n",
    classification_report(
        y_true,
        y_pred,
        target_names=mlb.classes_,
        zero_division=0,
    ),
)

Accuracy: 0.35988929889298893

Classification Report:
                precision    recall  f1-score   support

      action        0.75      0.05      0.10      1314
       adult        0.58      0.07      0.13       590
   adventure        0.77      0.06      0.12       775
   animation        0.80      0.01      0.02       498
   biography        0.00      0.00      0.00       264
      comedy        0.73      0.28      0.40      7446
       crime        0.50      0.00      0.00       505
 documentary        0.80      0.69      0.74     13096
       drama        0.69      0.47      0.56     13612
      family        0.86      0.01      0.02       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.94      0.34      0.50       193
     history        0.00      0.00      0.00       243
      horror        0.81      0.29      0.42      2204
       music        0.78      0.24      0.37       731
     musical        0.00      0.00      0.00       276
     myst

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def predict_genre_force_one(plot_text, top_k=3, threshold=0.1):
    """Predict genre labels for one plot, always returning at least one.

    The text is cleaned with ``clean_text``, vectorized with the fitted
    ``tfidf`` and scored with ``model.predict_proba``.

    Args:
        plot_text: Raw plot description.
        top_k: Maximum number of labels to return. Values below 1 are treated
            as 1; ``None`` considers every label.
        threshold: A candidate label is kept only when its probability is
            strictly greater than this value.

    Returns:
        A list of labels taken from ``mlb.classes_``, ordered from most to
        least probable. When no candidate clears ``threshold``, the single
        most probable label is returned instead.
    """
    if top_k is not None:
        # With fewer than one candidate the fallback below would have
        # nothing to pick from.
        top_k = max(1, top_k)

    cleaned = clean_text(plot_text)
    vector = tfidf.transform([cleaned])
    probs = model.predict_proba(vector)[0]

    # Indices of the labels sorted by descending probability, cut to top_k.
    top_indices = probs.argsort()[::-1][:top_k]
    predicted_labels = [mlb.classes_[i] for i in top_indices if probs[i] > threshold]

    if not predicted_labels:
        predicted_labels = [mlb.classes_[top_indices[0]]]

    return predicted_labels

In [None]:
# Interactive prompt: classify plots typed by the user until 'exit' is entered.
while True:
    try:
        user_input = input("Enter a movie plot (or type 'exit' to stop):\n")
    except EOFError:
        # The input stream is closed, so no further plots can arrive.
        break

    plot_text = user_input.strip()
    # Surrounding whitespace must not prevent the exit command from matching.
    if plot_text.lower() == 'exit':
        break
    if not plot_text:
        # Nothing to classify; ask again instead of scoring an empty string.
        continue

    genres = predict_genre_force_one(plot_text)
    print("Predicted Genre(s):", ', '.join(genres), '\n')


Enter a movie plot (or type 'exit' to stop):
he kissed her and she cried
Predicted Genre(s):  drama ,  short  

Enter a movie plot (or type 'exit' to stop):
she found the mystery box with a key inside
Predicted Genre(s):  short ,  documentary ,  mystery  

Enter a movie plot (or type 'exit' to stop):
exit


In [None]:
# Turn the predicted indicator matrix back into one tuple of labels per row.
predicted_genres = mlb.inverse_transform(y_pred)

# One output row per test row: its ID and the predicted labels joined by '|'
# (an empty string when no label was predicted for that row).
joined_labels = list(map('|'.join, predicted_genres))
result_df = pd.DataFrame({'ID': test_df['ID'], 'Predicted_Genre': joined_labels})

result_df.to_csv('/content/drive/MyDrive/mgc/predicted_output.csv', index=False)
print("Predictions saved to Drive!")

Predictions saved to Drive!
