In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

In [9]:
genre_list = [ 'action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war', 'western' ]

In [10]:
fallback_genre = 'Unknown'

In [11]:
try:
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        train_data = pd.read_csv("/content/drive/MyDrive/train_data.txt", sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading train_data: {e}")
    raise

Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 116.98it/s]


In [None]:
train_data

Unnamed: 0,SerialNumber,MOVIE_NAME,GENRE,MOVIE_PLOT
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [12]:
X_train = train_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())
genre_labels = []
for genre in train_data['GENRE']:
    if isinstance(genre, str):
        genre_labels.append(genre.split(', '))
    else:
        genre_labels.append([])

In [None]:
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=6000)

In [None]:
with tqdm(total=50, desc="Vectorizing Training Data") as pbar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    pbar.update(50)

Vectorizing Training Data: 100%|██████████| 50/50 [00:06<00:00,  7.62it/s]


In [None]:
with tqdm(total=50, desc="Training Model") as pbar:
    naive_bayes = MultinomialNB()
    multi_output_classifier = MultiOutputClassifier(naive_bayes)
    multi_output_classifier.fit(X_train_tfidf, y_train)
    pbar.update(50)

Training Model: 100%|██████████| 50/50 [00:01<00:00, 32.17it/s]


In [None]:
try:
    with tqdm(total=50, desc="Loading Test Data") as pbar:
        test_data = pd.read_csv('train_data.txt', sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading test_data: {e}")
    raise

Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 53.10it/s]


In [None]:
X_test = test_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())

In [None]:
with tqdm(total=50, desc="Vectorizing Test Data") as pbar:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    pbar.update(50)

with tqdm(total=50, desc="Predicting on Test Data") as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(50)

Vectorizing Test Data: 100%|██████████| 50/50 [00:07<00:00,  6.55it/s]
Predicting on Test Data: 100%|██████████| 50/50 [00:00<00:00, 94.14it/s]


In [None]:
test_movie_names = test_data['MOVIE_NAME']
predicted_genres = mlb.inverse_transform(y_pred)
test_results = pd.DataFrame({'MOVIE_NAME': test_movie_names, 'PREDICTED_GENRES': predicted_genres})

NameError: name 'test_data' is not defined

In [None]:
test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(lambda genres: [fallback_genre] if len(genres) == 0 else genres)

In [None]:
with open("model_evaluation.txt", "w", encoding="utf-8") as output_file:
    for _, row in test_results.iterrows():
        movie_name = row['MOVIE_NAME']
        genre_str = ', '.join(row['PREDICTED_GENRES'])
        output_file.write(f"{movie_name} ::: {genre_str}\n")

In [None]:
y_train_pred = multi_output_classifier.predict(X_train_tfidf)

accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, average='micro')
recall = recall_score(y_train, y_train_pred, average='micro')
f1 = f1_score(y_train, y_train_pred, average='micro')

In [None]:
    print(f"Accuracy: {accuracy * 100:.2f}%\n")
    print(f"Precision: {precision:.2f}\n")
    print(f"Recall: {recall:.2f}\n")
    print(f"F1-score: {f1:.2f}\n")

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Load train data
try:
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        train_data = pd.read_csv("/content/drive/MyDrive/train_data.txt", sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading train_data: {e}")
    raise

# Preprocess train data
X_train = train_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())
genre_labels = train_data['GENRE'].str.split(',').fillna('').apply(lambda x: [genre.strip() for genre in x])
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', token_pattern=r'\b\w{2,}\b')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)


Loading Train Data: 100%|██████████| 50/50 [00:01<00:00, 49.36it/s]


In [6]:
# Train Random Forest Classifier
with tqdm(total=50, desc="Training Model") as pbar:
    rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
    multi_output_classifier = MultiOutputClassifier(rf_classifier)
    multi_output_classifier.fit(X_train_tfidf, y_train)
    pbar.update(50)


Training Model: 100%|██████████| 50/50 [11:10<00:00, 13.42s/it]
Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 98.97it/s]
Predicting on Test Data: 100%|██████████| 50/50 [00:53<00:00,  1.06s/it]


Accuracy: 99.28%
Precision: 1.00
Recall: 0.99
F1-score: 1.00


In [13]:

# Load test data
try:
    with tqdm(total=50, desc="Loading Test Data") as pbar:
        test_data = pd.read_csv('/content/drive/MyDrive/train_data.txt', sep=':::', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading test_data: {e}")
    raise

# Preprocess test data
X_test = test_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())
genre_labels_test = test_data['GENRE'].str.split(',').fillna('').apply(lambda x: [genre.strip() for genre in x])

# TF-IDF vectorization for test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Predict on test data
with tqdm(total=50, desc="Predicting on Test Data") as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(50)

# Inverse transform predictions to genre labels
predicted_genres = mlb.inverse_transform(y_pred)

# Evaluate model performance
accuracy = accuracy_score(y_train, multi_output_classifier.predict(X_train_tfidf))
precision = precision_score(y_train, multi_output_classifier.predict(X_train_tfidf), average='micro')
recall = recall_score(y_train, multi_output_classifier.predict(X_train_tfidf), average='micro')
f1 = f1_score(y_train, multi_output_classifier.predict(X_train_tfidf), average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")


Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 151.99it/s]
Predicting on Test Data: 100%|██████████| 50/50 [00:51<00:00,  1.03s/it]


Accuracy: 99.28%
Precision: 1.00
Recall: 0.99
F1-score: 1.00
