In [50]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm



In [51]:
genre_list=['action','adult','animation','biography','comedy','crime','documnetry','family','fantasy','game-show','history','horror','music','musical','mystery','news','reality-tv','romance','sci-fi','short','sport','talk-show','thriller','war','western']

In [52]:
fallback_genre='Unknown'

In [53]:
try:
    with tqdm(total=50, desc="Loading Train Data") as pbar:
        train_data = pd.read_csv('train_data.txt.zip', sep='111', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading train_data: {e}")
    raise
print("Shape of train_data:", train_data.shape)
print(train_data.head())
print(train_data.isnull().sum())



Loading Train Data: 100%|██████████| 50/50 [00:01<00:00, 46.47it/s]

Shape of train_data: (54214, 4)
                                        SerialNumber MOVIE_NAME GENRE  \
0  1 ::: Oscar et la dame rose (2009) ::: drama :...       None  None   
1  2 ::: Cupid (1997) ::: thriller ::: A brother ...       None  None   
2  3 ::: Young, Wild and Wonderful (1980) ::: adu...       None  None   
3  4 ::: The Secret Sin (1915) ::: drama ::: To h...       None  None   
4  5 ::: The Unrecovered (2007) ::: drama ::: The...       None  None   

  MOVIE_PLOT  
0       None  
1       None  
2       None  
3       None  
4       None  
SerialNumber      111
MOVIE_NAME      54011
GENRE           54211
MOVIE_PLOT      54213
dtype: int64





In [54]:
X_train = train_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())
print("Shape of X_train:", X_train.shape)
train_data['GENRE'] = train_data['GENRE'].fillna('')
print(train_data.head())
genre_labels = [genre.split(', ') for genre in train_data['GENRE']]
print("Number of genre labels:", len(genre_labels))
print(genre_labels[:5])
mlb = MultiLabelBinarizer()
Y_train = mlb.fit_transform(genre_labels)
assert len(X_train) == len(genre_labels), "Number of samples in X_train and genre_labels do not match."


Shape of X_train: (54214,)
                                        SerialNumber MOVIE_NAME GENRE  \
0  1 ::: Oscar et la dame rose (2009) ::: drama :...       None         
1  2 ::: Cupid (1997) ::: thriller ::: A brother ...       None         
2  3 ::: Young, Wild and Wonderful (1980) ::: adu...       None         
3  4 ::: The Secret Sin (1915) ::: drama ::: To h...       None         
4  5 ::: The Unrecovered (2007) ::: drama ::: The...       None         

  MOVIE_PLOT  
0       None  
1       None  
2       None  
3       None  
4       None  
Number of genre labels: 54214
[[''], [''], [''], [''], ['']]


In [55]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [56]:
with tqdm(total=50, desc="Vectorizing Training Data") as pbar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    pbar.update(50)
print("Shape of X_train_tfidf:", X_train_tfidf.shape)
    

Vectorizing Training Data: 100%|██████████| 50/50 [00:00<00:00, 156.88it/s]

Shape of X_train_tfidf: (54214, 116)





In [57]:
with tqdm(total=50, desc="Training Model") as pbar:
    naive_bayes = MultinomialNB()
    multi_output_classifier = MultiOutputClassifier(naive_bayes)
    multi_output_classifier.fit(X_train_tfidf, Y_train)
    pbar.update(50)

Training Model: 100%|██████████| 50/50 [00:00<00:00, 520.30it/s]


In [58]:
try:
    with tqdm(total=50, desc="Loading Test Data") as pbar:
        test_data = pd.read_csv('test_data.txt.zip', sep='111', header=None, names=['SerialNumber', 'MOVIE_NAME', 'GENRE', 'MOVIE_PLOT'], engine='python')
        pbar.update(50)
except Exception as e:
    print(f"Error loading test_data: {e}")
    raise

Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 54.58it/s]


In [59]:
X_test = test_data['MOVIE_PLOT'].astype(str).apply(lambda doc: doc.lower())

In [60]:
with tqdm(total=50, desc="Vectorizing Test Data") as pbar:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    pbar.update(50)
    print("Shape of X_test_tfidf:", X_test_tfidf.shape)
    


Vectorizing Test Data: 100%|██████████| 50/50 [00:00<00:00, 163.19it/s]

Shape of X_test_tfidf: (54200, 116)





In [61]:
with tqdm(total=50, desc="Predicting Test Data") as pbar:
    Y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(50)

Predicting Test Data: 100%|██████████| 50/50 [00:00<00:00, 2598.28it/s]


In [62]:
# Assuming test_data is a DataFrame containing the column 'MOVIE_NAME'
test_movie_names = test_data['MOVIE_NAME']  # Access the MOVIE_NAME column correctly

# Use the inverse_transform method of MultiLabelBinarizer to get the predicted genres
predicted_genres = mlb.inverse_transform(Y_pred)

# Create a DataFrame with the movie names and their predicted genres
test_results = pd.DataFrame({
    'MOVIE_NAME': test_movie_names,
    'PREDICTED_GENRES': ['; '.join(genres) for genres in predicted_genres]
})


In [63]:
fallback_genre = 'Unknown'  # If you prefer a single string as a fallback genre
test_results['PREDICTED_GENRES'] = test_results['PREDICTED_GENRES'].apply(lambda genres: [fallback_genre] if len(genres) == 0 else genres)


In [64]:
import zipfile

In [65]:
output_filename = "test_data_solution.txt"

In [66]:
with zipfile.ZipFile("test_data_solution.txt.zip", "w", zipfile.ZIP_DEFLATED) as zip_file:
    with zip_file.open("test_data_solution.txt", "w") as output_file:
        for index, row in test_results.iterrows():
            movie_name = row['MOVIE_NAME']
            genre_str = ', '.join(row['PREDICTED_GENRES'])
            output_file.write(f"{movie_name} ::: {genre_str}\n".encode('utf-8'))

In [67]:
Y_train_pred = multi_output_classifier.predict(X_train_tfidf)

In [68]:
accuracy=accuracy_score(Y_train,Y_train_pred)
precision=precision_score(Y_train,Y_train_pred,average='micro')
recall=recall_score(Y_train,Y_train_pred,average='micro')
f1=f1_score(Y_train,Y_train_pred,average='micro')


In [69]:
with open("test_data_solution.txt", "a", encoding="utf-8") as output_file:
    output_file.write("\n\nEvaluation:\n")
    output_file.write(f"Accuracy: {accuracy * 100:.2f}%\n")
    output_file.write(f"Precision: {precision:.2f}\n")
    output_file.write(f"Recall: {recall:.2f}\n")
    output_file.write(f"F1 Score: {f1:.2f}\n")


In [70]:
print("Evaluation has been saved to 'test_data_solution.txt'.")


Evaluation has been saved to 'test_data_solution.txt'.


In [71]:
# Open the file for reading
with open('test_data_solution.txt', 'r', encoding='utf-8') as file:
    # Read the entire contents of the file
    saved_data = file.read()

# Print the saved data
print(saved_data)



Evaluation:
Accuracy: 99.99%
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


Evaluation:
Accuracy: 99.99%
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


Evaluation:
Accuracy: 99.99%
Precision: 1.00
Recall: 1.00
F1 Score: 1.00

