# Imports

### Sources
https://scikit-learn.org/ <br>
https://www.nltk.org/ <br>
https://docs.python.org/3/library/re.html <br>
https://pandas.pydata.org/ <br>

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
# 'punkt': tokenizer models used to divide text into a list of sentences or words.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm against the installed version.
nltk.download("punkt")
# 'wordnet': large lexical database of English used by the WordNetLemmatizer.
nltk.download("wordnet")
# 'stopwords': lists of stop words for various languages (the English list is used for filtering).
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the training set; the columns 'title' and 'plot_synopsis' are read below.
file_path = './data/Training-dataset.csv'
data = pd.read_csv(file_path)
# Combine title and plot synopsis into one text field per row.
# Missing values are replaced by empty strings first: str + NaN yields NaN,
# and a NaN 'combined_text' would make the regex-based pre-processing fail
# with a TypeError because it expects a string.
data['combined_text'] = data['title'].fillna('') + " " + data['plot_synopsis'].fillna('')

# Data Cleaning and Pre-processing

In [5]:
from nltk.stem import WordNetLemmatizer
# English stop words stored as a set so membership checks during filtering are cheap.
english_stopwords = set(stopwords.words('english'))
# One lemmatizer instance shared by every call to preprocess_and_lemmatize.
lemmatizer = WordNetLemmatizer()

def preprocess_and_lemmatize(word):
    """Normalise a piece of text and return it as tokens and as a single string.

    Steps: every non-alphabetical character is replaced by a space, the text is
    lower-cased and tokenized, English stop words are dropped and each
    remaining token is lemmatized.

    Args:
        word: The raw text to normalise (despite the name, callers pass a whole
            document: title plus synopsis).

    Returns:
        A tuple ``(tokens, text)`` where ``tokens`` is the list of lemmatized
        tokens and ``text`` is those tokens joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', word).lower()

    kept_tokens = []
    for token in nltk.word_tokenize(letters_only):
        if token in english_stopwords:
            # Stop words are filtered out before lemmatization.
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return kept_tokens, ' '.join(kept_tokens)


# Pre-process every training document exactly once.
# Iterating over the column values (instead of range(len(data)) combined with
# label-based .loc/.at access) does not depend on the DataFrame having a
# default 0..n-1 index, and no "" placeholder has to be written first to be
# able to store a list in a cell.
processed_documents = [preprocess_and_lemmatize(text) for text in data['combined_text']]

# Wrap the results in Series aligned on data.index so that each token list is
# stored as one cell value of an object column.
data['normalized_list_of_words'] = pd.Series(
    [tokens for tokens, _ in processed_documents], index=data.index, dtype=object
)
data['normalized_combined_text'] = pd.Series(
    [joined for _, joined in processed_documents], index=data.index, dtype=object
)


# Concatenate all normalised training documents into one string and tokenize
# it again, giving the tokens of the whole training corpus.
# NOTE(review): despite its name, unique_words_string is not de-duplicated here
# (no set() is applied); it does not appear to be used by the later cells.
long_string = ' '.join(data['normalized_combined_text'])
unique_words_string = nltk.word_tokenize(long_string)


# Names of the label columns; their order fixes the column order of y and of
# every prediction file written further down.
genre_columns = [
    'comedy',
    'cult',
    'flashback',
    'historical',
    'murder',
    'revenge',
    'romantic',
    'scifi',
    'violence',
]
# Label matrix: one row per training document, one column per genre
y = data[genre_columns].to_numpy()

# Read the validation dataset and pre-process it like the training data:
# title and synopsis are joined with a space and then normalised.
# NOTE(review): unlike the training and test files this path has no './data/'
# prefix - confirm the file really lives in the working directory.
data_dev = pd.read_csv('Task-2-validation-dataset.csv')
# Missing titles/synopses are treated as empty strings; concatenating a str
# with NaN row by row would raise a TypeError.
data_dev['normalized_combined_text'] = (
    (data_dev['title'].fillna('') + ' ' + data_dev['plot_synopsis'].fillna(''))
    .apply(lambda text: preprocess_and_lemmatize(text)[1])
)

# **Task 2**

## Approach 1: Naïve Bayes

### Sources
https://youtu.be/jS1CKhALUBQ?si=hSdzqpEUyiO-03OO <br>


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Naive Bayes pipeline with fixed hyper-parameters (max_df, ngram_range, alpha
# and fit_prior are the values covered by the grid search kept below):
#   'tfidf' - TF-IDF over unigrams and bigrams, capped at 10000 features,
#             ignoring terms that occur in more than 75% of the documents
#   'clf'   - one MultinomialNB per genre column via MultiOutputClassifier
pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(max_features=10000, max_df=0.75, ngram_range=(1, 2)),
        ),
        (
            'clf',
            MultiOutputClassifier(MultinomialNB(alpha=0.01, fit_prior=False)),
        ),
    ]
)

# Train on the normalised training texts and the genre label matrix
pipeline.fit(data['normalized_combined_text'], y)

# GRID SEARCH METHOD USED TO FIND THE BEST PARAMS
# # Define the pipeline
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer()),  # Text Vectorization
#     ('clf', MultiOutputClassifier(MultinomialNB()))       # Classifier
# ])

# # Parameter grid to search
# parameter_grid = {
#     'tfidf__max_df': (0.75, 0.85, 1.0),
#     'tfidf__ngram_range': [(1, 1), (1, 2)],  # unigrams or bigrams
#     'clf__estimator__alpha': [0.01, 0.1, 1],
#     'clf__estimator__fit_prior': [True, False]
# }

# # Set up the grid search
# grid_search = GridSearchCV(pipeline, parameter_grid, cv=3, scoring='accuracy')
# grid_search.fit(data['normalized_combined_text'], y)
# # Retrieve the best model from grid search
# best_grid_model = grid_search.best_estimator_
# naives_results = best_grid_model.predict(data_dev['normalized_combined_text'])

### Development Dataset

In [None]:
# Predict the genre labels of the validation documents with the fitted pipeline
naives_results = pipeline.predict(data_dev['normalized_combined_text'])

# One record per document: its ID followed by the predicted value of every
# genre, in genre_columns order
results = [
    {'doc_id': doc_id, **{genre: labels[pos] for pos, genre in enumerate(genre_columns)}}
    for doc_id, labels in zip(data_dev['ID'], naives_results)
]

# Tabulate the records
results_df = pd.DataFrame(results)

# Write the predictions as CSV without index column and without header row
results_df.to_csv('10556516-Task2-method-a-validation.csv', index=False, header=False)


### Test Dataset

In [22]:
# Read the test dataset and pre-process it like the training data:
# title and synopsis are joined with a space and then normalised.
data_test = pd.read_csv('./data/Task-2-test-dataset1.csv')
# Missing titles/synopses are treated as empty strings; concatenating a str
# with NaN row by row would raise a TypeError.
data_test['normalized_combined_text'] = (
    (data_test['title'].fillna('') + ' ' + data_test['plot_synopsis'].fillna(''))
    .apply(lambda text: preprocess_and_lemmatize(text)[1])
)

# Predict the genre labels of the test documents with the fitted pipeline
naives_results = pipeline.predict(data_test['normalized_combined_text'])

# One record per document: its ID followed by the predicted value of every
# genre, in genre_columns order
results = [
    {'doc_id': doc_id, **{genre: labels[pos] for pos, genre in enumerate(genre_columns)}}
    for doc_id, labels in zip(data_test['ID'], naives_results)
]

# Tabulate the records
results_df = pd.DataFrame(results)

# Write the predictions as CSV without index column and without header row
results_df.to_csv('10556516-Task2-method-a.csv', index=False, header=False)


## Approach 2: bi-directional LSTM

### Sources
https://github.com/krishnaik06/Fake-New-LSTM/tree/master <br>
https://www.youtube.com/watch?v=RpTmnRGJvRQ&ab_channel=KrishNaik



In [16]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [17]:
# Vocabulary size limit handed to the tokenizer (and reused as the Embedding
# input dimension), and the fixed length every document sequence is padded to.
MAX_FEATURES = 20000
MAX_SEQUENCE_LENGTH = 1000

# Tokenizer restricted to the MAX_FEATURES vocabulary limit
tokenizer = Tokenizer(num_words=MAX_FEATURES)

# Build the word index from the normalized training texts only; the same
# fitted tokenizer is reused for the validation and test sets.
tokenizer.fit_on_texts(data['normalized_combined_text'])

# Convert each training document into a sequence of integer word ids
sequences = tokenizer.texts_to_sequences(data['normalized_combined_text'])

# Bring every sequence to MAX_SEQUENCE_LENGTH so the model gets uniform input
# (pad_sequences defaults are used for the padding/truncation side).
embedded_docs = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [18]:
# Bidirectional LSTM network for multi-label genre classification.
lstm_model = Sequential()

# Embedding layer: MAX_FEATURES as input dimension, 256-dimensional vectors,
# input sequences of length MAX_SEQUENCE_LENGTH.
# (Smaller alternative: output_dim=128.)
lstm_model.add(Embedding(MAX_FEATURES, output_dim=256, input_length=MAX_SEQUENCE_LENGTH))

# Bidirectional wrapper around an LSTM layer with 256 units.
# (Smaller alternative: LSTM(64).)
lstm_model.add(Bidirectional(LSTM(256)))

# Output layer: one sigmoid unit per genre so each label is predicted
# independently. The width is derived from genre_columns instead of a
# hard-coded 9 so it cannot drift from the label matrix.
lstm_model.add(Dense(len(genre_columns), activation='sigmoid'))

# Binary cross-entropy treats every genre as its own yes/no decision, which
# suits multi-label classification; Adam optimizer, accuracy as metric.
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [19]:
# Train the Bi-LSTM on the padded training sequences and the genre label
# matrix: 10 epochs, mini-batches of 32, with validation_split=0.05 reserving
# 5% of the samples for validation.
lstm_model.fit(embedded_docs, y, batch_size=32, epochs=10, validation_split=0.05)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79971695d630>

### Development Dataset

In [20]:
import numpy as np

# Encode the validation texts with the tokenizer fitted on the training data
sequences_dev = tokenizer.texts_to_sequences(data_dev['normalized_combined_text'])
# Same fixed sequence length as used for training
embedded_docs_dev = pad_sequences(sequences_dev, maxlen=MAX_SEQUENCE_LENGTH)

# Sigmoid outputs are rounded to the nearest integer and cast to int to obtain
# 0/1 labels per genre
lstm_results = np.round(lstm_model.predict(embedded_docs_dev)).astype(int)

# One record per document: its ID followed by the predicted value of every
# genre, in genre_columns order
results = [
    {'doc_id': doc_id, **{genre: labels[pos] for pos, genre in enumerate(genre_columns)}}
    for doc_id, labels in zip(data_dev['ID'], lstm_results)
]

# Tabulate the records
results_df = pd.DataFrame(results)

# Write the predictions as CSV without index column and without header row
results_df.to_csv('10556516-Task2-method-b-validation.csv', index=False, header=False)




### Test Dataset

In [23]:
import numpy as np

# Encode the test texts with the tokenizer fitted on the training data
sequences_test = tokenizer.texts_to_sequences(data_test['normalized_combined_text'])
# Same fixed sequence length as used for training
embedded_docs_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

# Sigmoid outputs are rounded to the nearest integer and cast to int to obtain
# 0/1 labels per genre
lstm_results = np.round(lstm_model.predict(embedded_docs_test)).astype(int)

# One record per document: its ID followed by the predicted value of every
# genre, in genre_columns order
results = [
    {'doc_id': doc_id, **{genre: labels[pos] for pos, genre in enumerate(genre_columns)}}
    for doc_id, labels in zip(data_test['ID'], lstm_results)
]

# Tabulate the records
results_df = pd.DataFrame(results)

# Write the predictions as CSV without index column and without header row
results_df.to_csv('10556516-Task2-method-b.csv', index=False, header=False)


