In [1]:
import pandas as pd
import unicodedata
import re
import string
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words

In [2]:
# Download the NLTK data packages needed by the imports above.
# Presumably: 'punkt' backs word_tokenize, 'wordnet' backs WordNetLemmatizer,
# and 'stopwords' / 'words' back the corpora of the same name — confirm against the NLTK docs.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
# Locations of the input CSV files, relative to the notebook's working directory.
# The training file is read once for fitting both models; the validation and test
# files are passed to the evaluate_* functions further down.
training_dataset_path = r"./data/Training-dataset.csv"
validation_dataset_path = r"./data/Task-2-validation-dataset.csv"
test_dataset_path = r"./data/Task-2-test-dataset1.csv"

Text pre-processing for both models:

In [4]:
# Shared pre-processing resources, built once and reused by process_text():
#   stop_words    - English stopword set
#   english_vocab - NLTK word list with the stopwords taken out
#   lemmatizer    - WordNet lemmatizer instance
stop_words = set(stopwords.words('english'))
english_vocab = set(words.words()).difference(stop_words)
lemmatizer = WordNetLemmatizer()

def remove_accents(data):
  """ Strips accents and reduces the input to ASCII letters and spaces.

      The string is NFKD-normalised so accented letters split into a base
      letter plus combining marks; only ASCII letters and the space character
      are then kept. Digits, punctuation and any other character are dropped.
  """
  decomposed = unicodedata.normalize('NFKD', data)

  kept = []
  for ch in decomposed:
    if ch == " " or ch in string.ascii_letters:
      kept.append(ch)

  return ''.join(kept)

def process_text(text):
  """ Tokenizes, normalises and lemmatizes a piece of text.

      Steps:
        1- Replace dashes with spaces and delete all other punctuation.
        2- Tokenize with NLTK's word_tokenize.
        3- Strip, lowercase, remove accents and lemmatize each token.
        4- Keep only tokens found in english_vocab (which already excludes stopwords).

      Args:
        text: the raw string to process. Non-string values (e.g. None, or the
              NaN pandas produces when a title/synopsis is missing) are treated
              as empty text.

      Returns:
        A list of cleaned tokens; an empty list for non-string input.
  """
  # A missing field makes `title + ' ' + plot_synopsis` evaluate to NaN (a float),
  # which would otherwise fail on .replace() below.
  if not isinstance(text, str):
    return []

  # Remove punctuation
  # replace dashes with ' ' and replace everything else with a ''.
  pattern = re.compile('[%s]' % re.escape(string.punctuation.replace('-', '')))
  text = pattern.sub('', text.replace('-', ' '))

  # Tokenize
  tokens = word_tokenize(text)

  # Remove extra whitespace, turn to lowercase, remove accents, lemmatize
  tokens = [lemmatizer.lemmatize(remove_accents(token.strip().lower())) for token in tokens]

  # Filter OOV words + stopwords
  tokens = [token for token in tokens if token in english_vocab]

  return tokens

In [5]:
# Read the training CSV into a DataFrame
training_df = pd.read_csv(training_dataset_path)

# Build a single cleaned text field per row: title and synopsis are joined,
# run through process_text(), and the resulting tokens are re-joined with spaces
training_df['text'] = (
    (training_df['title'] + ' ' + training_df['plot_synopsis'])
    .apply(process_text)
    .apply(' '.join)
)

**Model 1: a) A traditional classification method - Naïve Bayes classifier**

In [6]:
def evaluate_bayes_model(input_path, output_path):
  """ Runs the fitted naive Bayes pipeline on a validation/test CSV.

      Relies on the notebook-level `vectorizer` and `classifier` objects.

      Steps:
        1- Read the CSV at input_path.
        2- Build and pre-process the text field (title + plot synopsis).
        3- Vectorize the text and predict the labels.
        4- Write ID followed by the nine label columns to output_path,
           without a header row or index column.
  """
  label_names = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']

  # Load the data and prepare the text exactly as for training
  eval_df = pd.read_csv(input_path)
  combined = eval_df['title'] + ' ' + eval_df['plot_synopsis']
  eval_df['text'] = combined.apply(process_text).apply(' '.join)

  # Classify with the already-fitted vectorizer and classifier
  features = vectorizer.transform(eval_df['text'])
  predictions = pd.DataFrame(classifier.predict(features), columns=label_names)

  # Prepend the IDs and save
  result = pd.concat([eval_df['ID'], predictions], axis=1)
  result.to_csv(output_path, index=False, header=False)

In [7]:
# Target matrix: one binary column per label
label_columns = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']
labels = training_df[label_columns]

# ==================== Record the start time ====================
start_time = time.time()

# Term-count features; terms that occur in fewer than 10 documents are dropped
vectorizer = CountVectorizer(min_df=10)
term_doc_matrix = vectorizer.fit_transform(training_df['text'])

# One MultinomialNB per label, via MultiOutputClassifier, for multi-label prediction
# (fit() returns the fitted estimator itself)
classifier = MultiOutputClassifier(MultinomialNB()).fit(term_doc_matrix, labels)

# ==================== Record the end time ====================
end_time = time.time()

# Report how long vectorizing + fitting took
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 2.2311453819274902 seconds


Validation dataset:

In [8]:
# Record the start time (wall clock, via time.time())
start_time = time.time()

# Run the naive Bayes model on the validation dataset.
# The timed span covers everything evaluate_bayes_model does: reading the CSV,
# pre-processing, prediction and writing the output file.
evaluate_bayes_model(validation_dataset_path, '10768356-Task2-method-a-validation.csv')

# Record the end time
end_time = time.time()

# Calculate and print the elapsed time in seconds
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 9.850985765457153 seconds


Test dataset:

In [9]:
# Record the start time (wall clock, via time.time())
start_time = time.time()

# Run the naive Bayes model on the test dataset.
# The timed span covers everything evaluate_bayes_model does: reading the CSV,
# pre-processing, prediction and writing the output file.
evaluate_bayes_model(test_dataset_path, '10768356-Task2-method-a.csv')

# Record the end time
end_time = time.time()

# Calculate and print the elapsed time in seconds
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 10.004181146621704 seconds


**Model 2: b) A traditional deep learning method with Bi-LSTM**

In [10]:
def evaluate_LSTM_model(input_path, output_path, max_len=None):
  """ Evaluates the Bi-LSTM model on a validation/test set.

      Relies on the notebook-level `tokenizer`, `model` and (when max_len is
      not given) `x_train` objects created in the training cell.

      It does the following:
        1- Reads the input file
        2- Preprocesses the data.
        3- Create sequences for the input tokens, padded/truncated to the
           same length as the training sequences.
        4- Classify the data (a label is 1 when its sigmoid output is >= 0.5).
        5- Store the results in the specified path (ID followed by the nine
           label columns, no header row, no index column).

      Args:
        input_path: path of the CSV file to classify.
        output_path: path of the CSV file to write.
        max_len: sequence length to pad/truncate to. Defaults to the length of
                 the padded training sequences (x_train.shape[1]).
  """
  label_columns = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']

  validation_df = pd.read_csv(input_path)

  validation_df['text'] = validation_df['title'] + ' ' + validation_df['plot_synopsis']
  validation_df['text'] = validation_df['text'].apply(process_text)
  validation_df['text'] = validation_df['text'].apply(' '.join)

  sequences_val = tokenizer.texts_to_sequences(validation_df['text'])

  # Pad to the training length rather than to the longest document in this file.
  # Padding is not masked, so without a fixed length the amount of zero padding
  # (and therefore the prediction for a given document) would depend on the
  # other rows in the file. Longer documents are truncated by pad_sequences.
  if max_len is None:
    max_len = x_train.shape[1]
  x = pad_sequences(sequences_val, maxlen=max_len)

  # Threshold the sigmoid outputs into 0/1 labels
  y_pred = (model.predict(x) >= 0.5).astype(int)

  output_df = pd.DataFrame(y_pred, columns=label_columns)
  output_df = pd.concat([validation_df['ID'], output_df], axis=1)
  output_df.to_csv(output_path, index=False, header=False)

In [11]:
# Multi-hot label matrix: one row per training example, one column per label
y_train = training_df[['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']].values

# ==================== Record the start time ====================
start_time = time.time()

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_df['text'])
sequences_train = tokenizer.texts_to_sequences(training_df['text'])
x_train = pad_sequences(sequences_train)


# Define Adam optimiser with 0.001 learning rate
# (`learning_rate` replaces the deprecated `lr` argument name)
optimizer = Adam(learning_rate=0.001)

# Define a callback to stop the training when the validation loss has not improved
# for 2 consecutive epochs, and roll the model back to the weights of its best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Build the neural network:
# Set the embedding dimension for the word embeddings
embedding_dim = 150

# Tokenizer word indices start at 1 (0 is the padding value), so the largest index
# is len(tokenizer.word_index). The Embedding layer needs input_dim = largest index + 1,
# otherwise the last word in the vocabulary is out of range.
vocab_size = len(tokenizer.word_index) + 1

# Create a Sequential model
model = Sequential()

# Add an Embedding layer to the model
# - vocab_size: Number of unique words in the vocabulary + 1 for the padding index (input dimension)
# - embedding_dim: Dimensionality of the dense embedding
# - trainable=True: Allow the embedding weights to be updated during training
model.add(Embedding(vocab_size, embedding_dim, trainable=True))

# Add a Bidirectional LSTM layer to the model
# - 200: Number of LSTM units in each direction (400 units in total)
model.add(Bidirectional(LSTM(200)))

# Add a Dense layer to the model for the output
# - 9: Number of output units, corresponding to the 9 classes (activation='sigmoid' for multi-label classification)
model.add(Dense(9, activation='sigmoid'))

# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(x_train,                                  # Training input data
                    y_train,                                  # Training target data (labels)
                    epochs=10,                                # Maximum number of training epochs
                    batch_size=32,                            # Mini-batch size
                    validation_split=0.1,                     # Fraction of training data used for validation
                    callbacks=[early_stopping])               # Callback for early stopping


# ==================== Record the end time ====================
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Time taken: 295.84762740135193 seconds


Validation dataset:

In [12]:
# Record the start time (wall clock, via time.time())
start_time = time.time()

# Evaluate the Bi-LSTM model on the validation dataset.
# The timed span covers everything evaluate_LSTM_model does: reading the CSV,
# pre-processing, sequence creation, prediction and writing the output file.
evaluate_LSTM_model(validation_dataset_path, '10768356-Task2-method-b-validation.csv')

# Record the end time
end_time = time.time()

# Calculate and print the elapsed time in seconds
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 15.363977909088135 seconds


Test dataset:

In [13]:
# Record the start time (wall clock, via time.time())
start_time = time.time()

# Run the Bi-LSTM model on the test dataset.
# The timed span covers everything evaluate_LSTM_model does: reading the CSV,
# pre-processing, sequence creation, prediction and writing the output file.
evaluate_LSTM_model(test_dataset_path, '10768356-Task2-method-b.csv')

# Record the end time
end_time = time.time()

# Calculate and print the elapsed time in seconds
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 15.44764232635498 seconds
