In [None]:
#If any other libraries need installing, please add them here

# %pip install sklearn
# %pip install gensim
# %pip install nltk
# %pip install keras

import pandas as pd
import string
import nltk

from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from keras.preprocessing.text import Tokenizer
from keras.layers import TextVectorization
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
import torch
import numpy as np

# Fetch the NLTK resources used by the preprocessing functions in this notebook.
# "punkt_tab" is the tokenizer data that word_tokenize loads on NLTK >= 3.9;
# the older "punkt" package is still fetched so earlier NLTK releases keep working.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
# Load the three splits, selecting columns by position.
# The labelled splits (training, validation) keep column 0 and columns 2-11;
# the test split keeps only columns 0 and 2. Later cells read 'ID',
# 'plot_synopsis' and the label columns from these frames.
df_train = pd.read_csv(
    "./data/Training-dataset.csv",
    usecols=[0, *range(2, 12)],
)
df_validation = pd.read_csv(
    "./data/Task-2-validation-dataset.csv",
    usecols=[0, *range(2, 12)],
)
df_test = pd.read_csv(
    "./data/Task-2-test-dataset1.csv",
    usecols=[0, 2],
)

In [None]:
# Names of the label columns; their order fixes the column order of the
# target matrices and of every prediction file written later.
labels = [
    'comedy',
    'cult',
    'flashback',
    'historical',
    'murder',
    'revenge',
    'romantic',
    'scifi',
    'violence',
]

# One row per document, one column per label.
train_Y = df_train.loc[:, labels].to_numpy()
validation_Y = df_validation.loc[:, labels].to_numpy()

<h1>SVM with Doc2Vec Document Embeddings</h1>

In [None]:
#Function pre-processes documents via normalisation, punctuation removal, tokenization, stop word removal and lemmatization

# Lazily-filled cache so the stop-word set and the lemmatizer are built once
# rather than once per document.
_D2V_RESOURCES = {}


def d2v_preprocess_text(text):
    """Turn one raw document into a list of cleaned tokens.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop NLTK
    English stop words, then lemmatize each remaining token with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example ``None`` or a
        NaN left by a missing CSV cell) is treated as an empty document.

    Returns
    -------
    list of str
        The processed tokens; ``[]`` for a non-string input.
    """
    # A missing synopsis would otherwise fail on ``.lower()``.
    if not isinstance(text, str):
        return []

    if not _D2V_RESOURCES:
        _D2V_RESOURCES['stop_words'] = set(nltk.corpus.stopwords.words('english'))
        _D2V_RESOURCES['lemmatizer'] = nltk.wordnet.WordNetLemmatizer()
    stop_words = _D2V_RESOURCES['stop_words']
    lemma = _D2V_RESOURCES['lemmatizer']

    text = text.lower()
    # Punctuation is removed before tokenizing, so "don't" becomes "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = nltk.word_tokenize(text)

    return [lemma.lemmatize(word) for word in tokens if word not in stop_words]

# Add the tokenized synopsis as a new column on each split (the frames are
# modified in place), then pull the token lists out as plain Python lists.
for split_frame in (df_train, df_validation, df_test):
    split_frame['plot_synopsis_d2v_processed'] = split_frame['plot_synopsis'].apply(d2v_preprocess_text)

train_documents = df_train['plot_synopsis_d2v_processed'].tolist()
validation_documents = df_validation['plot_synopsis_d2v_processed'].tolist()
test_documents = df_test['plot_synopsis_d2v_processed'].tolist()

In [None]:
# Train a Doc2Vec model on the training documents only.
# Each training document is tagged with its position in the list, as a string.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(train_documents)
]

model_doc = Doc2Vec(vector_size=200, window=1, min_count=1, workers=2)
model_doc.build_vocab(tagged_data)
model_doc.train(
    tagged_data,
    total_examples=model_doc.corpus_count,
    epochs=10,
)

# Embed every split with the trained model. The training split is also
# embedded through infer_vector rather than read from the stored vectors.
train_X = np.array(list(map(model_doc.infer_vector, train_documents)))
validation_X = np.array(list(map(model_doc.infer_vector, validation_documents)))
test_X = np.array(list(map(model_doc.infer_vector, test_documents)))

In [None]:
# Multi-label SVM: MultiOutputClassifier wraps the SVC so that one classifier
# is fitted per label column of train_Y.
svm_classifier = SVC(class_weight='balanced')
ova_classifier = MultiOutputClassifier(svm_classifier).fit(train_X, train_Y)

# Predict labels for the validation split first, then for the test split.
val_predicition_Y, test_predicition_Y = (
    ova_classifier.predict(split_X) for split_X in (validation_X, test_X)
)

In [None]:
# Write the SVM predictions to CSV: the ID column followed by one column per
# label, with neither a header row nor the DataFrame index.
df_svm_validation = pd.concat(
    [df_validation[['ID']], pd.DataFrame(val_predicition_Y, columns=labels)],
    axis=1,
)
df_svm_validation.to_csv(
    "10560407-Task2-method-a-validation.csv",
    header=False,
    index=False,
)

df_svm_test = pd.concat(
    [df_test[['ID']], pd.DataFrame(test_predicition_Y, columns=labels)],
    axis=1,
)
df_svm_test.to_csv(
    "10560407-Task2-method-a.csv",
    header=False,
    index=False,
)

<h1>Bi-LSTM</h1>

In [None]:
#Function pre-processes documents via normalisation, tokenisation, punctuation removal, stop word removal and lemmatization

def bilstm_preprocess_text(text):
    """Clean one raw document and return it as a single space-joined string.

    The cleaning steps are the ones performed by ``d2v_preprocess_text``
    (defined in the Doc2Vec section of this notebook): lower-casing,
    punctuation removal, tokenization, stop-word removal and lemmatization.
    This function delegates to it instead of repeating the same code, then
    joins the tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example ``None`` or a
        NaN left by a missing CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The processed tokens joined by spaces; ``''`` for a non-string input.
    """
    # A missing synopsis would otherwise fail on ``.lower()``.
    if not isinstance(text, str):
        return ''

    return ' '.join(d2v_preprocess_text(text))

# Add the cleaned, space-joined synopsis as a new column on each split
# (the frames are modified in place).
for split_frame in (df_train, df_validation, df_test):
    split_frame['plot_synopsis_bilstm_processed'] = split_frame['plot_synopsis'].apply(bilstm_preprocess_text)

In [None]:
# Convert the cleaned documents into fixed-length integer sequences.

max_words = 120000          # vocabulary cap passed to TextVectorization
max_sequence_length = 400   # every document is output at this length

vectorizer = TextVectorization(
    max_tokens=max_words,
    output_mode='int',
    output_sequence_length=max_sequence_length,
)
# The vocabulary is learned from the training split only.
vectorizer.adapt(df_train['plot_synopsis_bilstm_processed'].values)

train_X_bilstm, validation_X_bilstm, test_X_bilstm = (
    vectorizer(split_frame['plot_synopsis_bilstm_processed'].values)
    for split_frame in (df_train, df_validation, df_test)
)

In [None]:
# Build the Bi-LSTM model.

# Per-label weights that grow as a label gets rarer in the training data:
#   weight_i = total_samples / (number_of_labels * positives_for_label_i)
# NOTE(review): Keras keys class_weight by class index; confirm how it is
# applied to multi-hot targets like train_Y before relying on these weights.
class_counts = train_Y.sum(axis=0)
total_samples = train_Y.shape[0]
class_weights = total_samples / (len(class_counts) * class_counts)
class_weight_dict = dict(enumerate(class_weights))

embedding_dim = 500
lstm_units = 64

model = Sequential([
    # mask_zero=True makes the downstream layers treat index 0 as padding.
    Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
        mask_zero=True,
    ),
    # The first recurrent layer returns the full sequence for the second one.
    Bidirectional(LSTM(units=lstm_units, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(units=lstm_units)),
    Dropout(0.2),
    BatchNormalization(),
    # One sigmoid unit per label, so each label gets its own probability.
    Dense(train_Y.shape[1], activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the model.

epochs = 5
batch_size = 64

# Stop when validation accuracy has not improved for 2 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True,
)

model.fit(
    x=train_X_bilstm,
    y=train_Y,
    validation_data=(validation_X_bilstm, validation_Y),
    batch_size=batch_size,
    epochs=epochs,
    class_weight=class_weight_dict,
    callbacks=[early_stopping],
)

In [None]:
# The network outputs one probability per label; a label is assigned when its
# probability is strictly greater than the threshold.
threshold = 0.12

# Predict for the validation split first, then for the test split.
val_predicition_Y_bilstm, test_predicition_Y_bilstm = (
    (model.predict(split_X) > threshold).astype(int)
    for split_X in (validation_X_bilstm, test_X_bilstm)
)

In [None]:
# Write the Bi-LSTM predictions to CSV: the ID column followed by one column
# per label, with neither a header row nor the DataFrame index.
df_bilstm_validation = pd.concat(
    [df_validation[['ID']], pd.DataFrame(val_predicition_Y_bilstm, columns=labels)],
    axis=1,
)
df_bilstm_validation.to_csv(
    "10560407-Task2-method-b-validation.csv",
    header=False,
    index=False,
)

df_bilstm_test = pd.concat(
    [df_test[['ID']], pd.DataFrame(test_predicition_Y_bilstm, columns=labels)],
    axis=1,
)
df_bilstm_test.to_csv(
    "10560407-Task2-method-b.csv",
    header=False,
    index=False,
)