In [65]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the data paths configured later in this notebook are relative ('./data/...'),
# not under /content/drive — confirm the working directory makes them resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
import nltk,re,time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC,SVC
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline

import csv
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM,Dense, Dropout

# Fetch the NLTK "stopwords" and "punkt" resources.
# NOTE(review): no code visible in this notebook uses either resource — confirm the downloads are still needed.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [67]:
# Custom recall, precision and F1 metrics, used only to monitor the Bi-LSTM during its training epochs
from keras import backend as K

def recall_m(y_true, y_pred):
    '''Recall over the given tensors: rounded true positives divided by rounded actual positives.

    K.epsilon() is added to the denominator so a batch with no positives does not divide by zero.
    '''
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    '''Precision over the given tensors: rounded true positives divided by rounded predicted positives.

    K.epsilon() is added to the denominator so a batch with no positive predictions does not divide by zero.
    '''
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_positive = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged_positive) + K.epsilon())

def f1_m(y_true, y_pred):
    '''F1 score: harmonic mean of precision_m and recall_m, with K.epsilon() guarding the division.'''
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_ratio = (p*r)/(p+r+K.epsilon())
    return 2*(harmonic_ratio)

In [68]:
# Configuration constants shared by the SVM and Bi-LSTM experiments

# Label columns to predict. The order matters: it fixes the column order of the label
# matrix fed to the Bi-LSTM, and the per-class thresholds in BiLSTM.predict are positional.
OUTPUT_TARGETS = ['comedy', 'cult', 'flashback','historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']
# Passed to the Keras Tokenizer as num_words and to the Embedding layer as input_dim
VOCAB_SIZE = 5000
# Number of epochs passed to model.fit for the Bi-LSTM
EPOCHS = 7
# Batch size passed to model.fit for the Bi-LSTM
BATCH_SIZE = 32
# Passed to the Embedding layer as output_dim
EMBEDDING_DIM = 200

# Input data: labelled training and validation csv files, and the test csv file (read without label columns)
TRAINING_PATH = './data/Training-dataset.csv'
VALID_DATA_PATH = './data/Task-2-validation-dataset.csv'
TEST_DATA_PATH = './data/Task-2-test-dataset2.csv'

# Output data: prediction files written by the SVM experiment (method a)
TASK_A_VALID_OUTPUT_PATH = './data/10879475-Task2-method-a-validation.csv'
TASK_A_TEST_OUTPUT_PATH = './data/10879475-Task2-method-a.csv'

# Output data: prediction files written by the Bi-LSTM experiment (method b)
TASK_B_VALID_OUTPUT_PATH = './data/10879475-Task2-method-b-validation.csv'
TASK_B_TEST_OUTPUT_PATH = './data/10879475-Task2-method-b.csv'

In [69]:
#Create SVM class
class SVM:
  '''Multi-label text classifier using TF-IDF features and one linear SVM per target label.

  Attributes:
    training_corpus, validation_corpus, testing_corpus: the corpora used by the experiment.
      They start as empty lists and are expected to be replaced by the DataFrames returned
      from read_and_format_input_file.
  '''

  #Label columns kept (in this order) when a labelled csv file is read
  LABEL_COLUMNS = ['comedy', 'cult', 'flashback', 'historical', 'murder',
                   'revenge', 'romantic', 'scifi', 'violence']

  #Ordered (pattern, replacement) rules for common apostrophe phrases.
  #Order matters: a specific rule such as 'scuse must run before the generic 's rule,
  #otherwise the generic rule consumes the apostrophe and the specific one can never match.
  CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
  )

  def __init__(self):
    '''SVM class initialiser. Creates 3 attributes (training, validation and testing corpus) that are empty to start'''
    self.training_corpus = []
    self.validation_corpus = []
    self.testing_corpus = []

  def read_and_format_input_file(self, path,test=False):
    '''Reads the csv file at {path} and joins its title and plot_synopsis columns into a single
    Combined_TitlePlot column ("<title>. <plot_synopsis>").

    path: location of the csv file; it must contain ID, title and plot_synopsis columns, plus the
          LABEL_COLUMNS when test is False.
    test: when True the file is treated as unlabelled and only ID and Combined_TitlePlot are returned.

    Returns a new DataFrame with columns ID, Combined_TitlePlot and (unless test) the label columns.
    A missing title or synopsis is treated as empty text so that later text cleaning never receives NaN.'''
    #Force the two text columns to be read as strings so purely numeric titles still concatenate
    corpus = pd.read_csv(path, dtype={'title': str, 'plot_synopsis': str})
    title = corpus['title'].fillna('')
    synopsis = corpus['plot_synopsis'].fillna('')
    corpus['Combined_TitlePlot'] = title + ". " + synopsis
    #Keep only the columns the experiment uses, in a fixed order
    columns = ['ID', 'Combined_TitlePlot']
    if not test:
      columns = columns + self.LABEL_COLUMNS
    #Copy so that callers can assign to columns of the result without touching the frame read above
    return corpus[columns].copy()

  def clean_document(self,document):
    '''Cleans the text in the input document and returns the cleaned string.
    Lower-cases the text, expands common apostrophe phrases, replaces every non-word character
    with a space, collapses runs of whitespace and trims leading/trailing spaces.
    https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5'''
    #Normalise document to lower case
    document = document.lower()
    #Deal with common apostrophe phrases
    for pattern, replacement in self.CONTRACTION_RULES:
      document = re.sub(pattern, replacement, document)
    #Replace any non-word character (anything outside [a-zA-Z0-9_]) with a space
    document = re.sub(r'\W', ' ', document)
    #Replace sequences of more than a single space with a single space
    document = re.sub(r'\s+', ' ', document)
    #Remove leading or trailing spaces from the document text
    return document.strip(' ')

  def classifer_train_and_predict(self,targets,X_train,X_valid,X_test):
    '''https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5
    Uses tfidf to transform the textual documents into numerical feature vectors, then trains a
    separate OneVsRestClassifier(LinearSVC) for each target label.

    targets: label names; each must be a column of self.training_corpus.
    X_train, X_valid, X_test: the cleaned documents of the training, validation and testing corpus.

    Returns (validation_results, testing_results): two dictionaries mapping each target label to an
    array with one prediction per document (a value of 0 at index 2 means the 3rd document is not
    given that label).'''
    #The tfidf features do not depend on the label, so fit and apply the vectoriser once
    #instead of once per target
    vectoriser = TfidfVectorizer()
    train_features = vectoriser.fit_transform(X_train)
    valid_features = vectoriser.transform(X_valid)
    test_features = vectoriser.transform(X_test)

    validation_results = {}
    testing_results = {}
    for target in targets:
      #Train a fresh classifier for the current label
      classifier = OneVsRestClassifier(LinearSVC(), n_jobs=1)
      classifier.fit(train_features,self.training_corpus[target])

      #Predict the current label for every document of the validation and testing datasets
      validation_results[target] = classifier.predict(valid_features)
      testing_results[target] = classifier.predict(test_features)

    return validation_results,testing_results

  def output_results_to_file(self,results,output_path,ids):
    '''Output the results to a csv file (no header row, no index) such that the format is:
    column1 = documentID, following columns = 1 or 0 representing whether a label applies to the document or not.

    results: dictionary mapping label name to an array of predictions; columns are written in the
             dictionary's insertion order.
    output_path: where the csv file is written.
    ids: document ids, one per prediction row.'''
    output_df = pd.DataFrame({
      'document_id': ids,
      **results  # Unpack the results dictionary into columns
    })
    output_df.to_csv(output_path, index=False, header=False)



In [70]:
class BiLSTM:
  '''Multi-label text classifier built on a two-layer bidirectional LSTM with a sigmoid output layer.

  Attributes:
    training_corpus, validation_corpus, testing_corpus: the corpora used by the experiment.
      They start as empty lists and are expected to be replaced by the DataFrames returned
      from read_and_format_input_file.
  '''

  #Label columns kept (in this order) when a labelled csv file is read
  LABEL_COLUMNS = ['comedy', 'cult', 'flashback', 'historical', 'murder',
                   'revenge', 'romantic', 'scifi', 'violence']

  #Ordered (pattern, replacement) rules for common apostrophe phrases.
  #Order matters: a specific rule such as 'scuse must run before the generic 's rule,
  #otherwise the generic rule consumes the apostrophe and the specific one can never match.
  CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
  )

  #Default per-column decision thresholds used by predict. Every column uses 0.5 except index 3,
  #which is 'historical' when the labels follow LABEL_COLUMNS order.
  #NOTE(review): the reason for the lower 0.2 cut-off is not recorded here — presumably tuned; confirm.
  DEFAULT_THRESHOLDS = (0.5, 0.5, 0.5, 0.2, 0.5, 0.5, 0.5, 0.5, 0.5)

  def __init__(self):
    '''Initialise the object with 3 empty fields for the training,validation and testing corpus'''
    self.training_corpus = []
    self.validation_corpus = []
    self.testing_corpus = []

  def read_and_format_input_file(self, path,test=False):
    '''Reads the csv file at {path} and joins its title and plot_synopsis columns into a single
    Combined_TitlePlot column ("<title>. <plot_synopsis>").

    path: location of the csv file; it must contain ID, title and plot_synopsis columns, plus the
          LABEL_COLUMNS when test is False.
    test: when True the file is treated as unlabelled and only ID and Combined_TitlePlot are returned.

    Returns a new DataFrame with columns ID, Combined_TitlePlot and (unless test) the label columns.
    A missing title or synopsis is treated as empty text so that later text cleaning never receives NaN.'''
    #Force the two text columns to be read as strings so purely numeric titles still concatenate
    corpus = pd.read_csv(path, dtype={'title': str, 'plot_synopsis': str})
    title = corpus['title'].fillna('')
    synopsis = corpus['plot_synopsis'].fillna('')
    corpus['Combined_TitlePlot'] = title + ". " + synopsis
    #Keep only the columns the experiment uses, in a fixed order
    columns = ['ID', 'Combined_TitlePlot']
    if not test:
      columns = columns + self.LABEL_COLUMNS
    #Copy so that callers can assign to columns of the result without touching the frame read above
    return corpus[columns].copy()

  def clean_document(self,document):
    '''Cleans the text in the input document and returns the cleaned string.
    Lower-cases the text, expands common apostrophe phrases, replaces every non-word character
    with a space, collapses runs of whitespace and trims leading/trailing spaces.
    https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5'''
    #Normalise document to lower case
    document = document.lower()
    #Deal with common apostrophe phrases
    for pattern, replacement in self.CONTRACTION_RULES:
      document = re.sub(pattern, replacement, document)
    #Replace any non-word character (anything outside [a-zA-Z0-9_]) with a space
    document = re.sub(r'\W', ' ', document)
    #Replace sequences of more than a single space with a single space
    document = re.sub(r'\s+', ' ', document)
    #Remove leading or trailing spaces from the document text
    return document.strip(' ')

  def tokenise_and_padd(self,VOCAB_SIZE,MAX_SEQ_LEN=500):
    '''Fits a word tokenizer on the training corpus, converts the documents of the training,
    validation and testing corpus into integer sequences, and pads/truncates every sequence
    (at the end) to MAX_SEQ_LEN tokens.

    VOCAB_SIZE: passed to the Tokenizer as num_words; other words map to the "<OOV>" token.
    MAX_SEQ_LEN: length every padded sequence is forced to (defaults to 500).

    Returns, in this order: MAX_SEQ_LEN, the unpadded training/validation/testing sequences,
    the padded training/validation/testing sequences, the training labels and the validation
    labels (label columns are taken in OUTPUT_TARGETS order).'''
    texts_train = self.training_corpus['Combined_TitlePlot'].values
    texts_valid = self.validation_corpus['Combined_TitlePlot'].values
    texts_test = self.testing_corpus['Combined_TitlePlot'].values

    #The vocabulary is learnt from the training corpus only
    tokenizer = Tokenizer(num_words=VOCAB_SIZE,oov_token="<OOV>")
    tokenizer.fit_on_texts(texts_train)

    #Convert text tokens into sequences
    X_train = tokenizer.texts_to_sequences(texts_train)
    X_valid = tokenizer.texts_to_sequences(texts_valid)
    X_test = tokenizer.texts_to_sequences(texts_test)

    #Pad sequences
    X_train_padded = pad_sequences(X_train, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
    X_valid_padded = pad_sequences(X_valid, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
    X_test_padded = pad_sequences(X_test, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

    train_labels = self.training_corpus[OUTPUT_TARGETS].values
    validation_labels = self.validation_corpus[OUTPUT_TARGETS].values

    return MAX_SEQ_LEN,X_train,X_valid,X_test, X_train_padded, X_valid_padded,X_test_padded,train_labels,validation_labels

  def build_and_train_model(self,VOCAB_SIZE,MAX_SEQ_LEN,EPOCHS,BATCH_SIZE,EMBEDDING_DIM,X_train_padded,train_labels):
    '''Builds a Bi-LSTM model (embedding, two bidirectional LSTM layers of 64 units each followed by
    0.2 dropout, sigmoid output layer), trains it on the padded training sequences and returns the
    trained model.

    train_labels: 2D label matrix (documents x labels); its width sets the number of output units.'''
    #One sigmoid output per label column, so the model follows the label matrix instead of a fixed 9
    number_of_labels = np.shape(train_labels)[1]

    # Build the Bi-LSTM model
    model = Sequential()
    model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQ_LEN))
    model.add(Bidirectional(LSTM(64,return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.2))
    model.add(Dense(number_of_labels, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

    model.fit(X_train_padded, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)
    return model

  def predict(self,trained_model ,data_to_predict_for,thresholds=None):
    '''Makes predictions on the input data and converts the sigmoid probabilities into binary labels.

    trained_model: object whose predict(data) returns a (documents x labels) array of probabilities.
    data_to_predict_for: input passed unchanged to trained_model.predict.
    thresholds: one cut-off per label column (or a single value for all columns); a label is assigned
                when its probability is strictly greater than the cut-off. Defaults to DEFAULT_THRESHOLDS.

    Returns an integer array of 0/1 predictions with the same shape as the probabilities.'''
    if thresholds is None:
      thresholds = self.DEFAULT_THRESHOLDS
    prediction = trained_model.predict(data_to_predict_for)
    y_pred_binary = (prediction > np.asarray(thresholds)).astype(int)
    return y_pred_binary

  def write_results_to_file(self,binary_predictions,output_path,document_ids):
    '''Outputs predictions to a new csv file (no header row, no index): the first column is the
    document id and the following columns are the 0/1 predictions for each label.'''
    #Build the frame positionally so ids and prediction rows always line up row by row
    df = pd.DataFrame(np.asarray(binary_predictions))
    df.insert(0, 'Document ID', np.asarray(document_ids))

    # Write the DataFrame to a CSV file without headers
    df.to_csv(output_path, index=False, header=False)

In [71]:
def Svm_experiment():
  '''Runs the SVM experiment end to end: reads and cleans the three corpora, trains one classifier
  per label in OUTPUT_TARGETS and writes the validation and testing predictions to the method-a files.'''
  clf = SVM()

  #Load the corpora; only the testing file is read without label columns
  clf.training_corpus = clf.read_and_format_input_file(TRAINING_PATH)
  clf.validation_corpus = clf.read_and_format_input_file(VALID_DATA_PATH)
  clf.testing_corpus = clf.read_and_format_input_file(TEST_DATA_PATH,test=True)

  #Clean the combined title/plot text of every corpus
  for corpus in (clf.training_corpus, clf.validation_corpus, clf.testing_corpus):
    corpus['Combined_TitlePlot'] = corpus['Combined_TitlePlot'].map(clf.clean_document)

  #Train on the training text and predict every label for the validation and testing text
  valid_predictions, test_predictions = clf.classifer_train_and_predict(
    OUTPUT_TARGETS,
    clf.training_corpus['Combined_TitlePlot'],
    clf.validation_corpus['Combined_TitlePlot'],
    clf.testing_corpus['Combined_TitlePlot'],
  )

  #Write the validation predictions first, then the testing predictions
  clf.output_results_to_file(valid_predictions, TASK_A_VALID_OUTPUT_PATH, clf.validation_corpus['ID'].values)
  clf.output_results_to_file(test_predictions, TASK_A_TEST_OUTPUT_PATH, clf.testing_corpus['ID'].values)

def bi_lstm_experiment():
  '''Runs the Bi-LSTM experiment end to end: reads and cleans the three corpora, tokenises and pads
  them, trains the model and writes the validation and testing predictions to the method-b files.'''
  lstm = BiLSTM()

  #Load the corpora; only the testing file is read without label columns
  lstm.training_corpus = lstm.read_and_format_input_file(TRAINING_PATH)
  lstm.validation_corpus = lstm.read_and_format_input_file(VALID_DATA_PATH)
  lstm.testing_corpus = lstm.read_and_format_input_file(TEST_DATA_PATH,test=True)

  #Clean the combined title/plot text of every corpus
  for corpus in (lstm.training_corpus, lstm.validation_corpus, lstm.testing_corpus):
    corpus['Combined_TitlePlot'] = corpus['Combined_TitlePlot'].map(lstm.clean_document)

  #Only the sequence length, the padded sequences and the training labels are needed below
  max_len, _, _, _, train_padded, valid_padded, test_padded, train_y, _ = lstm.tokenise_and_padd(VOCAB_SIZE)

  fitted = lstm.build_and_train_model(VOCAB_SIZE,max_len,EPOCHS,BATCH_SIZE,EMBEDDING_DIM,train_padded,train_y)

  #Predict and write the validation dataset first, then the testing dataset
  jobs = (
    (valid_padded, lstm.validation_corpus, TASK_B_VALID_OUTPUT_PATH),
    (test_padded, lstm.testing_corpus, TASK_B_TEST_OUTPUT_PATH),
  )
  for padded, corpus, destination in jobs:
    binary_labels = lstm.predict(fitted,padded)
    lstm.write_results_to_file(binary_labels,destination,corpus['ID'].values)

In [None]:
def time_taken_formatting(elapsed_time):
  '''Prints an elapsed duration as "Time taken - HH hrs MM mins SS secs mmm ms".

  elapsed_time: duration in seconds (int or float).

  The duration is rounded to whole milliseconds before it is split into fields, so a fractional
  part such as 0.9996 s carries into the seconds field instead of being printed as "1000 ms".'''
  total_ms = int(round(elapsed_time * 1000))
  total_secs, ms = divmod(total_ms, 1000)
  total_mins, secs = divmod(total_secs, 60)
  hrs, mins = divmod(total_mins, 60)
  formatted_time_printing = "{:02} hrs {:02} mins {:02} secs {:3} ms".format(hrs, mins, secs, ms)
  print(f"Time taken - {formatted_time_printing}")

def main():
  '''Runs the SVM experiment followed by the Bi-LSTM experiment.'''
  for run_experiment in (Svm_experiment, bi_lstm_experiment):
    run_experiment()

# Run both experiments when this cell is executed
main()