In [14]:
# Colab-only setup: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
#Import necessary libraries
import nltk,string,csv,re,time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.phrases import Phrases
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on:
#   punkt     - tokenizer models used by word_tokenize
#   wordnet   - dictionary backing WordNetLemmatizer
#   stopwords - stop-word lists read by stopwords.words('english')
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
#Constants for both BoW and Word2Vec

# English stop words as a set so that membership tests are cheap.
# Every entry is lower-case, so tokens should be lower-cased before being compared against it.
STOP_WORDS = set(stopwords.words('english'))

# Input datasets. All paths are relative to the notebook's current working directory.
TRAINING_PATH = './data/Training-dataset.csv'
VALID_DATA_PATH = './data/Task-1-validation-dataset.csv'
TEST_DATA_PATH = './data/Task-1-test-dataset2.csv'

# Output files for method (a): the BoW / TF-IDF experiment.
TASK_A_VALID_OUTPUT_PATH = './data/10879475-Task1-method-a-validation.csv'
TASK_A_TEST_OUTPUT_PATH = './data/10879475-Task1-method-a.csv'

# Output files for method (b): the Word2Vec experiment.
TASK_B_VALID_OUTPUT_PATH = './data/10879475-Task1-method-b-validation.csv'
TASK_B_TEST_OUTPUT_PATH = './data/10879475-Task1-method-b.csv'

# Single shared lemmatizer instance used by both classes below.
LEMMATIZER = WordNetLemmatizer()


class BoWTFIDF:
  '''Bag-of-words / TF-IDF term similarity.

  The class pre-processes a corpus of documents on construction. A TF-IDF vectorizer is fitted
  on `processed_corpus` by the caller, and the resulting (documents x terms) matrix is then used
  to compare two terms through the cosine similarity of their columns.
  '''

  def __init__(self,training_corpus):
    '''Pre-process every document of the training corpus and keep the result.

    training_corpus: an object exposing `.apply` (e.g. a pandas Series) holding one raw
    document string per entry. The cleaned documents are stored in `self.processed_corpus`.
    '''
    self.processed_corpus = training_corpus.apply(self.__pre_process_corpus__)

  def __pre_process_corpus__(self,plot_document):
    '''Clean a single document and return it as one space-separated string.

    The document is tokenised with NLTK's word_tokenize, every token is lower-cased, stop words
    and punctuation are dropped, and the remaining tokens are lemmatized.
    '''
    #Lower-case first: STOP_WORDS only holds lower-case entries, so filtering before
    #lower-casing would let capitalised stop words such as "The" slip through
    tokens = (token.lower() for token in word_tokenize(plot_document))

    #Remove stop words and punctuation
    tokens = [token for token in tokens if token not in STOP_WORDS and token not in string.punctuation]

    #Lemmatize what is left and join it back into a single string
    return ' '.join(LEMMATIZER.lemmatize(token) for token in tokens)

  def calcualte_similarity(self,word1, word2,tfidf,tfidf_matrix):
    '''Return the cosine similarity between the TF-IDF columns of two terms.

    tfidf: fitted vectorizer whose `vocabulary_` maps a term to its column index.
    tfidf_matrix: the (documents x terms) matrix produced by that vectorizer.

    If either term is out of vocabulary the similarity is 0.0. A sentinel index of -1 must not
    be used for this: -1 is a valid index that selects the LAST column of the matrix, so OOV
    terms would silently be scored as if they were the last vocabulary entry.
    '''
    word1_ind = tfidf.vocabulary_.get(word1)
    word2_ind = tfidf.vocabulary_.get(word2)
    if word1_ind is None or word2_ind is None:
      return 0.0
    #Each column is transposed into a 1 x documents row vector, as cosine_similarity compares rows
    cosine_sim = cosine_similarity(tfidf_matrix[:,word1_ind].T,tfidf_matrix[:,word2_ind].T)
    return cosine_sim[0][0]

  def _normalise_term(self,term):
    '''Bring a test term into the same form as the processed corpus.

    The term is lower-cased, split on whitespace, lemmatized word by word and re-joined with
    single spaces, so a multi-word term can match the space-separated n-gram keys of the vocabulary.
    '''
    return ' '.join(LEMMATIZER.lemmatize(token) for token in term.lower().split())

  def output_results_to_file(self,out_path,input_data_path,tf_idf,tf_idf_matrix):
    '''Score every term pair of an input CSV and write `id,similarity` rows to a new CSV.

    Each input row is read as (id, term1, term2). Blank rows are skipped.
    The input file is opened first so that a missing input does not leave behind a truncated output file.
    '''
    with open(input_data_path,newline='') as input_csv, open(out_path,'w',newline='') as output_csv:
      csv_writer = csv.writer(output_csv)
      for row in csv.reader(input_csv):
        if not row:
          continue
        row_id = row[0]
        term1 = self._normalise_term(row[1])
        term2 = self._normalise_term(row[2])
        similarity = self.calcualte_similarity(term1,term2,tf_idf,tf_idf_matrix)
        csv_writer.writerow([row_id,similarity])

#Create class for Word2Vec implementation using skip gram
class Word2VecSG:
  '''Word2Vec (skip-gram by default) term similarity.

  The class cleans and tokenises a training corpus on construction, trains a gensim Word2Vec
  model on the tokens and compares two terms through the cosine similarity of their vectors.
  '''

  def __init__(self,training_corpus):
    '''Clean and tokenise the training corpus and keep the tokens.

    training_corpus: a table with a 'plot_synopsis' column of raw document strings
    (e.g. a pandas DataFrame). It is not modified. The token lists, with detected bigrams
    merged, are stored in `self.processed_tokens`.
    '''
    self.processed_tokens = self.__tokenisation_and_pre_processing__(training_corpus)

  def clean_document(self,document):
    '''Normalise the text of one document and return the cleaned string.

    The text is lower-cased, common apostrophe contractions are expanded, every non-word
    character becomes a space, runs of whitespace are collapsed and the ends are stripped.
    Adapted from
    https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5
    '''
    #Normalise document to lower case
    document = document.lower()
    #Deal with common apostrophe phrases.
    #The specific phrases ("what's", "'scuse", "can't") must run before the generic suffix
    #rules, otherwise the generic rule consumes part of them first.
    #The trailing \b on the suffix rules stops them from firing on an opening quote that is
    #merely followed by the same letter (e.g. 'some' must not lose its "s").
    document = re.sub(r"what's", "what is ", document)
    document = re.sub(r"'scuse", " excuse ", document)
    document = re.sub(r"'s\b", " ", document)
    document = re.sub(r"'ve\b", " have ", document)
    document = re.sub(r"can't", "can not ", document)
    document = re.sub(r"n't", " not ", document)
    document = re.sub(r"i'm", "i am ", document)
    document = re.sub(r"'re\b", " are ", document)
    document = re.sub(r"'d\b", " would ", document)
    document = re.sub(r"'ll\b", " will ", document)
    #Replace any non-word character (anything outside [a-zA-Z0-9_] and unicode word characters) with a space
    document = re.sub(r'\W', ' ', document)
    #Replace sequences of whitespace with a single space
    document = re.sub(r'\s+', ' ', document)
    #Remove leading or trailing spaces from the document text
    return document.strip(' ')

  def __tokenisation_and_pre_processing__(self,training_corpus):
    '''Turn the 'plot_synopsis' column into a list of token lists with bigrams merged.

    Each document is cleaned with clean_document and tokenised; stop words are removed and the
    remaining tokens lemmatized. A gensim Phrases model is then fitted on the tokens and applied
    to them so that detected bigrams become single tokens.
    The cleaned text is kept in a local variable: the caller's table is left untouched.
    '''
    cleaned_documents = training_corpus['plot_synopsis'].map(self.clean_document)
    #Tokenize each document, remove stopwords and lemmatize
    tokens = [[LEMMATIZER.lemmatize(token) for token in word_tokenize(doc) if token not in STOP_WORDS] for doc in cleaned_documents]
    #Create bigrams from tokens
    potential_bigrams = Phrases(tokens)
    return list(potential_bigrams[tokens])

  def train(self,model,vector_size,window_size,min_count,workers=4,epochs=2,sg=1):
    '''Train a gensim Word2Vec model on `self.processed_tokens` and return it.

    model: ignored. It is kept only so that existing call sites keep working; a fresh model is always built.
    vector_size, window_size, min_count, workers, epochs, sg: forwarded to Word2Vec
    (window_size is passed as `window`). sg=1 selects skip-gram.
    '''
    #Author's note: 2 epochs is the default here because it yielded the best results
    return Word2Vec(sentences=self.processed_tokens,vector_size=vector_size,window=window_size,sg=sg,min_count=min_count,epochs=epochs,workers=workers)

  def calcualte_similarity(self,word2vec_model,word1, word2):
    '''Return the cosine similarity between the model vectors of two terms.

    If either term is out of vocabulary the similarity is 0.0. That is the value cosine
    similarity gives against an all-zero vector, and returning it directly avoids having to
    hard-code the model's vector size for a placeholder vector.
    '''
    word_vectors = word2vec_model.wv
    if word1 not in word_vectors or word2 not in word_vectors:
      return 0.0
    return cosine_similarity([word_vectors[word1]],[word_vectors[word2]])[0][0]

  def process_test_terms(self,term1,term2):
    '''Bring two test terms into the same form as the training tokens and return them as a pair.

    Each term is cleaned with clean_document, tokenised and lemmatized. The tokens of a
    multi-word term are joined with an underscore, as this is how merged bigrams are stored in
    the training tokens. A term that cleans down to nothing becomes the empty string.
    '''
    def normalise(term):
      tokens = [LEMMATIZER.lemmatize(token) for token in word_tokenize(self.clean_document(term))]
      #join covers every case: no tokens -> '', one token -> that token, several -> underscore-joined
      return '_'.join(tokens)

    return normalise(term1),normalise(term2)

  def output_results_to_file(self,word2vec_model,out_path,input_data_path):
    '''Score every term pair of an input CSV and write `id,similarity` rows to a new CSV.

    Each input row is read as (id, term1, term2). Blank rows are skipped.
    The input file is opened first so that a missing input does not leave behind a truncated output file.
    '''
    with open(input_data_path,newline='') as input_csv, open(out_path,'w',newline='') as output_csv:
      csv_writer = csv.writer(output_csv)
      for row in csv.reader(input_csv):
        if not row:
          continue
        row_id = row[0]
        #Process terms
        processed_term1,processed_term2 = self.process_test_terms(row[1],row[2])
        similarity = self.calcualte_similarity(word2vec_model,processed_term1,processed_term2)
        csv_writer.writerow([row_id,similarity])


In [17]:
def BoW_experiment():
  '''Run the BoW / TF-IDF experiment end to end and print how long it took.

  Reads the 'plot_synopsis' column of the training CSV, pre-processes it through BoWTFIDF,
  fits a TF-IDF vectorizer over unigrams and bigrams, and writes similarity scores for the
  validation and test term pairs to their method (a) output files.
  '''
  #perf_counter is used for the duration as, unlike time.time(), it is not affected by system clock adjustments
  start_time = time.perf_counter()

  #Extract the data relevant for the BoW implementation
  bow_training_corpus = pd.read_csv(TRAINING_PATH)['plot_synopsis']

  #Instantiate BoW with TFIDF class. The training corpus passed in is processed automatically
  BoW = BoWTFIDF(bow_training_corpus)

  #ngram_range=(1,2) introduces bigrams alongside single words
  tfidf = TfidfVectorizer(ngram_range=(1,2))

  #Get the matrix representation for the words in the corpus
  tfidf_matrix = tfidf.fit_transform(BoW.processed_corpus)

  #Score the validation pairs first, then the test pairs, each into its own output file
  for out_path,data_path in ((TASK_A_VALID_OUTPUT_PATH,VALID_DATA_PATH),(TASK_A_TEST_OUTPUT_PATH,TEST_DATA_PATH)):
    BoW.output_results_to_file(out_path,data_path,tfidf,tfidf_matrix)

  #Determine and print out time taken for BoW experiment from start to finish
  elapsed_time = time.perf_counter() - start_time
  time_taken_formatting(elapsed_time,"BoW")

def Word2Vec_experiment():
  '''Run the Word2Vec experiment end to end and print how long it took.

  Reads the training CSV, cleans and tokenises it through Word2VecSG, trains a model with
  300-dimensional vectors, a window of 5 and min_count of 1, and writes similarity scores for
  the validation and test term pairs to their method (b) output files.
  '''
  #perf_counter is used for the duration as, unlike time.time(), it is not affected by system clock adjustments
  start_time = time.perf_counter()

  #Read in the training corpus
  word2vec_training_corpus = pd.read_csv(TRAINING_PATH)

  #Instantiating the class cleans and tokenises each document; the result is kept on the object as processed_tokens
  word2vec = Word2VecSG(word2vec_training_corpus)

  #Train the model. No epochs argument is given, so train() falls back to its own default of 2.
  #The first positional argument fills train()'s `model` parameter, which train() overwrites before use.
  word2vec_sg_model = word2vec.train(word2vec,vector_size=300,window_size=5,min_count=1)

  #Score the validation pairs first, then the test pairs, each into its own output file
  for out_path,data_path in ((TASK_B_VALID_OUTPUT_PATH,VALID_DATA_PATH),(TASK_B_TEST_OUTPUT_PATH,TEST_DATA_PATH)):
    word2vec.output_results_to_file(word2vec_sg_model,out_path,data_path)

  #Determine time taken for Word2Vec experiment from start to finish
  elapsed_time = time.perf_counter() - start_time
  time_taken_formatting(elapsed_time,"Word2Vec")

def time_taken_formatting(elapsed_time,exp_name):
  '''Print an elapsed duration as hours, minutes, seconds and milliseconds.

  elapsed_time: duration in seconds (may be fractional).
  exp_name: experiment name inserted into the printed message.

  The duration is rounded to whole milliseconds BEFORE it is split into units. Rounding only
  the fractional second afterwards could print "1000 ms" (e.g. for x.9996 s) instead of
  carrying over into the seconds field.
  '''
  total_ms = round(elapsed_time * 1000)
  total_secs, ms = divmod(total_ms,1000)
  total_mins, secs = divmod(total_secs,60)
  hrs, mins = divmod(total_mins,60)
  formatted_time_printing = "{:02} hrs {:02} mins {:02} secs {:3} ms".format(hrs, mins, secs, ms)
  print(f"Time taken for {exp_name} experiment - {formatted_time_printing}")

In [18]:
def main():
  '''Run the BoW / TF-IDF experiment and then the Word2Vec experiment.'''
  for run_experiment in (BoW_experiment, Word2Vec_experiment):
    run_experiment()

In [None]:
# Entry point of the notebook: runs both experiments, which write their result CSVs and print their timings.
main()