# 1. Import Libraries

In [5]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases
import string
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. Create Class & Functions

In [12]:
class DistributionalSemantics:

  def __init__(self) -> None:
    self.method = 0

  def read_data(self, path: str) -> pd.DataFrame:
    """
    Read csv files for use as the data
    :param path: path where data is located
    :return: list of all the data
    """
    files = os.listdir(path)
    corpus = []

    for f in files:
      file_name, file_extension = os.path.splitext(f)
      if file_extension == '.csv':
        df = pd.read_csv(f"{path}/{f}")
      corpus.append(df)

    return pd.concat(corpus)

  def preprocess_data(self, document: list) -> list:
    """
    Applies the following to the data: tokenization, stop words, stemming, punctuation & lowercase
    :param document: list of data to be cleaned
    :return: list of cleaned data
    """

    # Tokenization
    tokenized = nltk.tokenize.word_tokenize(document)

    # Stop words
    stop_words = set(nltk.corpus.stopwords.words("english"))
    cleaned_words = [word for word in tokenized if word not in stop_words]

    # Stemming
    porter_stemmer = nltk.stem.PorterStemmer()
    cleaned_words = [porter_stemmer.stem(word) for word in cleaned_words]

    # Punctuation
    punctuations = set(string.punctuation)
    cleaned_words = [word for word in cleaned_words if word not in punctuations]

    # Lowercase
    cleaned_words = [term.lower() for term in cleaned_words]

    return cleaned_words

  def process_tfidf(self, document: list) -> list:
    """
    Process data to produce a vectorizer and matrix
    :param document: list of data
    :return: a list of the vectorizer and matrix of the document
    """

    # Setup tfidf
    tfidf = TfidfVectorizer(tokenizer=self.preprocess_data, ngram_range=(1,2), stop_words='english')
    tfidf_matrix = tfidf.fit_transform(document['plot_synopsis'])
    print(f'Vocab Size: {len(tfidf.vocabulary_)}')

    return [tfidf, tfidf_matrix]

  def process_word2vec(self, document: list) -> list:
    """
    Process data to produce a vectorizer and matrix
    :param document: list of data
    :return: a list of the vectorizer and matrix of the document
    """

    # Setup word2vec
    tokenized = [self.preprocess_data(doc) for doc in document['plot_synopsis']]
    bigrams = Phrases(tokenized)
    model = Word2Vec(sentences=bigrams[tokenized], vector_size=200, window=5, min_count=1, workers=4)

    # Save and load model if needed
    # model.save("word2vec.model")
    # model = Word2Vec.load('word2vec.model')

    return model


  def calc_similarity(self, vectorizer, matrix, term1: str, term2: str) -> float:
    """
    Process data to produce a matrix
    :param vectorizer: the vectorizer object being used (eg. tfidf)
    :param matrix: the matrix transformed from the vectorizer
    :param term1, term2: terms we are checking the similarity of
    :return: cosine similarity value of both terms
    """

    # Get index of terms
    term1_index = vectorizer.vocabulary_.get(term1, -1)
    term2_index = vectorizer.vocabulary_.get(term2, -1)

    # If both are in vocab, then cosine the two terms
    if term1_index != -1 and term2_index != -1:
      term1_vector = matrix[:, term1_index].transpose()
      term2_vector = matrix[:, term2_index].transpose()

      cos_sim = cosine_similarity(term1_vector, term2_vector)[0][0]
      return cos_sim

    # If not in vocab, no similarity
    return 0

  def results_to_csv(self, processed_data: list, test_data: list) -> pd.DataFrame:
    """
    Output results into a csv, with term_pair_id and similarity
    :param processed_data: a list of the vectorizer and matrix for tfidf
    :param test_data: document that has the terms to compare
    """

    stemmer = nltk.stem.PorterStemmer()
    indices = []
    similarities = []

    # Loop through evaluation data
    if self.method == 0:
      for index, row in test_data.iterrows():
        term1 = ' '.join([stemmer.stem(word) for word in row[1].split()])
        term2 = ' '.join([stemmer.stem(word) for word in row[2].split()])
        cos_sim = self.calc_similarity(processed_data[0], processed_data[1], term1, term2)
        indices.append(row[0])
        similarities.append(cos_sim)

    elif self.method == 1:
      for index, row in test_data.iterrows():
        term1 = ' '.join([stemmer.stem(word) for word in row[1].split()])
        term2 = ' '.join([stemmer.stem(word) for word in row[2].split()])
        if term1 in processed_data.wv and term2 in processed_data.wv:
          cos_sim = processed_data.wv.similarity(term1, term2)
        else:
          cos_sim = 0
        indices.append(row[0])
        similarities.append(cos_sim)

    # Write values to csv (id, sim_val)
    results = pd.DataFrame({
        'term_pair_id': indices,
        'similarity': similarities
    })

    results.to_csv(f'results_{self.method}.csv', header=False, index=False)

    return results

  def train_and_test(self, path: str, train_data_name: str, test_data_name: str, to_print=True) -> None:
    """
    Trains and tests data accordingly depending on the method set
    :param path: path to directory with data
    :param train_data_name: name of the csv for training
    :param test_data_name: name of the csv for testing
    :param to_print: choose whether to print results
    :return: none
    """

    # Read in data
    train_data = pd.read_csv(f'{path}/{train_data_name}')
    test_data = pd.read_csv(f'{path}/{test_data_name}', header=None)

    # 0: tfidf
    if self.method == 0:
      train_data = self.process_tfidf(train_data)
      results = self.results_to_csv(train_data, test_data)

    # 1: Word2Vec
    elif self.method == 1:
      train_data = self.process_word2vec(train_data)
      results = self.results_to_csv(train_data, test_data)

    # Print results
    if to_print:
      print(results)

# 3. TF*IDF

In [13]:
def main() -> None:
  dist = DistributionalSemantics()

  dist.method = 0

  files_path = './data/'
  train_data_name = 'Training-dataset.csv'
  test_data_name = 'Task-1-test-dataset1.csv'

  dist.train_and_test(files_path, train_data_name, test_data_name)

test = main()



Vocab Size: 2280231
     term_pair_id  similarity
0             816    0.090296
1             957    0.094483
2             809    0.155944
3             911    0.100518
4             242    0.023100
..            ...         ...
97            160    0.000000
98             14    0.099998
99             16    0.085868
100          4012    0.001862
101          4013    0.003492

[102 rows x 2 columns]


# 4. Word2Vec

In [14]:
def main() -> None:
  dist = DistributionalSemantics()

  dist.method = 1

  files_path = './data/'
  train_data_name = 'Training-dataset.csv'
  test_data_name = 'Task-1-test-dataset1.csv'

  dist.train_and_test(files_path, train_data_name, test_data_name)

test = main()

     term_pair_id  similarity
0             816    0.707323
1             957    0.633769
2             809    0.680188
3             911    0.552187
4             242    0.599650
..            ...         ...
97            160    0.287538
98             14    0.682318
99             16    0.475540
100          4012    0.000000
101          4013    0.000000

[102 rows x 2 columns]
