# Load Modules and Define Preliminary Functions

In [5]:
import os
import subprocess
import csv
import re
import random
from scipy.sparse import lil_matrix
import pandas as pd
import numpy as np
import scipy

def read_snli():
    """Read the SNLI sentences and the identity labels from the working directory.

    Two files are read from the current directory:
      snli.csv: comma-separated, first row is a header. Column 0 is used as
        the sentence name and column 2 as the sentence text.
      identity_labels.txt: one label per line.

    A sentence is tokenized by replacing every character that is not an ASCII
    letter, digit or whitespace with a space, splitting on whitespace and
    lower-casing each piece.

    Each tuple consists of
    tuple[0]: The sentence name (column 0 of snli.csv)
    tuple[1]: The sentence as a list of lower-cased tokens.

    Returns:
      tuples: A list of tuples in the above format.
      id_labels: A list with the stripped lines of identity_labels.txt.
      vocab: A sorted list of all distinct tokens. Sorting keeps the
          word -> index mapping identical from one run to the next.
    """
    non_token_chars = re.compile(r"[^a-zA-Z0-9\s]")

    tuples = []
    seen_tokens = set()
    # newline="" lets the csv module deal with newlines inside quoted fields.
    with open("snli.csv", newline="", encoding="utf-8") as f:
        csv_reader = csv.reader(f, delimiter=",")
        next(csv_reader, None)  # skip the header row
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to tokenize.
                continue
            sentence_name = row[0]
            sentence = row[2]
            line_tokens = non_token_chars.sub(" ", sentence).lower().split()
            tuples.append((sentence_name, line_tokens))
            seen_tokens.update(line_tokens)

    vocab = sorted(seen_tokens)

    with open("identity_labels.txt", encoding="utf-8") as f:
        id_labels = [line.strip() for line in f]

    return tuples, id_labels, vocab

def get_row_vector(matrix, row_id):
    """Pick a single row out of a 2-dimensional matrix.

    Inputs:
      matrix: a 2-dimensional numpy array
      row_id: integer index of the row to extract

    Returns:
      The row at position row_id, i.e. matrix[row_id, :]
    """
    every_column = slice(None)
    return matrix[row_id, every_column]


def get_column_vector(matrix, col_id):
    """Pick a single column out of a 2-dimensional matrix.

    Inputs:
      matrix: a 2-dimensional numpy array
      col_id: integer index of the column to extract

    Returns:
      The column at position col_id, i.e. matrix[:, col_id]
    """
    every_row = slice(None)
    return matrix[every_row, col_id]

def compute_cosine_similarity(vector1, vector2):
    """Computes the cosine similarity of the two input vectors.

    Inputs:
      vector1: A numpy array holding n values (shape (n,), (n, 1) or (1, n))
      vector2: A numpy array holding n values (shape (n,), (n, 1) or (1, n))

    Returns:
      A scalar similarity value. If either vector is all zeros the cosine is
      undefined and 0.0 is returned instead.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.spatial` is loaded on every SciPy version.
    from scipy.spatial.distance import cosine

    # scipy's cosine() only accepts 1-D input, so flatten column/row vectors.
    flat1 = np.asarray(vector1).ravel()
    flat2 = np.asarray(vector2).ravel()

    # Check for 0 vectors (their norm is 0, which would divide by zero).
    if not np.any(flat1) or not np.any(flat2):
        return 0.0

    # cosine() returns the cosine *distance*; similarity is its complement.
    return 1.0 - cosine(flat1, flat2)

# Read SNLI

In [6]:
# Load the tokenized sentences, the identity labels and the vocabulary once;
# the matrix-building cells below reuse `tuples` and `vocab`.
tuples, id_labels, vocab = read_snli()


# Term-Context Matrix

In [56]:
def create_term_context_matrix(line_tuples, vocab, context_window_size=1):
    """Returns the sparse term-context count matrix for the input lines.

    Inputs:
      line_tuples: A list of tuples, containing the name of the document and
      a tokenized line from that document. Only the token list (element 1)
      is used; the document name may be any value.
      vocab: A list of the tokens in the vocabulary
      context_window_size: How many tokens to the left and to the right of a
      word count as its context.

    Let n = len(vocab).

    Returns:
      tc_matrix: A nxn scipy.sparse.lil_matrix of integer counts where A_ij
          contains the frequency with which word j was found within
          context_window_size to the left or right of word i in any sentence
          in the tuples. A token is never its own context, but a second
          occurrence of the same word inside the window is counted.
          Tokens that are not in vocab are ignored.
    """
    n = len(vocab)
    vocab_index = {word: position for position, word in enumerate(vocab)}

    # Accumulate in a plain dict first; updating a sparse matrix cell by cell
    # for every co-occurrence is far slower than one write per distinct pair.
    pair_counts = {}
    for line in line_tuples:
        token_ids = [vocab_index.get(token) for token in line[1]]
        for position, target_id in enumerate(token_ids):
            if target_id is None:
                continue
            # Only the left edge needs clamping: a negative slice start would
            # wrap around, whereas a slice end past the list is cut off.
            window_start = max(0, position - context_window_size)
            window_end = position + context_window_size + 1
            window = token_ids[window_start:window_end]
            for offset, context_id in enumerate(window):
                if context_id is None or window_start + offset == position:
                    continue
                pair = (target_id, context_id)
                pair_counts[pair] = pair_counts.get(pair, 0) + 1

    tc_matrix = lil_matrix((n, n), dtype=int)
    for (target_id, context_id), count in pair_counts.items():
        tc_matrix[target_id, context_id] = count
    return tc_matrix

In [57]:
# Build the co-occurrence counts over all loaded sentences, counting up to
# four tokens on each side of a word as its context.
print("Computing term context matrix...")
tc_matrix = create_term_context_matrix(tuples, vocab, context_window_size=4)


Computing term context matrix...


# PPMI Matrix

In [30]:
def create_ppmi_matrix(term_context_matrix):
    """Given the term context matrix, output a PPMI weighted version.

    See section 6.6 in the textbook.

    For every cell with a nonzero count,
        PPMI(w, c) = max(log2(P(w, c) / (P(w) * P(c))), 0)
    where P(w, c) is the cell divided by the sum of all cells, P(w) the row
    sum divided by that total and P(c) the column sum divided by that total.
    Cells with a zero count stay 0.

    Input:
      term_context_matrix: Numpy array where each column represents a context word
      and each row, the frequency of a word that occurs with that context word.
      A scipy sparse matrix is accepted as well and is densified first.

    Returns:
      A scipy.sparse.lil_matrix of floats with the same dimension as
      term_context_matrix, where A_ij is weighted by PPMI.
    """
    if hasattr(term_context_matrix, "toarray"):
        counts = term_context_matrix.toarray()
    else:
        counts = np.asarray(term_context_matrix)

    ppmi_matrix = lil_matrix(counts.shape)

    total_count = counts.sum()
    if total_count == 0:
        # No co-occurrences at all: every probability would be 0/0.
        return ppmi_matrix

    word_p = counts.sum(axis=1) / total_count
    context_p = counts.sum(axis=0) / total_count

    # Only nonzero cells can have a defined (and possibly positive) PMI, so
    # work on their coordinates instead of the full n x n grid.
    rows, cols = np.nonzero(counts)
    joint_p = counts[rows, cols] / total_count
    pmi = np.log2(joint_p / (word_p[rows] * context_p[cols]))

    positive = pmi > 0
    if positive.any():
        ppmi_matrix[rows[positive], cols[positive]] = pmi[positive]
    return ppmi_matrix

In [31]:
print("Computing PPMI matrix...")
ppmi_matrix = create_ppmi_matrix(tc_matrix.toarray())

Computing PPMI matrix...


In [34]:
def rank_words(target_word_index, matrix):
    """Ranks the similarity of all other words to the target word using compute_cosine_similarity.

    Inputs:
      target_word_index: The index of the word we want to compare all others against.
      matrix: Numpy array or scipy sparse matrix (any row-indexable format)
          where the ith row represents a vector embedding of the ith word.

    Returns:
      A list of integer word indices, ordered by decreasing similarity to the
      target word indexed by target_word_index
      A list of the matching similarity scores, in the same order

      Both lists have one entry per row except the target row itself, which
      is left out. Rows with equal scores keep their index order.
    """

    def row_as_array(row_id):
        # Densify one row at a time so a sparse matrix is never expanded as a
        # whole; dense input passes through unchanged apart from flattening.
        row = matrix[row_id]
        if hasattr(row, "toarray"):
            row = row.toarray()
        return np.asarray(row).ravel()

    target_vector = row_as_array(target_word_index)

    scored = [
        (row_id, compute_cosine_similarity(target_vector, row_as_array(row_id)))
        for row_id in range(matrix.shape[0])
        if row_id != target_word_index
    ]
    # list.sort is stable, so ties stay in ascending index order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    sorted_words = [row_id for row_id, _ in scored]
    sorted_sims = [score for _, score in scored]

    return sorted_words, sorted_sims

In [55]:

# random_idx = random.randint(0, len(document_names) - 1)

word = "amputee"
top_k = 15  # number of neighbours printed for each matrix
vocab_to_index = dict(zip(vocab, range(0, len(vocab))))

# Run the same report on both embeddings so the two rankings can be compared.
for matrix_description, embedding in (
    ("term-context frequency matrix", tc_matrix),
    ("PPMI matrix", ppmi_matrix),
):
    print(
        '\nThe %d most similar words to "%s" using cosine-similarity on %s are:'
        % (top_k, word, matrix_description)
    )
    ranks, scores = rank_words(vocab_to_index[word], embedding)
    # Slicing (rather than indexing up to top_k) copes with vocabularies that
    # have fewer than top_k other words.
    for position, (word_id, score) in enumerate(zip(ranks[:top_k], scores[:top_k]), start=1):
        print("%d: %s; %s" % (position, vocab[word_id], score))



The 10 most similar words to "death" using cosine-similarity on term-context frequency matrix are:
1: with; 0.9663260999092221
2: feed; 0.9261687264035401
3: use; 0.9250086542417137
4: get; 0.922587446921485
5: save; 0.915665619171251
6: bring; 0.9115230675388477
7: wants; 0.9111131804170239
8: hoping; 0.9110270606665863
9: go; 0.9098087247332903
10: avoid; 0.909574860306109
11: start; 0.9089512382928748
12: decided; 0.9064967725769992
13: due; 0.9062844856796357
14: return; 0.9039716593113606
15: beat; 0.9036115570303765

The 10 most similar words to "death" using cosine-similarity on PPMI matrix are:
1: circa; 0.11868729614372064
2: goind; 0.11863624441285392
3: snu; 0.11288009556271639
4: stiltwalkers; 0.10897926272005165
5: adreneline; 0.10753306488174852
6: hillbilies; 0.10716050833427071
7: condolence; 0.10392740955270807
8: measurement; 0.09222900879983009
9: chearleaders; 0.08940831044946951
10: vulcan; 0.08660858944161942
11: injuring; 0.08615091919105677
12: swords; 0.085281

In [54]:
# Debugging aid: list every sentence that contains a chosen word.
# Maps each vocabulary word to the names of the sentences it occurs in; a
# sentence name is appended once per occurrence, so it can repeat.
word_location_mapping = {word: [] for word in vocab}

for line in tuples:
    doc_name = line[0]
    words = line[1]
    for word in words:
        word_location_mapping[word].append(doc_name)

# Print the mapping
# NOTE(review): these loops run at module level, so `word` is rebound here and
# no longer holds whatever an earlier cell assigned to it.
for word, locations in word_location_mapping.items():
    if word not in ['amputee']:
        continue
    print(word)
    for location in locations:
        # assumes the sentence name is the integer position of the sentence
        # in `tuples` — TODO confirm against snli.csv
        doc = tuples[int(location)][1]
        print(doc)

death
['two', 'child', 'boxers', 'fight', 'to', 'the', 'death', 'in', 'mexico', 'for', 'blood', 'money']
['woman', 'fell', 'to', 'her', 'death', 'while', 'attempting', 'rock', 'climbing', 'while', 'drunk']
['a', 'young', 'group', 'of', 'people', 'are', 'protesting', 'a', 'sign', 'to', 'protect', 'african', 'americans', 'from', 'death', 'or', 'being', 'killed', 'while', 'under', 'custody', 'of', 'the', 'united', 'states', 'government']
['the', 'old', 'people', 'are', 'on', 'their', 'death', 'beds', 'watching', 'tv']
['a', 'man', 'falls', 'to', 'his', 'death', 'down', 'stone', 'steps']
['a', 'man', 'lets', 'the', 'concrete', 'block', 'crush', 'someone', 'to', 'death', 'there', 's', 'blood', 'everywhere']
['a', 'man', 'is', 'being', 'dragged', 'behind', 'a', 'boat', 'down', 'a', 'dirt', 'road', 'to', 'his', 'death']
['a', 'woman', 'laughs', 'after', 'surviving', 'a', 'near', 'death', 'experience', 'while', 'scuba', 'diving']
['a', 'baby', 'and', 'a', 'dog', 'are', 'fighting', 'to', 'the',