In [None]:
import random
import numpy as np
import math
import collections
import os
import re
import json




class Sequence_as_vec:
    """Encode ordered symbol sequences as sums of position-rotated random +1/-1 vectors.

    Every symbol of the inventory gets one random polar vector (entries are
    +1 or -1).  A sequence is encoded by rotating the vector of the symbol at
    position ``i`` by ``i`` steps and summing the results; ``unbind_vec``
    peels the symbols off again one position at a time.

    Parameters
    ----------
    inventory : iterable of str
        Each entry is split on whitespace and every resulting token becomes a
        symbol.  An entry that consists only of whitespace (for example
        ``" "``) would split to nothing, so it is registered as a symbol
        itself.
    vector_size : int, optional
        Dimensionality of the symbol vectors (default 10000).
    """

    def __init__(self, inventory, vector_size=10000):
        self.inventory = inventory  # The entries exactly as supplied by the caller
        self.vector_size = vector_size
        # Maps symbol -> polar vector; insertion order is first-occurrence order.
        self.inventory_dict = {}
        for entry in inventory:
            # str.split() drops pure-whitespace entries entirely; keep those as
            # symbols of their own so that e.g. the space character can be
            # encoded and decoded.
            symbols = entry.split() or ([entry] if entry else [])
            for s in symbols:
                if s in self.inventory_dict:
                    # Already has a vector; do not regenerate it for repeats.
                    continue
                # This creates polar vectors that consist of values that are
                # either 1 or minus 1.
                self.inventory_dict[s] = (
                    2 * np.random.randint(0, 2, size=self.vector_size)
                    - np.ones(self.vector_size)
                )

    def get_initial_zero_vector(self):
        """Return an all-zero vector of length ``vector_size``."""
        return np.zeros(self.vector_size)

    def cossim(self, x, y):
        """Cosine similarity of ``x`` and ``y``.

        Returns 0.0 when either vector has zero norm instead of dividing by
        zero.
        """
        norm_x = math.sqrt(np.dot(x, np.transpose(x)))
        norm_y = math.sqrt(np.dot(y, np.transpose(y)))
        if norm_x == 0 or norm_y == 0:
            return 0.0
        return np.dot(x, np.transpose(y)) / (norm_x * norm_y)

    def pm(self, x, offset):
        """Permute a vector ``x`` by rotating it left by ``offset`` positions.

        The offset wraps modulo ``len(x)``, so positions beyond the vector
        length still produce a real rotation.  A negative offset rotates to
        the right.
        """
        n = len(x)
        if n == 0:
            return np.array(x)
        k = offset % n
        return np.concatenate((x[k:], x[:k]))

    def ipm(self, x):
        """Unpermute a vector one step (rotate right by one position)."""
        return self.pm(x, -1)

    def most_similar(self, x):
        """Find the symbol whose vector has the highest cosine similarity to ``x``.

        Returns
        -------
        tuple
            ``(vector, symbol, cosine)`` of the best match; the first symbol
            wins on ties.

        Raises
        ------
        ValueError
            If no symbol has been registered.
        """
        closest = None
        best_cos = float("-inf")
        # Iterate over the registered symbols, not the raw inventory entries:
        # an entry such as a whole sentence is not itself a key.
        for symbol, candidate in self.inventory_dict.items():
            dis = self.cossim(x, candidate)
            if closest is None or dis > best_cos:
                closest = symbol
                best_cos = dis
        if closest is None:
            raise ValueError("most_similar() needs a non-empty inventory")
        return (self.inventory_dict[closest], closest, best_cos)

    def create_vector_for_sequence(self, sequence):
        """Encode ``sequence`` as the sum of its position-rotated symbol vectors.

        ``sequence`` is iterated element by element: a string contributes its
        characters, a list of words contributes its words.  Elements that are
        not registered symbols are skipped but still consume a position.
        """
        vec = np.zeros(self.vector_size)
        for i, symbol in enumerate(sequence):
            if symbol in self.inventory_dict:
                vec = np.add(vec, self.pm(self.inventory_dict[symbol], i))
        return vec

    def unbind_vec(self, vec, max_steps=500):
        """Decode ``vec`` symbol by symbol, printing each symbol without a newline.

        Each step takes the best-matching symbol for the current residue,
        prints it, subtracts its vector and rotates the residue back one
        position.  Decoding stops when

        * the best similarity is not positive,
        * the similarity falls below a third of the previous step's value,
        * the residue matches a symbol exactly (similarity 1), or
        * ``max_steps`` symbols have been examined.
        """
        residue = vec
        cos = -1
        for _ in range(max_steps):
            prev_cos = cos
            (best_vec, closest, cos) = self.most_similar(residue)
            # Same test as prev_cos / cos > 3 for positive values, but safe
            # when cos is zero or negative.
            if cos <= 0 or (prev_cos > 0 and prev_cos > 3 * cos):
                break
            print(closest, end='')
            # Tolerant comparison: the last symbol leaves a residue equal to
            # its own vector, up to floating-point rounding.
            if np.isclose(cos, 1.0):
                break
            residue = self.ipm(np.subtract(residue, best_vec))

In [None]:
import string

#part 1
# Round-trip demo: build a character inventory, encode one sentence, decode it
# again (unbind_vec prints the decoded characters), then show the raw vector.
chars = [*string.printable, " "]
a = Sequence_as_vec(chars)
long_word = "The quick brown fox jumps over the lazy dog"
print(long_word)
temp_vec = a.create_vector_for_sequence(long_word)
a.unbind_vec(temp_vec)
print(temp_vec)

The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog[-9.  3. -1. ... -1. -1. -9.]


In [None]:
#part 2
def get_words(book_titles):
    """Tokenise text files into per-book vocabularies and a pooled sentence list.

    Parameters
    ----------
    book_titles : iterable of str
        Paths of the text files to read, in order.

    Returns
    -------
    tuple
        ``(book_words, sentences)``.  ``book_words`` holds one list per book
        with that book's distinct lowercase tokens in first-occurrence order.
        ``sentences`` is a single list shared by all books; each sentence is a
        string of lowercase tokens, every token followed by one space.

    Raises
    ------
    FileNotFoundError
        If a path in ``book_titles`` is not an existing file.  Books listed
        before it have already been processed at that point.
    """
    sentences = []
    book_words = []
    for book_title in book_titles:
        if not os.path.isfile(book_title):
            print('No file found with  name', book_title)
            # Raise instead of calling exit(), which would shut down the
            # notebook kernel.
            raise FileNotFoundError(book_title)

        words = []
        seen = set()  # O(1) membership test for the vocabulary
        print('Processing file', book_title)
        # The patterns below match curly quotes, so decode explicitly as UTF-8
        # rather than relying on the platform default.
        with open(book_title, 'r', encoding='utf-8') as f0:
            sentence = ""
            for i, line in enumerate(f0):
                if i % 1000 == 0:
                    print('Processed', i, 'lines.')
                line = line.rstrip()
                if len(line) < 1:
                    continue
                # Skip lines that start with three capital letters.
                if re.search(r'^[A-Z][A-Z][A-Z]', line):
                    continue
                # Skip lines that start with an opening square bracket.
                if line[0] == '[':
                    continue
                # Detach punctuation from the neighbouring word.
                line = re.sub(r'([\.,;:!\?”])', r' \1', line)
                line = re.sub(r'(“)', r'\1 ', line)
                line = re.sub(r'[_‘]', '', line)
                line = re.sub('—', ' ', line)
                # Keep only letters, full stops, the curly apostrophe and spaces.
                line = re.sub(r'[^a-zA-Z\.’ ]', '', line)
                # NOTE(review): the first token of every line, and the first
                # token after every sentence end, is never recorded. This is
                # kept as-is; confirm that dropping those tokens is intended.
                buffer_empty = True
                for wd in line.split():
                    token = wd.lower()
                    if not buffer_empty:
                        sentence += token + " "
                        # Compare the lowercased token; comparing the raw
                        # token let capitalised words in as duplicates.
                        if token not in seen:
                            seen.add(token)
                            words.append(token)
                    buffer_empty = False
                    # NOTE(review): the filter above removes '!', '?', ':' and
                    # ';', so only '.' can actually match here.
                    if wd in ['.', '!', '?', ':', ';']:
                        sentences.append(sentence)
                        sentence = ""
                        buffer_empty = True
        book_words.append(words)
    return book_words, sentences

book_titles = ['GreatExpectations_nll.txt', 'WizardOfOz.nnl', 'emma.txt','tale_of_2_cities_nll.txt']

#part 3 and 4
book_words, sentences = get_words(book_titles)
seq = Sequence_as_vec(sentences)
#part 5
# Encode every book once; the pairwise loop and part 6 reuse these vectors
# instead of rebuilding them for each comparison.
book_vecs = [seq.create_vector_for_sequence(bw) for bw in book_words]
for i, vec1 in enumerate(book_vecs):
    for j, vec2 in enumerate(book_vecs):
        cos_sim = seq.cossim(vec1, vec2)
        print("cosine similary between:", book_titles[i], " and ", book_titles[j], ": ", cos_sim)

#part 6
print()
new_sentence = input("Enter a sentence:")
# The inventory holds lowercase words, so encode the input as a list of
# lowercase words; passing the raw string would iterate over its characters.
new_vec = seq.create_vector_for_sequence(new_sentence.lower().split())
# Start below every possible cosine so that negative similarities are still
# ranked instead of silently reporting the first book.
best_cos, best_book = float("-inf"), 0
for idx, vec in enumerate(book_vecs):
    cos_sim = seq.cossim(new_vec, vec)
    if cos_sim > best_cos:
        best_cos = cos_sim
        best_book = idx
print("Closet book to input sentence is: ", book_titles[best_book], " with cosine similarity of: ", best_cos)



Processing file GreatExpectations_nll.txt
Processed 0 lines.
Processed 1000 lines.
Processed 2000 lines.
Processed 3000 lines.
Processed 4000 lines.
Processing file WizardOfOz.nnl
Processed 0 lines.
Processed 1000 lines.
Processed 2000 lines.
Processing file emma.txt
Processed 0 lines.
Processed 1000 lines.
Processed 2000 lines.
Processing file tale_of_2_cities_nll.txt
Processed 0 lines.
Processed 1000 lines.
Processed 2000 lines.
Processed 3000 lines.
cosine similary between: GreatExpectations_nll.txt  and  GreatExpectations_nll.txt :  1.0
cosine similary between: GreatExpectations_nll.txt  and  WizardOfOz.nnl :  0.008942251836429292
cosine similary between: GreatExpectations_nll.txt  and  emma.txt :  0.8881185667214301
cosine similary between: GreatExpectations_nll.txt  and  tale_of_2_cities_nll.txt :  0.8791174111190005
cosine similary between: WizardOfOz.nnl  and  GreatExpectations_nll.txt :  0.008942251836429292
cosine similary between: WizardOfOz.nnl  and  WizardOfOz.nnl :  1.000