Fill your name and student number in here:

Name: Seyed Ali Mirferdos
<br>
Student Number: 99201465

In [1]:
import numpy as np
import pandas as pd
import gensim
import gensim.downloader as api
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

np.random.seed(65)

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load the data

In [3]:
df = pd.read_csv('./entire_library_have_fun.csv')

# Preprocess the data

I've copied them from the previous exercise:

## 1. Lowercase

In [4]:
df['Processed Track Name'] = df['Track Name'].map(str.lower)

## 2. Removing numbers

In [5]:
df['Processed Track Name'] = df['Processed Track Name'].map(lambda x: re.sub(r'\d+', '', x))

## 3. Removing all non-ascii characters

In [6]:
df['Processed Track Name'] = df['Processed Track Name'].map(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))

## 4. Removing punctuation

In [7]:
df['Processed Track Name'] = df['Processed Track Name'].map(lambda x: x.translate(str.maketrans('','', string.punctuation)))

## 5. Tokenizing

In [8]:
df['Processed Track Name'] = df['Processed Track Name'].map(lambda x: [i for i in word_tokenize(x)])

## 6. Removing single letter words

In [9]:
df['Processed Track Name'] = df['Processed Track Name'].map(lambda x: 
                                        [w for w in x if len(w) > 1])

# LSH

## 1. Create Fake Data

In [10]:
def create_fake_data(N, d):
    # todo: complete this section
    # returns X array with shape (N, d)
    return np.random.uniform(low=-10, high=10 ,size=(N, d))

## 2. Cosine Distance and k-Nearest Neighours

In [11]:
def cosine_distance(x, y):
    # todo: complete this section
    # returns cosine distance between x and y
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [12]:
def find_k_nearest_neighbours(X, q, k):
    # todo: complete this section
    # returns indexes of the k-nearest-neighbours of vector q
    dist = np.array([cosine_distance(i, q) for i in X])
    return dist.argsort()[:k]

## 3. Signer class

In [13]:
class Signer:
    def __init__(self, f, d):
        # todo: complete this section
        # initiates f random vectors with dimension = d
        self.hyper_planes = create_fake_data(f, d)
        
    def hash_point(self, x):
        # x: (1, d)
        # todo: complete this section
        # returns list of numbers in {0, 1} in shape (1, f)
        y = []
        for p in self.hyper_planes:
          dot_product = np.dot(x, p)
          binary_res = int(dot_product > 0)
          y.append(binary_res)
        return np.array(y)

## 4. LSHIndex

In [14]:
class LSHIndex:
    def __init__(self, f, b, d, data):
        # todo: complete this section
        # initiates empty hash tables
        self.f = f
        self.b = b
        self.f_b = f // b
        self.signer = Signer(f, d)
        self.hash_tables = [{} for i in range(b)]
        self.data = data
    
    def index(self, x_index):
        # The signature of the method is changed following the discussions in 
        # the group.
        # todo: complete this section
        # returns a boolean, indicating success or failure
        x = self.data[x_index, :]
        x_hash = self.signer.hash_point(x) # the output would be (1, f)
        parts = np.array_split(x_hash, b)
        
        for i in range(b):
          curr_table = self.hash_tables[i]
          part = int(''.join(parts[i].astype(str)), 2)
          
          if part in curr_table:
            curr_table[part].append(x_index)
          else:
            curr_table[part] = [x_index]
    
    def index_all(self):
      for i in range(self.data.shape[0]):
        self.index(i)

    def query(self, q, k):
        # todo: complete this section
        # q: (1, d)
        # returns k-approximate-nearest-neighbours of vector q
        q_hash = self.signer.hash_point(q)
        parts = np.array_split(q_hash, b)
        candidates = set()

        for i in range(b):
          curr_table = self.hash_tables[i]
          part = int(''.join(parts[i].astype(str)), 2)
          if part in curr_table:
              candidates.update(curr_table[part])
        
        x = self.data[list(candidates)]
        result = find_k_nearest_neighbours(x, q, k)
        return result

# Word2Vec

## Using Word2Vec on the data

In [15]:
model = api.load("glove-wiki-gigaword-50")

In [16]:
def get_word2vec(words):

  vecs = []
  for w in words:
    if w in model:
      vecs.append(model[w])
  
  mean_vecs = np.mean(vecs, axis=0)
  return mean_vecs

In [17]:
df['Track Name Vector'] = df['Processed Track Name'].apply(get_word2vec)

  out=out, **kwargs)


In [18]:
df.dropna(subset=["Track Name Vector"], inplace=True)

## Getting LSH to work

In [19]:
b = 20
f = 100
d = 50
k = 10

In [20]:
X = np.array(df["Track Name Vector"].tolist())

In [21]:
lsh = LSHIndex(f, b, d, X)
lsh.index_all()

## Querying on the data

In [22]:
q1 = 'end of world'
q2 = 'he and his friends'

In [23]:
q1 = [i for i in word_tokenize(q1)]
q2 = [i for i in word_tokenize(q2)]

In [24]:
q1_vec = get_word2vec(q1)
q2_vec = get_word2vec(q2)

In [25]:
q1_result = lsh.query(q1_vec, k)
q2_result = lsh.query(q2_vec, k)

In [26]:
df.iloc[q1_result]['Track Name']

1932                       IN SPITE
1929                       Psalm 69
6089                       Kingsize
195                      Empty Room
319                  Gold And Myrrh
6087                  Cordless Kids
2562           One Night Last Stand
422     The Devil Is In The Details
4200          AS Gardens Need Walls
2634                       Talisman
Name: Track Name, dtype: object

In [29]:
df.iloc[q2_result]['Track Name']

2014                             Victim
1771                       The Omission
5939                      Chopper Fight
5024                   To Be a Ghost...
1091                Einer von den Guten
644     Death Awakens (feat. TenGraphs)
798                             Eternal
1693                        More Tunnel
8614                        Dream To Me
8527                    Heard It's True
Name: Track Name, dtype: object