In [4]:
import pandas as pd

from nltk import wordpunct_tokenize
from nltk import tokenize
import numpy as np

import time

from string import digits
import re

#Data Pre-processing

In [5]:
from nltk.corpus import stopwords
import nltk

In [6]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##Reading

Read test corpus 

In [7]:
corpus_test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/corpusTest.csv")
print(corpus_test.shape)
print(corpus_test.head())

(5374, 2)
   Id                                            Content
0   0              How do I get good marks in college?\n
1   1  Can an android app use SMS only to communicate...
2   2  What small detail from an Indian movie do you ...
3   3  Why can not Hindu women be the soldier of Hind...
4   4  How would you write out twelve lakh twelve tho...


Read train corpus

In [8]:
corpus_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/corpusTrain.csv")
print(corpus_train.shape)
print(corpus_train.head())

(531990, 2)
   Id                                            Content
0   0  How many people are going towards using phones...
1   1  What audio format should I use for getting aud...
2   2  What is the corporate culture like at Edwards ...
3   3        What is the best barbecue in Kansas City?\n
4   4  "Can I combine the output of two bolts to one ...


##Shallow Cleaning

In [9]:
def blank_space(x):
  return re.sub('[^A-Za-z0-9]+', ' ', x)

def numbers(x):
  return re.sub(r'[0-9]+', '', x)

def standarize_sentence(x):
  return ''.join(''.join(word)[:2] for word in x) 

def apostrophe_words(x):
  Apos_dict={"'s":" is","n't":" not","'m":" am","'ll":" will", 
           "'d":" would","'ve":" have","'re":" are"} 
  for key,value in Apos_dict.items(): 
      if key in x: 
          return x.replace(key,value)
  return x

def split_words(x):
  return " ".join([word for word in re.split("([A-Z][a-z]+[^A-Z]*)",x) if word])

def shallow_cleaning(df):
  remove_digits = str.maketrans('', '', digits)
  df['Content'] = df['Content'].apply(lambda x: blank_space(str(x)))
  df['Content'] = df['Content'].apply(lambda x: numbers(str(x)))
  df['Content'] = df['Content'].apply(lambda x: split_words(str(x)))
  df['Content'] = df['Content'].apply(lambda x: standarize_sentence(str(x)))
  df['Content'] = df['Content'].apply(lambda x: apostrophe_words(str(x)))
  df['Content'] = df['Content'].str.strip()
  df['Content'] = df['Content'].str.lower()
  df['Content'].apply(lambda x: [item for item in x if item not in stop_words])
  return df

In [10]:
corpus_train = shallow_cleaning(corpus_train)
corpus_test = shallow_cleaning(corpus_test)

##Special Characters

Remove special characters from "Content" column since they do not contribute to the duplicates identification

In [11]:
spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<",
              "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~", "–"]

for char in spec_chars:
    corpus_test['Content'] = corpus_test['Content'].str.replace(char, '')

for char in spec_chars:
    corpus_train['Content'] = corpus_train['Content'].str.replace(char, '')

In [12]:
print(corpus_test.head())

   Id                                            Content
0   0                how do  i get good marks in college
1   1  can an android app use  sms only to communicat...
2   2  what small detail from an  indian movie do you...
3   3  why can not  hindu women be the soldier of  hi...
4   4  how would you write out twelve lakh twelve tho...


In [13]:
print(corpus_train.head())

   Id                                            Content
0   0  how many people are going towards using phones...
1   1  what audio format should  i use for getting au...
2   2  what is the corporate culture like at  edwards...
3   3         what is the best barbecue in  kansas  city
4   4  can  i combine the output of two bolts to one ...


##Vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer=TfidfVectorizer(max_features=10000, stop_words= stop_words, ngram_range=(1, 2))

temp = corpus_train['Content']

train_vector = vectorizer.fit_transform(temp)

temp = corpus_test['Content']

test_vector = vectorizer.fit_transform(temp)

In [16]:
print(train_vector.shape)
print(test_vector.shape)

(531990, 10000)
(5374, 10000)


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer=CountVectorizer(max_features=10000)

corpus_compined = pd.DataFrame(corpus_test['Content'].append(corpus_train['Content'], ignore_index = True)) 
temp = corpus_compined['Content']

vectorized_combined_content = vectorizer.fit_transform(temp)


In [61]:
print(vectorized_combined_content.shape)

(537364, 10000)


In [18]:
vectorizer=CountVectorizer(max_features=10000)

corpus_compined = pd.DataFrame(corpus_test['Content'].append(corpus_train['Content'], ignore_index = True)) 
temp = corpus_compined['Content']

vectorized_combined_content_extended = vectorizer.fit_transform(temp)

"\nvectorizer=CountVectorizer(max_features=10000)\n\ncorpus_compined = pd.DataFrame(corpus_test['Content'].append(corpus_train['Content'], ignore_index = True)) \ntemp = corpus_compined['Content']\n\nvectorized_combined_content_extended = vectorizer.fit_transform(temp)\n"

#MinHash LSH
http://ekzhu.com/datasketch/lsh.html


MinHash LSH contribution to this project, in order to eliminate the big runtime complexity of Cosine Similarty and especially Jaccard Similaty metrics. With thousands of questions, it is not computationally feasible to compare all to all questions in real time. We wish to only compare questions that are “similar” in order to save computational resources.

LSH is is an algorithmic technique that hashes similar input items into the same “buckets” with high probability. This technique is commonly paired with MinHashing to de-duplicate documents.

In [62]:
!pip install --upgrade datasketch

Requirement already up-to-date: datasketch in /usr/local/lib/python3.6/dist-packages (1.5.3)


In [63]:
from tqdm import tqdm, tnrange, tqdm_notebook
from datasketch import MinHash, MinHashLSH

##Map question id to set representation of question

The representation is the following. 
For example, if question with id = 19 is "What is the best barbecue in Kansas City?", it will be represented as *{'m19': 'What is the best barbecue in Kansas City?'}*. This applies to both train and test corpus. 

In [64]:
#for train corpus
train_dict={} 
count = 1
for question in tqdm_notebook([x for x in corpus_train['Content'] if type(x)==str]):
    temp = []
    for shingle in question.split(' '):
        if shingle not in stop_words:
            temp.append(shingle)
    
    train_dict["m{0}".format(count)] = set(temp)
    count +=1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=531990.0), HTML(value='')))




In [65]:
#for test corpus
test_dict={}
count=1

for question in tqdm_notebook([x for x in corpus_test['Content'] if type(x)==str]):
    temp = []
    for shingle in question.split(' '):
        if shingle not in stop_words:
            temp.append(shingle)

    test_dict["m{0}".format(count)] = set(temp)
    count += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=5374.0), HTML(value='')))




In [None]:
prem_list = [16, 32, 64]
l = [0, 0, 0]
minhash_lsh_res = {'#Permutations': prem_list, 'BuildTime':l,'QueryTime':l, 'TotalTime':l, 'Candidate Duplicates':l}

minhash_lsh = pd.DataFrame(minhash_lsh_res)

##Create minHash signatures: map question to MinHash signatures



Compute the MinHash signatures for each question. MinHash creates a numeric representation for each question based on the number of permutations. 
To compare the similarity between two questions we compare their respective numeric representations and calculate their Jaccard Index.

In [88]:
num_perm = 64

In [89]:
min_train_dict = {}
count = 1

start = time.time() 

for val in tqdm_notebook(train_dict.values()): 
    m = MinHash(num_perm=num_perm)
    for shingle in val:
        m.update(shingle.encode('utf8'))

    min_train_dict["m{}".format(count)] = m
    count += 1

end = time.time()
 
for i in range(3):
  if minhash_lsh['#Permutations'][i] == num_perm:
    minhash_lsh['BuildTime'][i] = minhash_lsh['BuildTime'][i] + (end - start)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=531990.0), HTML(value='')))




In [90]:
min_test_dict = {}
count = 1

start = time.time() 

for val in tqdm_notebook(test_dict.values()): 
    m = MinHash(num_perm=num_perm)
    for shingle in val:
        m.update(shingle.encode('utf8'))

    min_test_dict["m{}".format(count)] = m
    count += 1

end = time.time()
 
for i in range(3):
  if minhash_lsh['#Permutations'][i] == num_perm:
    minhash_lsh['BuildTime'][i] = minhash_lsh['BuildTime'][i] + (end - start)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=5374.0), HTML(value='')))




##Create LSH index


We loop through the MinHash signatures created in the train and test dictionary and apply to them a fixed number of hash functions. We then bucket those hash functions into bands. 
Datasketch stores these in a dictionary format, where the key is 'm'+ question id and the values are all the questions deemed similar based on the defined threshold. 

In [91]:
start = time.time() 

lsh = MinHashLSH(threshold=0.8, num_perm=num_perm)

for key in tqdm_notebook(min_train_dict.keys()):
    lsh.insert(key,min_train_dict[key])

end = time.time()
 
for i in range(3):
  if minhash_lsh['#Permutations'][i] == num_perm:
    minhash_lsh['BuildTime'][i] = minhash_lsh['BuildTime'][i] + (end - start)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=531990.0), HTML(value='')))




##Create candidate pairs

We used the function called create_cand_pairs in order to combine the similar questions into pairs, as a very helpful step for next stage evaluation. It simply changes the format of the dictionary to be a list of lists with each sub-list being a candidate pair.

Method to create the candidate pairs in the LSH index. We need to keep in those from the train set.

In [92]:
def create_cand_pairs():
  l = []

  #query the test corpus
  for query in min_test_dict.keys():
    bucket = lsh.query(min_test_dict[query])
    if len(bucket) > 0: 
      l.append(query)

  return l
  
start = time.time()
        
cand_pairs = create_cand_pairs()

end = time.time()

for i in range(3):
  if minhash_lsh['#Permutations'][i] == num_perm:
    minhash_lsh['QueryTime'][i] = minhash_lsh['QueryTime'][i] + (end - start)
    k = i
    break

minhash_lsh['TotalTime'][k] = minhash_lsh['BuildTime'][k] + minhash_lsh['QueryTime'][k]
minhash_lsh['Candidate Duplicates'][k] = len(cand_pairs)

In [93]:
print("LSH Jaccard #Duplicates = " + str(len(cand_pairs))) 
print("Build Time = " + str(minhash_lsh['BuildTime'][k])) 
print("Query Time = " + str(minhash_lsh['QueryTime'][k])) 

LSH Jaccard #Duplicates = 920
Build Time = 531
Query Time = 0


In [94]:
minhash_lsh

Unnamed: 0,#Permutations,BuildTime,QueryTime,TotalTime,Candidate Duplicates
0,16,289,0,289,954
1,32,368,0,368,857
2,64,531,0,531,920


#Random Projection LSH

We have used the following LSH cosine-based implementation for the LSH-Cosine requirement: 
http://ethen8181.github.io/machine-learning/recsys/content_based/lsh_text.html#Locality-Sensitive-Hashing-(LSH)---Cosine-Distance

In [None]:
k_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
l = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
lsh_res = {'Parameter k': k_list, 'BuildTime':l,'QueryTime':l, 'TotalTime':l, 'Candidate Duplicates':l}

random_projection_lsh = pd.DataFrame(lsh_res)

First step is to generate a collection of random vectors from the standard Gaussian distribution. Each vector can be used to compute one bit in the bin encoding. We generate 16 vectors, leading to a 16-bit encoding of the bin index for each document.

In [55]:
def generate_random_vectors(dim, n_vectors):
    return np.random.randn(dim, n_vectors)

##Train LSH

In [56]:
from collections import defaultdict

def train_lsh(train_bow, n_vectors, seed=None):    
    if seed is not None:
        np.random.seed(seed)

    dim = train_bow.shape[1]
    random_vectors = generate_random_vectors(dim, n_vectors)  

    bin_indices_bits = train_bow.dot(random_vectors) >= 0
    powers_of_two = 1 << np.arange(n_vectors - 1, -1, step=-1)
    bin_indices = bin_indices_bits.dot(powers_of_two)

    table = defaultdict(list)
    for idx, bin_index in enumerate(bin_indices):
        table[bin_index].append(idx)
    
    model = {'table': table,
             'random_vectors': random_vectors,
             'bin_indices': bin_indices,
             'bin_indices_bits': bin_indices_bits}
             
    return model

##Search Nearby Bins

In [57]:
from itertools import combinations

def search_nearby_bins(query_bin_bits, table, search_radius, candidate_set):
  
    if candidate_set is None:
        candidate_set = set()

    n_vectors = query_bin_bits.shape[0]
    powers_of_two = 1 << np.arange(n_vectors - 1, -1, step=-1)

    for different_bits in combinations(range(n_vectors), search_radius):
        index = list(different_bits)
        alternate_bits = query_bin_bits.copy()
        alternate_bits[index] = np.logical_not(alternate_bits[index])

        nearby_bin = alternate_bits.dot(powers_of_two)

        if nearby_bin in table:
            candidate_set.update(table[nearby_bin])

    return candidate_set

##Get Nearest Neighbors

In [58]:
from sklearn.metrics.pairwise import pairwise_distances

def get_nearest_neighbors(train_bow, query_vector, model, max_search_radius):
    table = model['table']
    random_vectors = model['random_vectors']

    bin_index_bits = np.ravel(query_vector.dot(random_vectors) >= 0)

    candidate_set = set()
    for search_radius in range(max_search_radius + 1):
        candidate_set = search_nearby_bins(bin_index_bits, table, search_radius, candidate_set)

    candidate_list = list(candidate_set)
    candidates = train_bow[candidate_list]
    distance = pairwise_distances(candidates, query_vector, metric='cosine').flatten()
    
    distance_col = 'distance'
    nearest_neighbors = pd.DataFrame({
        'id': candidate_list, distance_col: distance
    }).sort_values(distance_col).reset_index(drop=True)
    
    return nearest_neighbors

##Query LSH

In [112]:
k = 5

In [113]:
start = time.time() 
# build the model
n_vectors = k
model = train_lsh(vectorized_combined_content, n_vectors, seed=143)

end = time.time()
build_time = end - start

lsh_cands = []

start = time.time() 

for i in range(5374):
  query_vector = vectorized_combined_content[i]
    
  nearest_neighbors = get_nearest_neighbors(vectorized_combined_content, query_vector, model, 0)

  num_candidates = nearest_neighbors.shape[0]
  count = 0 

  for l in range(num_candidates): 
    if nearest_neighbors['distance'][l] < 0.2 and nearest_neighbors['id'][l] >= 5374: 
      count = count + 1
    if count >= 1: 
      lsh_cands.append(i)
      break

end = time.time()
query_time = end - start

In [114]:
random_projection_lsh['QueryTime'][k-1] = query_time
random_projection_lsh['BuildTime'][k-1] = build_time
random_projection_lsh['TotalTime'][k-1] = query_time + build_time
random_projection_lsh['Candidate Duplicates'][k-1] = len(lsh_cands)

In [115]:
random_projection_lsh

Unnamed: 0,Parameter k,BuildTime,QueryTime,TotalTime,Candidate Duplicates
0,1,0,0,0,0
1,2,0,0,0,0
2,3,0,0,0,0
3,4,0,0,0,0
4,5,0,660,660,1351
5,6,0,0,0,0
6,7,0,0,0,0
7,8,0,0,0,0
8,9,0,0,0,0
9,10,0,45,45,854


#Random Projection 2
https://pypi.org/project/lshashpy3/

In [109]:
!pip install --upgrade lshashpy3

Collecting lshashpy3
  Downloading https://files.pythonhosted.org/packages/0a/5e/746d7c54f883b1c0b216771d023b8e91e86b41eb9240b97baee8a849962f/lshashpy3-0.0.8.tar.gz
Building wheels for collected packages: lshashpy3
  Building wheel for lshashpy3 (setup.py) ... [?25l[?25hdone
  Created wheel for lshashpy3: filename=lshashpy3-0.0.8-cp36-none-any.whl size=8877 sha256=5e5e8e4d08787f1ad89813b7b545c0fe05f38e34dd7590f34ad9798e34c4290e
  Stored in directory: /root/.cache/pip/wheels/c6/81/29/a48985e27d56ddea4e601fda82b9be9962d20801b3cfa82c2d
Successfully built lshashpy3
Installing collected packages: lshashpy3
Successfully installed lshashpy3-0.0.8


In [110]:
from lshashpy3 import LSHash

In [26]:
vectorized_combined_content

<537364x77866 sparse matrix of type '<class 'numpy.int64'>'
	with 5498066 stored elements in Compressed Sparse Row format>

In [None]:
#def create_lsh_index(k):
d = 10000 # Dimension of Feature vector
lsh = LSHash(hash_size=k, input_dim=d)
for i in range(5374,100000):
  lsh.index(vectorized_combined_content[i].toarray().ravel().astype('int8'), extra_data=str(i))
  #return lsh

In [107]:
def create_lsh_index(k):
 # d = 10000 # Dimension of Feature vector
  #lsh = LSHash(hash_size=k, input_dim=d)
  for i in range(100000, 200000):
    lsh.index(vectorized_combined_content[i].toarray().ravel().astype('int8'), extra_data=str(i))
  return lsh

In [29]:
def query_data(lsh):
  cands = []
  for i in range(5374):
    nn = lsh.query(vectorized_combined_content[i].toarray().ravel().astype('int8'), num_results=1, distance_func="cosine")
    for ((vec,extra_data),distance) in nn:
      print(extra_data)
      if 1 - distance >= 0.8:
        if i not in cands: 
          cands.append(i)
          #print(len(cands))
  return cands

In [None]:
for k in k_list:
  lsh = create_lsh_index(k)
  cands = query_data(lsh)
  print(len(cands))
  break

In [1]:
!pip install --upgrade lshashing

Requirement already up-to-date: lshashing in /usr/local/lib/python3.6/dist-packages (1.0.1)


#Exact Cosine Similarity
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
dupcs_exact_cosine = []
query_time_exact_cosine = 0

In [17]:
def check_threshold(res):
  for i in range(len(res)):
    for c in res[i]:
      if c >= 0.8 and i not in dupcs_exact_cosine: 
        dupcs_exact_cosine.append(i)
        break

In [18]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[0:500], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [19]:
print("Duplicates found until test data row 500: " + str(len(dupcs_exact_cosine)))

Duplicates found until test data row 500: 183


In [20]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[500:1100], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [21]:
print("Duplicates found until test data row 1100: " + str(len(dupcs_exact_cosine)))

Duplicates found until test data row 1100: 344


In [22]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[1100:1700], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [23]:
print("Duplicates found until train data row 1700: " + str(len(dupcs_exact_cosine)))

Duplicates found until train data row 1700: 436


In [24]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[1700:2400], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [25]:
print("Duplicates found until test data row 2400: " + str(len(dupcs_exact_cosine)))

Duplicates found until test data row 2400: 528


In [26]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[2400:3100], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [27]:
print("Duplicates found until test data row 3100: " + str(len(dupcs_exact_cosine)))

Duplicates found until test data row 3100: 593


In [28]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[3100:4000], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [29]:
print("Duplicates found until test data row 4000: " + str(len(dupcs_exact_cosine)))

Duplicates found until test data row 4000: 703


In [30]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[4000:4700], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [31]:
print("Duplicates found until test data row 4700: " + str(len(dupcs_exact_cosine)))

Duplicates found until test data row 4700: 727


In [32]:
start = time.time() 
res = cosine_similarity(vectorized_combined_content_extended[4700:5374], vectorized_combined_content_extended[5374:])
check_threshold(res)
end = time.time()
query_time_exact_cosine = query_time_exact_cosine + (end - start)

In [33]:
print("Duplicates identified with exact cosine: " + str(len(dupcs_exact_cosine)))
print("Total query time with exact cosine: " + str(query_time_exact_cosine) + " s")

Duplicates identified with exact cosine: 739
Total query time with exact cosine: 532.7807335853577 s


In [38]:
print(dupcs_exact_cosine[0:100])

[0, 2, 5, 7, 9, 12, 18, 22, 23, 28, 29, 31, 35, 40, 43, 49, 50, 51, 52, 54, 57, 59, 60, 61, 65, 67, 72, 73, 76, 80, 91, 92, 94, 95, 101, 104, 107, 108, 110, 114, 115, 116, 117, 118, 123, 129, 133, 135, 137, 138, 144, 149, 156, 159, 161, 164, 166, 167, 169, 172, 177, 178, 180, 181, 183, 184, 186, 189, 193, 199, 200, 201, 202, 203, 208, 214, 219, 220, 222, 224, 227, 233, 234, 236, 238, 239, 240, 244, 246, 248, 252, 259, 264, 271, 272, 273, 274, 277, 280, 285]


#Exact Jaccard Similarity

In [116]:
def jaccard(list1, list2):
  intersection = len(list(set(list1).intersection(list2)))
  union = (len(list1) + len(list2)) - intersection
  return float("{:.2f}".format(float(intersection) / union))

In [117]:
splitted_train_corpus = []

for j in range(len(corpus_train)):
  splitted_train_corpus.append(corpus_train['Content'][j].split(" "))

In [None]:
jaccard_dups = []
start_time = time.time()

l = len(corpus_train)

for i in range(6):
  for j in range(l):
    j = jaccard(corpus_test['Content'][i].split(" "), splitted_train_corpus[j])
    if j > 0.8: 
      dups.append(i)
      break

end_time = time.time()
jaccard_query_time = end_time-start_time

print("Duplicates identified with exact jaccard: " + str(len(jaccard_dups)))
print("Total query time with exact jaccard: " + str(jaccard_query_time) + " s")

In [None]:
print("Duplicates identified with exact jaccard: " + str(len(jaccard_dups)))
print("Total query time with exact jaccard: " + str(jaccard_query_time) + " s")

#Results

In [34]:
l = ['-', '-','-', '-']
res = {'Type': ["Exact-Jaccard", "Exact-Cosine", "LSH-Cosine", "LSH-Jaccard"], 
      'BuildTime':l,
      'QueryTime':l,
       'TotalTime':l, 
       '#Duplicates':l, 
       'Parameters':l} 

res_df = pd.DataFrame(res)

#exact-cosine results
res_df['QueryTime'][1] = "532.8 s"
res_df['TotalTime'][1] = "532.8 s"
res_df['#Duplicates'][1] = 739

res_df

Unnamed: 0,Type,BuildTime,QueryTime,TotalTime,#Duplicates,Parameters
0,Exact-Jaccard,-,-,-,-,-
1,Exact-Cosine,-,532.8 s,532.8 s,739,-
2,LSH-Cosine,-,-,-,-,-
3,LSH-Jaccard,-,-,-,-,-
