In [1]:
!pip install --upgrade datasketch

Requirement already up-to-date: datasketch in /usr/local/lib/python3.6/dist-packages (1.5.3)


In [2]:
import pandas as pd

from nltk import wordpunct_tokenize
from nltk import tokenize

from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm

from string import digits
import re

#Data Pre-processing

In [3]:
from nltk.corpus import stopwords
import nltk

In [4]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##Reading

In [5]:
corpus_test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/corpusTest.csv")
corpus_test.shape
corpus_test.head()

Unnamed: 0,Id,Content
0,0,How do I get good marks in college?\n
1,1,Can an android app use SMS only to communicate...
2,2,What small detail from an Indian movie do you ...
3,3,Why can not Hindu women be the soldier of Hind...
4,4,How would you write out twelve lakh twelve tho...


In [6]:
corpus_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/corpusTrain.csv")
corpus_train.shape
corpus_train.head()

Unnamed: 0,Id,Content
0,0,How many people are going towards using phones...
1,1,What audio format should I use for getting aud...
2,2,What is the corporate culture like at Edwards ...
3,3,What is the best barbecue in Kansas City?\n
4,4,"""Can I combine the output of two bolts to one ..."


In [7]:
corpus_train.shape


(531990, 2)

##Shallow Cleaning

In [8]:
def blank_space(x):
  return re.sub('[^A-Za-z0-9]+', ' ', x)

def numbers(x):
  return re.sub(r'[0-9]+', '', x)

def standarize_sentence(x):
  return ''.join(''.join(word)[:2] for word in x) 

def apostrophe_words(x):
  Apos_dict={"'s":" is","n't":" not","'m":" am","'ll":" will", 
           "'d":" would","'ve":" have","'re":" are"} 
  for key,value in Apos_dict.items(): 
      if key in x: 
          return x.replace(key,value)
  return x

def split_words(x):
  return " ".join([word for word in re.split("([A-Z][a-z]+[^A-Z]*)",x) if word])

def shallow_cleaning(df):
  remove_digits = str.maketrans('', '', digits)
  df['Content'] = df['Content'].apply(lambda x: blank_space(str(x)))
  df['Content'] = df['Content'].apply(lambda x: numbers(str(x)))
  df['Content'] = df['Content'].apply(lambda x: split_words(str(x)))
  df['Content'] = df['Content'].apply(lambda x: standarize_sentence(str(x)))
  df['Content'] = df['Content'].apply(lambda x: apostrophe_words(str(x)))
  df['Content'] = df['Content'].str.strip()
  df['Content'] = df['Content'].str.lower()
  df['Content'].apply(lambda x: [item for item in x if item not in stop_words])
  return df

In [9]:
corpus_train = shallow_cleaning(corpus_train)
corpus_test = shallow_cleaning(corpus_test)

##Special Characters

In [10]:
spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<",
              "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~", "–"]

for char in spec_chars:
    corpus_test['Content'] = corpus_test['Content'].str.replace(char, '')

for char in spec_chars:
    corpus_train['Content'] = corpus_train['Content'].str.replace(char, '')

In [11]:
corpus_test.head()

Unnamed: 0,Id,Content
0,0,how do i get good marks in college
1,1,can an android app use sms only to communicat...
2,2,what small detail from an indian movie do you...
3,3,why can not hindu women be the soldier of hi...
4,4,how would you write out twelve lakh twelve tho...


In [12]:
corpus_train.head()

Unnamed: 0,Id,Content
0,0,how many people are going towards using phones...
1,1,what audio format should i use for getting au...
2,2,what is the corporate culture like at edwards...
3,3,what is the best barbecue in kansas city
4,4,can i combine the output of two bolts to one ...


In [13]:
#comb_ques = corpus_train['Content'].append(corpus_test['Content']).reset_index()
#comb_ques.head()

#Vectorization

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vectorizer = CountVectorizer(lowercase = True, stop_words = 'english', max_features = 10000)
vectorizer.fit(corpus_test['Content'])
vectorized_test_content = vectorizer.transform(corpus_test['Content'])
print(vectorized_test_content.shape)

(5374, 8380)


In [16]:
vectorizer = CountVectorizer(lowercase = True, stop_words = 'english', max_features = 10000)
vectorizer.fit(corpus_train['Content'])
vectorized_train_content = vectorizer.transform(corpus_train['Content'])
print(vectorized_train_content.shape)

(531990, 10000)


In [17]:
corpus_compined = pd.DataFrame(corpus_test['Content'].append(corpus_train['Content'], ignore_index = True)) 
print(len(corpus_compined['Content']))

537364


In [18]:
vectorizer = CountVectorizer(lowercase = True, stop_words = 'english', max_features = 10000)
vectorizer.fit(corpus_compined['Content'])
vectorized_combined_content = vectorizer.transform(corpus_compined['Content'])
print(vectorized_combined_content.shape)

(537364, 10000)


#MinHash LSH

##Map question id (eg 'm23') to set representation of question

In [27]:
#for train corpus
set_dict={} 
count=1
for question in tqdm_notebook([x for x in corpus_train['Content'] if type(x)==str]):
    temp_list = []
    for shingle in question.split(' '):
        if shingle not in stop_words:
            temp_list.append(shingle.lower())
    set_dict["m{0}".format(count)] = set(temp_list)
    count +=1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=531990.0), HTML(value='')))




In [28]:
#for test corpus
set_dict_test={}
count=1
for question in tqdm_notebook([x for x in corpus_test['Content'] if type(x)==str]):
    temp_list = []
    for shingle in question.split(' '):
        if shingle not in stop_words:
            temp_list.append(shingle.lower())
    set_dict_test["m{0}".format(count)] = set(temp_list)
    count +=1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=5374.0), HTML(value='')))




##Create minHash signatures: map question id (eg 'm23') to min hash signatures

In [29]:
num_perm = 32

In [30]:
min_dict = {}
count2 = 1
for val in tqdm_notebook(set_dict.values()): 
    m = MinHash(num_perm=num_perm)
    for shingle in val:
        m.update(shingle.encode('utf8'))
    min_dict["m{}".format(count2)] = m
    count2+=1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=531990.0), HTML(value='')))




In [31]:
min_dict_test = {}
count2 = 1
for val in tqdm_notebook(set_dict_test.values()): 
    m = MinHash(num_perm=num_perm)
    for shingle in val:
        m.update(shingle.encode('utf8'))
    min_dict_test["m{}".format(count2)] = m
    count2+=1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=5374.0), HTML(value='')))




##Create LSH index

In [32]:
lsh = MinHashLSH(threshold=0.8, num_perm=num_perm)

In [33]:
#for train corpus
for key in tqdm_notebook(min_dict.keys()):
    lsh.insert(key,min_dict[key])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=531990.0), HTML(value='')))




In [34]:
#for test corpus
for key in tqdm_notebook(min_dict_test.keys()):
  lsh.insert("m"+ str(int(re.sub("[^0-9]", "", key)) + 531990),min_dict_test[key])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=5374.0), HTML(value='')))




##Create candidate pairs

Method to create the candidate pairs in the LSH index. We need to keep in those from the train set.

In [35]:
def create_cand_pairs():
    big_list = []
    #query the test corpus
    for query in min_dict_test.keys():
        bucket = lsh.query(min_dict_test[query])
        if len(bucket)==1:
            big_list.append([bucket[0],"None"])
        if len(bucket)>1:
            first_val = bucket[0]
            for val in bucket[1:]:
                second_val = val
                big_list.append([first_val,second_val])
    return big_list
        
        
cand_pairs = create_cand_pairs()

In [36]:
import re
count = 0
lsh_dup_cand = []
for c in cand_pairs: 
  if c[0] != "None" and c[1] != "None":
    first = int(re.sub("[^0-9]", "", c[0]))
    second = int(re.sub("[^0-9]", "", c[1]))
    if first > 531989 and second <= 531989:
      count = count + 1 
      lsh_dup_cand.append([first, second])
      continue
    elif second > 531989 and first <= 531989:
      count = count + 1 
      lsh_dup_cand.append([second, first])
print("candidate duplicate pairs = " + str(count))

candidate duplicate pairs = 1092


#Exact Cosine Similarity

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
count = 0
i = 0
similar = []
res = cosine_similarity(vectorized_combined_content[0:5374], vectorized_combined_content[5374:100000])
for inner_res in res:
  for c in inner_res:
    if c > 0.8: 
      similar.append(i)
      count = count + 1
      break
  i = i + 1
print(count)

1116


In [21]:
i = 0
res = cosine_similarity(vectorized_combined_content[0:5374], vectorized_combined_content[100000:200000])
for inner_res in res:
  for c in inner_res:
    if c > 0.8 and i not in similar: 
      similar.append(i)
      count = count + 1
      break
  i = i +1
print(count)

1529


In [22]:
i = 0
res = cosine_similarity(vectorized_combined_content[0:5374], vectorized_combined_content[200000:300000])
for inner_res in res:
  for c in inner_res:
    if c > 0.8 and i not in similar: 
      similar.append(i)
      count = count + 1
      break
  i = i +1
print(count)

1790


In [23]:
i = 0
res = cosine_similarity(vectorized_combined_content[0:5374], vectorized_combined_content[300000:400000])
for inner_res in res:
  for c in inner_res:
    if c > 0.8 and i not in similar: 
      similar.append(i)
      count = count + 1
      break
  i = i +1
print(count)

1998


In [24]:
i = 0
res = cosine_similarity(vectorized_combined_content[0:5374], vectorized_combined_content[400000:])
for inner_res in res:
  for c in inner_res:
    if c > 0.8 and i not in similar: 
      similar.append(i)
      count = count + 1
      break
  i = i +1
print(count)

2218


In [25]:
print("Duplicates identified with exact cosine: " + str(len(similar)))

Duplicates identified with exact cosine: 2218


#Exact Jaccard Similarity

In [None]:
import numpy as np
from sklearn.metrics import jaccard_score, accuracy_score

In [None]:
vectorized_test_arr = vectorized_combined_content[0:5374].toarray()

In [None]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [None]:
for j in range(5374):
  c = jaccard_score(vectorized_test_arr[5373], vectorized_test_arr[j], average='macro')
  if c > 0.8: 
    count = count + 1

In [None]:
count = 0
temp = []
for i in range(5374, 6000):
  temp = vectorized_combined_content[i].toarray()
  for j in range(5374):
    c = jaccard_score(temp[0], vectorized_test_arr[j], average='macro')
    if c > 0.8: 
      count = count + 1
      break
print(count)

38


#LSH-Cosine

In [44]:
count = 0
i = 0
similar_lsh_con = []
for cand in lsh_dup_cand:
  res = cosine_similarity(vectorized_combined_content[cand[0]], vectorized_combined_content[5375:])
  for inner_res in res:
    for c in inner_res:
      if c > 0.8: 
        similar_lsh_con.append(i)
        count = count + 1
        break
  i = i + 1
print(count)

1082


In [41]:
lsh_dup_cand[0]
res = cosine_similarity(vectorized_combined_content[lsh_dup_cand[0]], vectorized_combined_content[5375:])
res

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.25819889, ..., 0.        , 0.        ,
        0.        ]])

# LSH Jaccard
