In [1]:
import pandas as pd
import numpy as np
import re
import nltk

from signatures_utils import *

In [2]:
# Read the three feather shards and stack them row-wise into a single frame.
# Row labels coming from each shard are kept as-is (no re-indexing is requested).
feather_paths = [
    'data/text_df_1.feather',
    'data/text_df_2.feather',
    'data/text_df_3.feather',
]
data = pd.concat([pd.read_feather(path) for path in feather_paths])

In [10]:
# Draw a reproducible sample holding one tenth of the rows (rounded down);
# the fixed random_state makes the same rows come back on every run.
sample_size = len(data) // 10
data_sample = data.sample(n=sample_size, random_state=123)
data_sample.shape

(56845, 4)

In [4]:
# Noise pattern; every match is replaced by a single space. Alternatives, in order:
#   1. runs of digits and the punctuation . , ! ? ; : @ # &
#   2. anything enclosed in angle brackets (e.g. markup tags)
#   3. http:// and https:// URLs up to the next whitespace
#   4. single quote, bracket, underscore, slash, plus, backslash, dollar or hyphen characters
html_clean = re.compile(r'[\d.,!?;:@#&]+|<[^>]*>|https?://\S+|["\'\(\)_/+\\$-]')
whitespace_run = re.compile(r'\s+')


def _strip_noise(text):
    """Blank out noise matches, then collapse each whitespace run to one space."""
    without_noise = html_clean.sub(r' ', text)
    return whitespace_run.sub(' ', without_noise)


def _unique_bigrams(text):
    """Return the distinct lowercase word bigrams of `text` as space-joined strings."""
    tokens = text.lower().split()
    distinct_pairs = set(nltk.ngrams(tokens, 2))
    return [" ".join(pair) for pair in distinct_pairs]


data_sample['Cleaner_text'] = data_sample['Text'].apply(_strip_noise)

data_sample['Ngrams'] = data_sample['Cleaner_text'].apply(_unique_bigrams)

In [5]:
# Collapse to one row per UserId. Summing the per-row 'Ngrams' values within a
# group adds them together; for list values that means concatenation.
ngrams_by_user = data_sample[['UserId', 'Ngrams']].groupby('UserId')
data_agg = ngrams_by_user.sum()

# Deduplicate each user's combined n-grams.
data_agg['Ngrams'] = data_agg['Ngrams'].apply(set)

# k is handed to minhash2 as its second argument
# (presumably the signature length; confirm against signatures_utils).
k=100
data_agg['Minhash'] = data_agg['Ngrams'].apply(lambda ngram_set: minhash2(ngram_set, k))

In [6]:
# Move the current index into a regular column, leaving a default integer index.
# NOTE: this rebinds data_agg, so running the cell a second time resets the index again.
data_agg = data_agg.reset_index(drop=False)

# Write the aggregated frame to disk without the integer index.
sample_csv_path = 'data/sample.csv'
data_agg.to_csv(sample_csv_path, index=False)

In [7]:
from tqdm import tqdm

seed = 123
jaccard_threshold = 0.035

# Row label -> Minhash value for every row of data_agg.
lsh_dict = dict(zip(data_agg.index, data_agg.Minhash))
list_keys = list(lsh_dict.keys())

similar_items = {}
count = 0
n_keys = len(list_keys)
# Visit every unordered pair of rows exactly once (i < j).
for i in tqdm(range(n_keys - 1)):
    key_i = list_keys[i]
    signature_i = lsh_dict[key_i]
    for j in range(i + 1, n_keys):
        key_j = list_keys[j]
        count += 1
        common_values = np.intersect1d(signature_i, lsh_dict[key_j])
        if len(common_values) == 0:
            # the two signatures share no value: not a candidate pair
            continue
        # candidate pair: score it with the jaccard helper
        # (presumably provided by signatures_utils; confirm)
        similarity_score = jaccard(key_i, key_j, lsh_dict)
        if similarity_score > jaccard_threshold:
            similar_items[(key_i, key_j)] = similarity_score
print(len(similar_items))
print(count)

100%|██████████| 2791/2791 [04:19<00:00, 10.76it/s] 

8393
3896236





In [8]:
# Re-key each scored pair from its (row label, row label) tuple to a
# "UserId|UserId" string built from the corresponding UserId values.
user_ids = data_agg.UserId
similar_items_new = {
    user_ids[left_label] + "|" + user_ids[right_label]: score
    for (left_label, right_label), score in similar_items.items()
}

In [9]:
# Persist the pair -> similarity-score mapping as a JSON file.
import json

# Serialize straight into the file. The context manager closes the handle even
# if serialization or the write fails, and the `json` name stays bound to the
# module (it is not overwritten by the serialized string), so later cells can
# keep using it.
with open("data/similarity_dict.json", "w") as f:
    json.dump(similar_items_new, f)