In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from tqdm import tqdm
import json

# importing functions for signatures calculation
from signatures_utils import *

In [2]:
# Read the three feather shards and stack them, in this order, into a single frame.
shard_paths = (
    'data/text_df_1.feather',
    'data/text_df_2.feather',
    'data/text_df_3.feather',
)
data = pd.concat([pd.read_feather(path) for path in shard_paths])
# Draw a reproducible sample of one twentieth of the rows (fractional size is truncated)
# and persist it before any further columns are added.
sample_size = int(len(data) / 20)
data_sample = data.sample(n=sample_size, random_state=123)
data_sample.to_csv('data/sample_no_agg.csv', index=False)
data_sample.shape

(28422, 4)

### 2-grams 

In [3]:
# Pattern for everything stripped from a review: runs of digits / common punctuation,
# HTML tags, http(s) URLs, and a handful of remaining single symbols.
html_clean = re.compile(r'[\d.,!?;:@#&]+|<[^>]*>|https?://\S+|["\'\(\)_/+\\$-]')
# Each match becomes a space, after which every whitespace run is squeezed to one space.
data_sample['Cleaner_text'] = data_sample['Text'].apply(
    lambda review: re.sub(r'\s+', ' ', html_clean.sub(r' ', review))
)
# 2-grams (shingles): lower-case the cleaned text, split on whitespace, keep each distinct
# pair of consecutive words once and store it as a single space-joined string.
data_sample['Ngrams'] = data_sample['Cleaner_text'].apply(
    lambda text: [" ".join(pair) for pair in set(nltk.ngrams(text.lower().split(), 2))]
)

### Minhashing

In [4]:
# One row per user: the 'sum' aggregation adds up the per-review 2-gram lists of each
# UserId group (adding lists concatenates them).
user_ngrams = data_sample[['UserId', 'Ngrams']]
data_agg = user_ngrams.groupby('UserId').apply('sum')
# Turn each concatenated list into a set so that every 2-gram is kept once per user.
data_agg['Ngrams'] = data_agg['Ngrams'].apply(set)
# Minhash value per user; k is handed to minhash2 as its second argument.
k = 100
data_agg['Minhash'] = data_agg['Ngrams'].apply(lambda shingles: minhash2(shingles, k))

In [7]:
# saving aggregated data
# The groupby above leaves 'UserId' in the index. Move it into a regular column only if
# that has not happened yet: an unconditional reset_index() would, when this cell is
# re-run, add a spurious 'index' column to the frame and to the CSV written below.
if 'UserId' not in data_agg.columns:
    data_agg = data_agg.reset_index()
data_agg.to_csv('data/sample.csv', index=False)

### Jaccard similarity

In [8]:
seed = 123
# a pair of users is kept only when its similarity score exceeds this value
jaccard_threshold = 0.035
# lookup table: row label of data_agg -> Minhash value stored in that row
lsh_dict = dict(zip(data_agg.index, data_agg.Minhash))
list_keys = list(lsh_dict.keys())
# results: (label_a, label_b) -> similarity score, for pairs above the threshold
similar_items = {}
# number of pairs visited
count = 0
# visit every unordered pair of keys exactly once (second position > first position)
for first_pos in tqdm(range(len(list_keys)-1)):
    first_key = list_keys[first_pos]
    for second_pos in range(first_pos + 1, len(list_keys)):
        second_key = list_keys[second_pos]
        count += 1
        # pre-filter: pairs whose signatures share no value at all are skipped
        shared_values = np.intersect1d(lsh_dict[first_key], lsh_dict[second_key])
        if len(shared_values) == 0:
            continue
        # score the pair with the imported jaccard helper and keep it if high enough
        pair_score = jaccard(first_key, second_key, lsh_dict)
        if pair_score > jaccard_threshold:
            similar_items[(first_key, second_key)] = pair_score
# how many pairs passed the threshold
print(len(similar_items))
# how many pairs were examined in total
print(count)

100%|██████████| 24556/24556 [2:10:29<00:00,  3.14it/s]  

668550
301510846





In [9]:
# Re-key the similar pairs by user id: 'user1|user2' -> similarity.
# The keys of similar_items are pairs of data_agg row labels; each label is looked up in
# the UserId column to obtain the two ids that are joined with '|'.
user_ids = data_agg.UserId
similar_items_new = {
    user_ids[first_label] + "|" + user_ids[second_label]: score
    for (first_label, second_label), score in similar_items.items()
}

In [10]:
# Serialise the similarity dictionary to a JSON string. The string gets its own name
# instead of being assigned to `json`: rebinding `json` would replace the imported module
# with a str, so re-running this cell (or any later json.* call) would raise
# AttributeError.
similarity_json = json.dumps(similar_items_new)
# Write the string to disk; the context manager closes the file even if the write fails.
with open("data/similarity_dict.json", "w") as f:
    f.write(similarity_json)