In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import sys
import time
import copy
import multiprocessing as mp
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
warnings.filterwarnings('ignore')

In [2]:
topic = pd.read_pickle(os.path.join('../pkl', 'topic.pkl'))
topic.shape

In [3]:
topic_embeddings = np.vstack(list(topic['vector'].values))
topic_embeddings.shape

(100000, 64)

In [4]:
def cos_sim(a, b):
    return 1 - spatial.distance.cosine(a, b)

In [5]:
%%time

if not os.path.exists('./tmp'):
    os.mkdir('./tmp')

tic = time.time()
    
def process(i):
    topic_comb_sim = np.zeros(100000)
    for j in range(i+1, 100000):
        a = topic_embeddings[i]
        b = topic_embeddings[j]
        sim = cos_sim(a, b)
        # topic_comb_sim_dict[j+1] = sim
        topic_comb_sim[j] = sim
    np.save('./tmp/%d.npy' % i, topic_comb_sim)
    ## cal time
    ut = time.time() - tic
    pp = len(os.listdir('./tmp'))
    tt = ut / pp * 100000
    lt = tt - ut
    sys.stdout.write(
        '\r>> Processing data %d/%d, Time used: %ds, less: %ds, total: %ds' % (pp, 100000, ut, lt, tt))
    sys.stdout.flush()
    return 1

with mp.Pool(7) as pool:
    ret = pool.map(process, np.arange(100000))

>> Processing data 100000/100000, Time used: 24523s, less: 0s, total: 24523ssssCPU times: user 1min 8s, sys: 1min 10s, total: 2min 19s
Wall time: 6h 48min 43s


In [5]:
topic_comb_sim_mat = np.zeros((100000, 100000))
for i in tqdm(range(100000)):
    topic_comb_sim_mat[i] = np.load('./tmp/%d.npy' % i)

100%|██████████| 100000/100000 [26:41<00:00, 62.43it/s] 


In [6]:
topic_comb_sim_mat.shape

(100000, 100000)

In [7]:
topic_comb_sim_mat

array([[ 0.        , -0.15703143, -0.02313213, ...,  0.13295594,
         0.12396958, -0.04231013],
       [ 0.        ,  0.        ,  0.35224531, ..., -0.12606979,
        -0.21545197,  0.03766749],
       [ 0.        ,  0.        ,  0.        , ..., -0.08080298,
        -0.03183382,  0.24938245],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.21733336,  0.01743048],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.02675648],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [8]:
np.save('../pkl/topic_comb_sim_mat.npy', topic_comb_sim_mat)

In [9]:
def get_topic_sim(i, j):
    if i == j:
        return 1
    if i > j:
        return topic_comb_sim_mat[j][i]
    else:
        return topic_comb_sim_mat[i][j]

In [10]:
get_topic_sim(0,1)

-0.1570314343444803

In [11]:
get_topic_sim(1,0)

-0.1570314343444803

In [12]:
get_topic_sim(1, 1)

1