In [1]:
import pickle
# Load the pickled TF-IDF model (presumably a fitted scikit-learn vectorizer —
# later cells read its `vocabulary_`, `idf_` and `transform`).
# The context manager guarantees the file handle is closed after loading.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/processed/Text_Models/tfidf_max_df_0.5','rb') as model_file:
    tfidf_model = pickle.load(model_file)

In [2]:
# Load the pickled word-vector model (later cells use its `wv` attribute).
# The context manager guarantees the file handle is closed after loading.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/processed/Text_Models/w2v_model.pkl','rb') as model_file:
    w2v_model = pickle.load(model_file)

In [3]:
# Number of keys (tokens) known to the word-vector model.
len(w2v_model.wv.index_to_key)

86763

In [4]:
# Number of distinct terms in the TF-IDF model's vocabulary mapping.
len(tfidf_model.vocabulary_)

316783

In [5]:
import numpy as np
# Materialise both vocabularies as sets for O(1) membership tests below.
tf_idf_vocab = set(tfidf_model.vocabulary_)
vocab = set(w2v_model.wv.index_to_key)

# Report: shared tokens, word-vector vocabulary size, TF-IDF vocabulary size.
print(len(vocab & tf_idf_vocab), len(vocab), len(tf_idf_vocab))
def get_idf_score(word, default=0.5):
    """Return the IDF weight of ``word`` from the TF-IDF model.

    Parameters
    ----------
    word : str
        Token to look up in ``tfidf_model.vocabulary_``.
    default : float, optional
        Value returned when ``word`` is not in the TF-IDF vocabulary.
        Defaults to 0.5.

    Returns
    -------
    float
        ``tfidf_model.idf_[column]`` for a known word, otherwise ``default``.
    """
    # A single dictionary lookup replaces the membership test against a
    # separately built set followed by a second lookup; the function now
    # depends only on ``tfidf_model`` itself.
    column = tfidf_model.vocabulary_.get(word)
    if column is None:  # explicit None check: column index 0 is a valid hit
        return default
    return tfidf_model.idf_[column]

def get_vector_w2v_model(text):
    """Embed each paragraph as the sum of its in-vocabulary word vectors.

    Parameters
    ----------
    text : sequence of str
        Paragraphs; each one is tokenised by splitting on a single space.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(text), vector_size)``. Row ``i`` holds the sum of
        the vectors of the words of ``text[i]`` that are present in ``vocab``;
        rows for paragraphs without any known word stay all-zero.
    """
    vec_size = w2v_model.wv.vectors.shape[1]
    vectors_em = np.zeros(shape=(len(text), vec_size))
    for idx, para in enumerate(text):
        # Accumulate straight into the output row. The earlier implementation
        # gated the final assignment on a hit counter that was never
        # incremented, so every row was left as zeros.
        for word in para.split(' '):
            if word in vocab:
                vectors_em[idx, :] += w2v_model.wv.get_vector(word)
    return vectors_em

def tfidf_weight_w2v_model(text):
    """Embed each paragraph as the IDF-weighted sum of its word vectors.

    Parameters
    ----------
    text : sequence of str
        Paragraphs; each one is tokenised by splitting on a single space.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(text), vector_size)``. Row ``i`` is the sum over
        the words of ``text[i]`` found in ``vocab`` of
        ``get_idf_score(word) * vector(word)``; paragraphs with no known word
        keep an all-zero row.
    """
    dim = w2v_model.wv.vectors.shape[1]
    embeddings = np.zeros(shape=(len(text), dim))
    for row, paragraph in enumerate(text):
        # Keep only the tokens the word-vector model knows about.
        known_tokens = [tok for tok in paragraph.split(' ') if tok in vocab]
        if not known_tokens:
            continue  # nothing to embed; leave the zero row in place
        weighted_sum = np.zeros(dim)
        for tok in known_tokens:
            idf_score = get_idf_score(tok)
            weighted_sum += idf_score*w2v_model.wv.get_vector(tok)
        embeddings[row, :] = weighted_sum
    return embeddings

def tfidf_matrix(paras):
    """Vectorise paragraphs with the TF-IDF model.

    Parameters
    ----------
    paras : iterable of str
        Paragraphs passed unchanged to ``tfidf_model.transform``.

    Returns
    -------
    Whatever ``tfidf_model.transform`` returns (presumably a sparse
    paragraph-by-term matrix — TODO confirm against the fitted model).
    """
    paragraph_term_matrix = tfidf_model.transform(paras)
    return paragraph_term_matrix
    

74194 86763 316783


### Input prep

In [6]:
from glob import glob
from tqdm import tqdm
# Read every processed judgement into memory as (text, judgement_id) pairs.
# The id is the file name up to its first '.', e.g. '134092'.
docs = []
for path in tqdm(glob("../data/processed/processed_judgements/*")):
    # Context manager: with tens of thousands of files, relying on the garbage
    # collector to close each handle risks exhausting file descriptors.
    with open(path) as judgement_file:
        docs.append((judgement_file.read(), path.split('/')[-1].split('.')[0]))

100%|████████████████████████████████████| 53897/53897 [02:29<00:00, 360.14it/s]


In [19]:
# Vectorisation method read by get_matrix and used to build the output paths.
method_name = 'w2v_idf' # one of: tfidf, w2v, w2v_idf
def get_matrix(inputs):
    '''
    Vectorise one document paragraph-by-paragraph and pickle the result.

    inputs : ``(text, jid)`` tuple. ``text`` is the whole document as one
             string and is split into paragraphs on newlines; ``jid`` is the
             identifier used as the output file name.

    The vectoriser is selected by the module-level ``method_name``
    ('tfidf', 'w2v' or 'w2v_idf'). The resulting matrix is written to
    ``../data/processed/paragraph_matrix/{method_name}/{jid}.pkl``.

    Returns None. Raises ValueError when ``method_name`` is not recognised.
    '''
    import os  # local import: `os` is only imported in a later notebook cell

    text, jid = inputs
    text = text.split('\n')
    if method_name == 'tfidf':
        out = tfidf_matrix(text)
    elif method_name == 'w2v':
        out = get_vector_w2v_model(text)
    elif method_name == 'w2v_idf':
        out = tfidf_weight_w2v_model(text)
    else:
        # Fail loudly instead of silently pickling None for a mistyped method.
        raise ValueError(
            f"Unknown method_name {method_name!r}; "
            "expected 'tfidf', 'w2v' or 'w2v_idf'"
        )
    path = f'../data/processed/paragraph_matrix/{method_name}/{jid}.pkl'
    # exist_ok=True keeps this safe when several pool workers race to create
    # the same output directory.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # The context manager closes the file even if pickling raises.
    with open(path, 'wb') as file:
        pickle.dump(out, file)

In [20]:
# Sanity check: identifier of the first loaded document.
docs[0][1]

'134092'

In [21]:
# Confirm which vectorisation method is active before starting the pool run.
method_name

'w2v_idf'

In [22]:
import os
from multiprocessing import Pool
from datetime import datetime

# Print the start time so the run duration can be read off afterwards.
now = datetime.now()
print(now)

preped_text = []

# One worker process per CPU reported by the OS.
workers = os.cpu_count()
print(workers)

# Each worker vectorises a document and writes its own pickle; get_matrix
# returns None, so the mapped result is not kept.
with Pool(processes=workers) as pool:
    pool.map(get_matrix, docs)

2023-01-22 20:36:50.119513
40


Process ForkPoolWorker-43:
Process ForkPoolWorker-80:
Process ForkPoolWorker-78:
Process ForkPoolWorker-42:
Process ForkPoolWorker-53:
Process ForkPoolWorker-60:
Process ForkPoolWorker-68:
Process ForkPoolWorker-58:
Process ForkPoolWorker-48:
Process ForkPoolWorker-45:
Process ForkPoolWorker-76:
Process ForkPoolWorker-55:
Process ForkPoolWorker-64:
Process ForkPoolWorker-47:
Process ForkPoolWorker-57:
Process ForkPoolWorker-75:
Process ForkPoolWorker-73:
Process ForkPoolWorker-50:
Process ForkPoolWorker-77:
Process ForkPoolWorker-72:
Process ForkPoolWorker-54:
Process ForkPoolWorker-66:
Process ForkPoolWorker-62:
Process ForkPoolWorker-41:
Process ForkPoolWorker-56:
Process ForkPoolWorker-67:
Process ForkPoolWorker-74:
Process ForkPoolWorker-44:
Process ForkPoolWorker-69:
Process ForkPoolWorker-70:
Process ForkPoolWorker-49:
Process ForkPoolWorker-51:
Process ForkPoolWorker-46:
Process ForkPoolWorker-63:
Process ForkPoolWorker-79:
Process ForkPoolWorker-59:
Process ForkPoolWorker-71:
P

KeyboardInterrupt: 

In [None]:
# Print the finish time; compare with the start time printed before the pool ran.
print(datetime.now())

### Combine

In [None]:
# Gather the per-document pickles into one dict keyed by document id
# (the file name up to its first '.').
matrix_map = dict()
for path in tqdm(glob(f'../data/processed/paragraph_matrix/{method_name}/*')):
    key = path.split('/')[-1].split('.')[0]
    # Context manager: close each handle right away instead of leaking one
    # open file per document.
    with open(path,'rb') as matrix_file:
        matrix_map[key] = pickle.load(matrix_file)

In [None]:
# Write the combined mapping to a single pickle. The context manager flushes
# and closes the file even if pickling fails or a later cell is never run.
# `file` stays bound afterwards, so a later `file.close()` is a harmless no-op.
with open(f'../data/processed/paragraph_matrix/{method_name}.pkl','wb') as file:
    pickle.dump(matrix_map,file)

In [None]:
# Close the handle used for the combined pickle written in the previous cell.
file.close()

In [None]:
# Drop the reference to the combined mapping so its memory can be reclaimed.
del matrix_map

In [None]:
import gc
# Run a full garbage-collection pass now that the large mapping was deleted.
gc.collect()