In [1]:
import os, sys
import json
import re
from clean_text import cleaned_text
import tfidf_searching
import pandas as pd
from multiprocessing import Pool
import time
from tqdm import tqdm

In [79]:
# Re-import tfidf_searching so edits made to the module on disk take effect in
# this kernel without a restart.
import importlib
importlib.reload(tfidf_searching)

<module 'tfidf_searching' from '/home/chentianyu/libgen/tf-idf/tfidf_searching.py'>

In [2]:
# Load the raw npm vulnerability reports; each report carries an `affected`
# list whose entries name a package.
reports_path = '{path_to_libgen}/libgen/dataset/npm_reports.json'
with open(reports_path, 'r') as f:
    pypi_reports = json.load(f)

# Distinct 'npm:<name>' identifiers across all reports.
# NOTE(review): names keep their original case here, while extract_libraries()
# lowercases the labels — confirm the two are meant to differ.
top_names = {
    'npm:' + lib['package']['name']
    for report in pypi_reports
    for lib in report['affected']
}

In [3]:
# Build the search corpus: one row per distinct library identifier, with a
# space-joined token string derived from the identifier.
pypi_corpus = pd.DataFrame({'object': list(top_names)})
pypi_corpus['token'] = pypi_corpus['object'].apply(cleaned_text)
# Drop identifiers that clean down to no tokens. `.copy()` makes the filtered
# frame independent of the original, so the column assignment below does not
# write into a slice (avoids pandas' SettingWithCopyWarning).
pypi_corpus = pypi_corpus[pypi_corpus['token'].apply(len) > 0].copy()
pypi_corpus['token'] = pypi_corpus['token'].apply(' '.join)

In [4]:
# Construct the searcher over the library-name corpus.
# NOTE(review): the meaning of the positional arguments 512 and 2 is defined in
# tfidf_searching.TfidfSearching, which is not visible here — confirm, and
# consider naming them as constants.
search_engine = tfidf_searching.TfidfSearching(pypi_corpus, 512, 2)

In [5]:
# Query the search engine with each report's cleaned description and store the
# returned candidates on the report under 'top_k'.
for report in tqdm(pypi_reports):
    report['top_k'] = search_engine.search_topk_objects(cleaned_text(report['details']), [])

100%|██████████| 3193/3193 [01:15<00:00, 42.21it/s]


In [6]:
# Wrap each retrieved library in the {'lib_name', 'website_description'} record
# that precision()/recall() read 'lib_name' from.
# Entries that are already wrapped are left untouched so that re-running this
# cell does not nest records inside records.
for vuln in tqdm(pypi_reports):
    vuln['top_k'] = [
        lib if isinstance(lib, dict) and 'lib_name' in lib
        else {'lib_name': lib, 'website_description': ' '}
        for lib in vuln['top_k']
    ]

100%|██████████| 3193/3193 [00:00<00:00, 12130.30it/s]


In [19]:
# Attach the ground-truth library labels to every report.
# NOTE: extract_libraries is defined in a cell that appears further down in
# this notebook; that cell must be run first on a fresh kernel.
for vuln in tqdm(pypi_reports):
    vuln['labels'] = extract_libraries(vuln)

100%|██████████| 3193/3193 [00:00<00:00, 438161.71it/s]


In [8]:
# Mean precision / recall / F1 of the retrieved candidates at cutoff k.
# NOTE: precision, recall and f1_score are defined in a cell that appears
# further down in this notebook; that cell must be run first on a fresh kernel.
k = 512
# Score every report exactly once (the helpers re-extract labels on each call).
scores = [
    (precision(vuln, vuln['top_k'], k), recall(vuln, vuln['top_k'], k))
    for vuln in pypi_reports
]
# precision() returns None for reports without labels; drop those reports from
# both lists so p and r stay aligned.
p = [prec for prec, rec in scores if prec is not None]
r = [rec for prec, rec in scores if prec is not None]
mean_p, mean_r = sum(p) / len(p), sum(r) / len(r)
mean_p, mean_r, f1_score(mean_p, mean_r)

(0.8867375072152064, 0.8867375072152064, 0.8867375072152064)

In [13]:
# Positional 3/5 - 1/5 - 1/5 split of the reports into train / valid / test.
n_reports = len(pypi_reports)
train_sep_idx = int(n_reports * 3 / 5)
valid_sep_idx = int(n_reports * 4 / 5)
train, valid, test = (
    pypi_reports[:train_sep_idx],
    pypi_reports[train_sep_idx:valid_sep_idx],
    pypi_reports[valid_sep_idx:],
)

In [20]:
# Persist the three splits as <target_dir>/{train,valid,test}.json.
# The '{path_to_libgen}' prefix is a literal placeholder in this string and
# must be replaced with a real location before running.
target_dir = '{path_to_libgen}/libgen/tf-idf/npm'
# makedirs(exist_ok=True) creates missing parents and is a no-op when the
# directory already exists, replacing the check-then-mkdir sequence.
os.makedirs(target_dir, exist_ok=True)

for split_name, split_reports in (('train', train), ('valid', valid), ('test', test)):
    with open(os.path.join(target_dir, split_name + '.json'), 'w') as f:
        json.dump(split_reports, f)

In [7]:
def extract_libraries(vuln):
    """Return the distinct ground-truth library labels of a report.

    Each entry of ``vuln['affected']`` contributes ``'npm:' + <lowercased
    package name>``. A package that appears in several ``affected`` entries is
    reported once, in first-seen order, so that callers dividing by the number
    of labels are not skewed by repeats.

    Args:
        vuln: report dict with an ``'affected'`` list whose items expose
            ``item['package']['name']``.

    Returns:
        list[str]: de-duplicated labels; empty when ``affected`` is empty.
    """
    labels = ('npm:' + lib['package']['name'].lower() for lib in vuln['affected'])
    # dict.fromkeys drops repeats while keeping insertion order.
    return list(dict.fromkeys(labels))

def closest_lib(lib_name):
    """Return ``lib_name`` unchanged.

    This function previously carried a fuzzy-matching body (weighted
    Levenshtein distance against a global ``npm_lib_names`` list) placed after
    an unconditional ``return``. That body could never run, and it referred to
    names this notebook neither imports nor defines, so it has been removed.

    Args:
        lib_name: library identifier.

    Returns:
        The same ``lib_name`` object that was passed in.
    """
    return lib_name

def precision(vuln, pred, k):
    """Precision@k of the predicted libraries against a report's labels.

    Computed as ``|labels ∩ pred[:k]| / min(k, |labels|)``, where ``labels`` is
    the set of distinct labels from ``extract_libraries(vuln)``. Using the
    distinct count in the denominator keeps it consistent with the set-based
    numerator, so a repeated label cannot cap the score below 1.

    Args:
        vuln: report dict passed to ``extract_libraries``.
        pred: ranked predictions, each a dict with a ``'lib_name'`` key.
        k: cutoff; only the first ``k`` predictions are considered.

    Returns:
        float, or ``None`` when the report has no labels.
    """
    predicted = [lib['lib_name'] for lib in pred]
    label_set = set(extract_libraries(vuln))
    if not label_set:
        return None
    hits = label_set & set(predicted[:k])
    return len(hits) / min(k, len(label_set))

def recall(vuln, pred, k):
    """Recall@k of the predicted libraries against a report's labels.

    Computed as ``|labels ∩ pred[:k]| / |labels|``, where ``labels`` is the set
    of distinct labels from ``extract_libraries(vuln)``. Using the distinct
    count in the denominator keeps it consistent with the set-based numerator,
    so a repeated label cannot cap the score below 1.

    Args:
        vuln: report dict passed to ``extract_libraries``.
        pred: ranked predictions, each a dict with a ``'lib_name'`` key.
        k: cutoff; only the first ``k`` predictions are considered.

    Returns:
        float, or ``None`` when the report has no labels.
    """
    predicted = [lib['lib_name'] for lib in pred]
    label_set = set(extract_libraries(vuln))
    if not label_set:
        return None
    hits = label_set & set(predicted[:k])
    return len(hits) / len(label_set)

def f1_score(p, r):
    """Harmonic mean of precision ``p`` and recall ``r``.

    Args:
        p: precision value.
        r: recall value.

    Returns:
        ``2*p*r / (p+r)``, or ``0.0`` when ``p + r`` is zero (the harmonic mean
        is undefined there; 0.0 avoids a ZeroDivisionError).
    """
    denom = p + r
    if denom == 0:
        return 0.0
    return 2 * p * r / denom

In [None]:
# Parallel variant of the top-k search: clean every description up front, then
# fan the queries out over a process pool.
desc_inputs = [cleaned_text(vuln['details']) for vuln in pypi_reports]
# NOTE(review): imap passes a single argument per call, whereas the sequential
# loop calls search_topk_objects(tokens, []) — confirm the second parameter has
# a default.
with Pool(processes=32) as pool:
    # pool.imap returns an iterator with no length, so tqdm needs `total` to
    # show a percentage and ETA.
    tf_idf_res = list(tqdm(pool.imap(search_engine.search_topk_objects, desc_inputs),
                           total=len(desc_inputs)))

In [62]:
# Scratch check. NOTE(review): `pypi_list` is not defined in any cell visible in
# this notebook — this relies on leftover kernel state.
'tensorflow' in pypi_list

True

In [None]:
# Scratch check: split every corpus token string on whitespace. This displays
# the full list; slice it if only a sample is needed.
[word.split() for word in search_engine.corpus['token']]

In [68]:
# Scratch check: look up 'tensorflow' in the search engine's lib_name_index.
search_engine.lib_name_index['tensorflow']

('tensorflow', 447282)

In [14]:
# Scratch check: see how cleaned_text tokenizes a single library name.
cleaned_text('tensorflow')

['tensorflow']

In [None]:
# Scratch check: single-token query, passing an empty list as the second
# argument (same call shape as the main retrieval loop).
search_engine.search_topk_objects(['tensorflow'], [])

In [None]:
# Scratch check: the same query without the second argument.
# NOTE(review): this only works if that parameter has a default — confirm in
# tfidf_searching.
search_engine.search_topk_objects(['tensorflow'])