In [31]:
import pandas as pd
import numpy as np
import os
import json
import spacy
from sklearn.neighbors import KDTree
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [33]:
# Directory that is scanned for the annotation JSON files of the dataset.
ds_location = '../../spike/datasets/tacred-train-labeled/ann'
# Name of the spaCy pipeline package used to embed sentences.
spacy_model = 'en_core_web_lg'

In [112]:
# Load the first sentence of every annotation file into a DataFrame and
# derive the de-duplicated, sorted sentence corpus from it.
fields = 'entities words tags'.split()
rows = []
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order of `df` reproducible across machines and runs.
for f in sorted(os.listdir(ds_location)):
    # Explicit encoding: without it open() falls back to the locale default,
    # which can fail or mis-decode non-ASCII JSON text on some platforms.
    with open(os.path.join(ds_location, f), encoding='utf-8') as fin:
        js = json.load(fin)
    # Only the first sentence of each file is used.
    sent = js['sentences'][0]
    rows.append([f] + [sent[x] for x in fields])

df = pd.DataFrame(rows, columns=['filename'] + fields)
# Re-assemble the raw sentence text from its token list.
df['text'] = df['words'].map(' '.join)
# Unique sentence texts in ascending order (same content and order that the
# previous groupby/first/reset_index chain produced).
corpus = df['text'].drop_duplicates().sort_values().to_numpy()

In [34]:

# Load the spaCy pipeline named by `spacy_model`; its document vectors are
# used as sentence embeddings below.
nlp = spacy.load(spacy_model)

In [124]:
# Embed every corpus sentence with spaCy (ner, parser and tagger switched off)
# and scale each row with sklearn's `normalize`.
vecs = normalize([
    nlp(txt, disable=['ner', 'parser', 'tagger']).vector
    for txt in tqdm(corpus, position=0)
])

100%|██████████| 37311/37311 [00:05<00:00, 7260.40it/s]


In [125]:
# Spatial index over the normalised sentence vectors; row i of `vecs`
# corresponds to corpus[i], so indices returned by the tree index `corpus`.
tree = KDTree(vecs)



In [126]:
def cos_dist_to_normed_euc(cos_dist_query):
    """Convert a cosine distance into a Euclidean distance between unit vectors.

    The cosine distance ``1 - cos(theta)`` is first turned back into the
    angle ``theta`` (in radians); two unit-length vectors separated by that
    angle are ``2 * sin(theta / 2)`` apart in Euclidean terms.

    Parameters
    ----------
    cos_dist_query : float or array-like
        Cosine distance(s). Values outside ``[0, 2]`` fall outside the domain
        of ``arccos`` and produce ``nan``.

    Returns
    -------
    float or numpy.ndarray
        The corresponding Euclidean distance(s).
    """
    angle = np.arccos(1 - cos_dist_query)
    half_angle = angle / 2
    return 2 * np.sin(half_angle)

In [145]:
def gen_samples(pos_vecs, index=None, min_dist=0.01, initial_k=50):
    """Yield indices of indexed points, nearest to any query vector first.

    The index is queried for the ``k`` nearest neighbours of every row of
    ``pos_vecs``; all hits are merged and visited in order of increasing
    distance. Each index is yielded at most once. When the merged hits are
    used up, ``k`` is doubled and the index is queried again, until every
    indexed point has been considered, at which point the generator ends.

    Previously ``k`` grew without bound, so the neighbour query raised as
    soon as ``k`` exceeded the number of indexed points (including on the
    very first query for an index with fewer than 50 points).

    Parameters
    ----------
    pos_vecs : array-like, shape (n_queries, dim)
        Query vectors.
    index : object, optional
        Neighbour index exposing ``query(X, k) -> (dists, indices)`` and a
        ``data`` attribute holding the indexed points. Defaults to the
        module-level ``tree``.
    min_dist : float, default 0.01
        A hit is skipped when its distance to the query row that retrieved
        it is below this value. The same point can still be yielded through
        another query row that is farther away from it.
    initial_k : int, default 50
        Number of neighbours requested per query row on the first pass.

    Yields
    ------
    int-like
        Index into the data the tree was built from.
    """
    if index is None:
        index = tree  # module-level KD-tree built over the corpus vectors
    n_points = np.asarray(index.data).shape[0]
    if n_points == 0:
        return
    # Never request more neighbours than the index holds.
    n_ret = min(initial_k, n_points)
    returned = set()
    while True:
        dists, queried = index.query(pos_vecs, n_ret)
        # Flatten the per-query results so hits from all queries compete
        # in a single distance ranking.
        queried = np.concatenate(queried)
        dists = np.concatenate(dists)
        for i in np.argsort(dists):
            this_idx = queried[i]
            if this_idx in returned or dists[i] < min_dist:
                continue
            returned.add(this_idx)
            yield this_idx
        if n_ret >= n_points:
            # Every indexed point has been ranked for every query row;
            # nothing new can appear in a further pass.
            return
        n_ret = min(n_ret * 2, n_points)


def _embed_texts(texts):
    """Return one normalised spaCy document vector per text in ``texts``."""
    return normalize([nlp(t, disable=['ner', 'parser', 'tagger']).vector for t in texts])


def get_close_sents(positives,negatives,dropout,n_trials,neg_dist,seed,n_per_page,page_num):
    """Return one page of corpus sentences close to the positive examples.

    The positives are embedded, ``n_trials`` random subsets of them are
    averaged into query vectors, and the corpus index is walked outwards
    from those queries. Sentences lying within ``neg_dist`` (cosine
    distance) of any negative example are excluded.

    Parameters
    ----------
    positives : list of str
        Example sentences to search around; must not be empty.
    negatives : list of str
        Example sentences whose neighbourhoods are excluded. May be empty.
    dropout : float
        Probability with which each positive is kept in a random subset;
        must be greater than 0. Empty subsets are redrawn.
    n_trials : int
        Number of random subsets (query vectors) to build; at least 1.
    neg_dist : float
        Cosine-distance radius around each negative that is excluded.
    seed : int or None
        Seed for the subset sampling. A private ``RandomState`` is used, so
        the draws match the previous ``np.random.seed(seed)`` behaviour
        without reseeding NumPy's global generator.
    n_per_page : int
        Number of sentences per page; at least 1.
    page_num : int
        Zero-based page to return.

    Returns
    -------
    list of str
        Sentences of the requested page, nearest first. The list is shorter
        than ``n_per_page``, or empty, if the candidates run out.

    Raises
    ------
    ValueError
        If ``positives`` is empty, ``dropout`` is not positive, or
        ``n_trials``, ``n_per_page`` or ``page_num`` are out of range.
    """
    if len(positives) == 0:
        raise ValueError('positives must contain at least one sentence')
    if not dropout > 0:
        # With dropout <= 0 no subset could ever be non-empty and the
        # redraw loop below would never finish.
        raise ValueError('dropout must be greater than 0')
    if n_trials < 1:
        raise ValueError('n_trials must be at least 1')
    if n_per_page < 1:
        raise ValueError('n_per_page must be at least 1')
    if page_num < 0:
        raise ValueError('page_num must be non-negative')

    rng = np.random.RandomState(seed)
    p_vectors = _embed_texts(positives)

    stop_list = set()
    if negatives:
        n_vectors = _embed_texts(negatives)
        neg_dist_euc = cos_dist_to_normed_euc(neg_dist)
        # query_radius returns one index array per negative; merge all of
        # them (previously only the first negative's neighbours were used).
        for hits in tree.query_radius(n_vectors, neg_dist_euc):
            stop_list.update(hits.tolist())

    sampled_means = []
    for _ in range(n_trials):
        # Redraw until at least one positive is kept.
        mask = rng.uniform(size=len(positives)) < dropout
        while not mask.any():
            mask = rng.uniform(size=len(positives)) < dropout
        sampled_means.append(np.mean(p_vectors[mask], 0))
    sampled_means = normalize(sampled_means)

    current_page = 0
    page = []
    for idx in gen_samples(sampled_means):
        if idx in stop_list:
            continue
        page.append(idx)
        if len(page) == n_per_page:
            if current_page == page_num:
                return list(corpus[page])
            page = []
            current_page += 1

    # Candidates ran out before the requested page was filled: return the
    # partial page only if it really is the requested one.
    if current_page != page_num or not page:
        return []
    return list(corpus[page])
            
    
# Demo query: first page of ten sentences near the positive example,
# excluding the neighbourhood of the negative example.
get_close_sents(
    positives=['john is a the founder of microsoft'],
    negatives=['i started the engine'],
    dropout=0.5,
    n_trials=3,
    neg_dist=0.2,
    seed=123,
    n_per_page=10,
    page_num=0,
)

['SOCIAL NETWORKING SITE IS REPORTED TO BE FOR SALE Plaxo was founded in 2001 by Todd Masonis along with a fellow Stanford engineering student , Cameron Ring , and Sean Parker , who was also a founder of Napster , the music downloading site .',
 "Posted on Monday March 17 , 2008 --------------------------------------- s e anderson is author of `` The Black Holocaust for Beginners '' Social Activism is not a hobby : it 's a Lifestyle lasting a Lifetime http://blackeducatorblogspotcom --------------------------------------- The authors of the study are Richard Lapchick , director of the institute , and Eric Little .",
 'WASHINGTON Mr Dobson , founder and chairman of the group Focus on the Family , is a psychologist .',
 "By the way , Rubin 's ploy is rich given that Richard Perle of the American Enterprise Institute brought a former LaRouchie in to brief the Pentagon on Saudi Arabia when he was chair of the Defense Advisory Board .",
 'Plaxo was founded in 2001 by Todd Masonis along with

In [83]:
# Scratch check: indexing with a random boolean mask keeps only the
# elements whose uniform draw fell below 0.5.
x = np.array([1, 2, 3, 4, 5])
x[np.random.uniform(size=x.size) < 0.5]

array([1, 2, 3, 4])

In [84]:
# Scratch check: np.sum accepts a plain scalar, and np.sum(0) evaluates to 0.
np.sum(0)

0

In [146]:

# Example dictionary holding a list of positive and a list of negative sentences.
d = {
    'positives': ['john is a the founder of microsoft'],
    'negatives': ['i started the engine'],
}

In [148]:
# Serialise the example dictionary `d` to a JSON string.
json.dumps(d)

'{"positives": ["john is a the founder of microsoft"], "negatives": ["i started the engine"]}'