In [178]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.util import ngrams 
from nltk.tokenize import sent_tokenize
from collections import Counter
import regex as re 
from numpy import dot
from numpy.linalg import norm
import numpy as np 
import json 
import random 
import spacy 
import glob 
import tqdm 

In [2]:
nlp = spacy.load('xx_sent_ud_sm')
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7fbbef5ec6c0>

In [2]:
label_map = {} 
with open('/scratch/useful/subject_set_labels.jsonl', 'r') as f:
    for line in f:
        line = json.loads(line)
        label_map.update(line)

In [3]:
prop_dict = {} 
with open('/scratch/useful/ontology_props.jsonl', 'r') as f:
    for line in f:
        line = json.loads(line)
        prop_dict.update({line['name']: line['properties']})

In [4]:
type_dict = {} 
with open('/scratch/useful/mapping_transitive.ttl', 'r') as f:
    for line in f:
        line = line.split()
        if(line[0] not in type_dict):
            type_dict[line[0]] = set() 
        type_dict[line[0]].add(line[2].split('/')[-1])

In [5]:
type_dict['<http://dbpedia.org/resource/!!!>']

{'Agent>',
 'DUL.owl#Agent>',
 'DUL.owl#SocialPerson>',
 'Group>',
 'MusicGroup>',
 'Organisation>',
 'Organization>',
 'Q215380>',
 'Q24229398>',
 'Q43229>',
 'owl#Thing>'}

In [6]:
def get_prop_name(item):
    if("#literal" in item):
        return item.split('#literal')[0], True
    if(item in label_map):
        tail_name = label_map[item]
    else:
        tail_name = re.sub('_', ' ',item.split('/')[-1])
    return tail_name, False

In [7]:
wanted_types = ["Place", "Person", "Organization", "Organisation"]

In [8]:
langs = ['ga', 'de', 'en']

In [12]:
candidates = {}

In [13]:
for lang in langs:
    print(lang)
    with(open(f'/scratch/useful/abstracts_{lang}.jsonl', 'r')) as f:
        pb = tqdm.tqdm(total=1000000)
        for i, line in enumerate(f):
            pb.update(1)
            item = json.loads(line)
            rsc = item['resource']
            txt = item['text']
            found = False 

            for t in wanted_types:
                if(f'{t}>' in type_dict[f'<{rsc}>']):
                    found = True 
            
            if(not(found)):
                continue  

            name = get_prop_name(rsc)[0]
            if(name not in candidates):
                candidates[name] = {}

                properties = [] 
                fw_props = prop_dict[rsc]['properties']
                for prop in fw_props:
                    for item in fw_props[prop]:
                        item_name = get_prop_name(item)[0]
                        properties.append((name, prop, item_name))
                
                rv_props = prop_dict[rsc]['reverse_properties']
                for prop in rv_props:
                    for item in rv_props[prop][:3]:
                        item_name = get_prop_name(item)[0]
                        properties.append((item_name, prop, name))
                
                candidates[name]['properties'] = properties

            if(lang == 'de'):
                sents = sent_tokenize(txt, language='german')
            else:
                sents = sent_tokenize(txt)
                
            filtered_sents = [] 
            for sent_ in sents:
                # sent_ = sent.text
                if(len(sent_.split()) > 5 and len(sent_.split()) < 200):
                    filtered_sents.append(sent_)
            candidates[name][f'{lang}_text'] = filtered_sents

ga


  0%|          | 373/1000000 [00:07<5:34:04, 49.87it/s]


de


  3%|▎         | 26258/1000000 [00:04<02:37, 6163.96it/s]
 64%|██████▍   | 642660/1000000 [01:30<00:55, 6388.83it/s] 

en


 64%|██████▍   | 642713/1000000 [01:30<00:50, 7091.71it/s]


In [14]:
save_name = 'filtered_candidates.json'
with open(save_name, 'w') as f:
    json.dump(candidates, f)

In [35]:
candidates = json.load(open('filtered_candidates.json', 'r'))

In [30]:
keys = list(candidates.keys())

In [36]:
candidates[keys[1]]['de_text']

['"’s-Hertogenbosch [ˌsɛrtoːɣə(n)ˈbɔs] (im allgemeinen Sprachgebrauch Den Bosch [dɛnˈbɔs]; deutsch Herzogenbusch, französisch Bois-le-Duc) ist die Hauptstadt der niederländischen Provinz Noord-Brabant.',
 'Die Gemeinde ’s-Hertogenbosch umfasst die Stadt ’s-Hertogenbosch sowie die Dörfer und Ortschaften Bokhoven, Empel, Engelen, Hintham, Kruisstraat, Meerwijk, Orthen, Rosmalen und Maliskamp.',
 'Am 1. Januar 2022 lebten laut CBS 156.521 Einwohner in der Gemeinde.',
 'Die Stadt ist Sitz des römisch-katholischen Bistums ’s-Hertogenbosch.',
 'Die Stadt ist ferner Sitz eines Gerichtes, der Provinzialverwaltung, verschiedener Krankenhäuser und psychiatrischer Anstalten sowie vieler überregional bedeutender Schulen.']

In [37]:
all_de = [] 
de_files = sorted(glob.glob('/home2/aditya_hari/gsoc/rdf-to-text/scraping/sents/*_translated_sents_de.txt'))
for file in de_files:
    with open(file, 'r') as f:
        for line in f:
            all_de.append(line.split('@@@')[1].strip())

In [38]:
all_ga = []
ga_files = sorted(glob.glob('/home2/aditya_hari/gsoc/rdf-to-text/scraping/sents/*_translated_sents_ga.txt'))
for file in ga_files:
    with open(file, 'r') as f:
        for line in f:
            all_ga.append(line.split('@@@')[1].strip())

In [39]:
de_ptr = 0
ga_ptr = 0

In [40]:
for entity in candidates:
    if('de_text' in candidates[entity]):
        candidates[entity]['de_translated'] = all_de[de_ptr:de_ptr+len(candidates[entity]['de_text'])]
        de_ptr += len(candidates[entity]['de_text'])
    if('ga_text' in candidates[entity]):
        candidates[entity]['ga_translated'] = all_ga[ga_ptr:ga_ptr+len(candidates[entity]['ga_text'])]
        ga_ptr += len(candidates[entity]['ga_text'])

In [41]:
candidates[keys[1]]['de_translated']

['"s-Hertogenbosch [ˌsɛrtoːɣə(n)ˈbɔs] (in common parlance Den Bosch [dɛnˈbɔs]; German Herzogenbusch, French Bois-le-Duc) is the capital of the Dutch province of Noord-Brabant.',
 'The municipality of s-Hertogenbosch includes the town of s-Hertogenbosch as well as the villages and towns of Bokhoven, Empel, Angels, Hintham, Kruisstraat, Meerwijk, Orten, Rosmalen and Maliskamp.',
 'As of January 1, 2022, according to CBS, the municipality had 156,521 inhabitants.',
 'The city is the seat of the Roman Catholic Diocese of s-Hertogenbosch.',
 'The city is also home to a court, the provincial administration, various hospitals and psychiatric institutions, and many important schools across the region.']

In [43]:
de_ptr, len(all_de), ga_ptr, len(all_ga)

(1542708, 1542708, 57677, 57677)

In [44]:
save_name = 'filtered_candidates.json'
with open(save_name, 'w') as f:
    json.dump(candidates, f, ensure_ascii=False)

In [None]:
candidates = json.load(open('/home2/aditya_hari/gsoc/rdf-to-text/scraping/notebooks/filtered_candidates.json', 'r'))

In [59]:
# regex pattern to split at camel case
pattern = re.compile(r'(?<!^)(?=[A-Z])')

In [201]:
# Function to compute average TF-IDF similarity between every pair of sentences in two lists of sentences
def get_similarity(sent_list1, sent_list2):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    sent_list1 = [' '.join([i for i in pattern.split(sent)]) for sent in sent_list1]
    sent_list2 = [' '.join([i for i in pattern.split(sent)]) for sent in sent_list2]
    X1 = vectorizer.fit_transform(sent_list1)
    X2 = vectorizer.transform(sent_list2)
    sim_mat = np.zeros((len(sent_list1), len(sent_list2)))
    for i in range(len(sent_list1)):
        for j in range(len(sent_list2)):
            if(norm(X1[i].toarray()[0])*norm(X2[j].toarray()[0]) == 0):
                sim_mat[i][j] = 0
            else:
                sim_mat[i][j] = dot(X1[i].toarray()[0], X2[j].toarray()[0])/(norm(X1[i].toarray()[0])*norm(X2[j].toarray()[0]))
    return sim_mat 

In [211]:
filtered_candidates = {} 

In [213]:
pb = tqdm.tqdm(total=len(keys))
for key in keys:
    pb.update(1)
    all_prop_strs = [] 
    props_filtered = [prop for prop in candidates[key]['properties'] if('' not in prop)]
    for prop in props_filtered:
        prop_str = ' '.join([' '.join(pattern.split(re.sub(r'[^\w]+', ' ', i))) for i in prop])
        all_prop_strs.append(prop_str)
    en_similarity_mat = get_similarity(all_prop_strs, candidates[key]['en_text'])
    en_above_thresh = np.where(en_similarity_mat > 0.25)
    if(len(en_above_thresh[1]) != 0):
        en_retained_props = [[] for _ in range(len(candidates[key]['en_text']))]
        for sent_idx, prop_idx in zip(en_above_thresh[1], en_above_thresh[0]):
            en_retained_props[sent_idx].append(props_filtered[prop_idx])
        if(key not in filtered_candidates):
            filtered_candidates[key] = {}
        filtered_candidates[key]['en_text'] = candidates[key]['en_text']
        filtered_candidates[key]['en_retained_props'] = en_retained_props

    if('de_translated' in candidates[key] and len(candidates[key]['de_translated']) != 0):
        de_similarity_mat = get_similarity(all_prop_strs, candidates[key]['de_translated'])
        de_above_thresh = np.where(de_similarity_mat > 0.25)
        if(len(de_above_thresh[1]) != 0):
            de_retained_props = [[] for _ in range(len(candidates[key]['de_translated']))]
            for sent_idx, prop_idx in zip(de_above_thresh[1], de_above_thresh[0]):
                de_retained_props[sent_idx].append(props_filtered[prop_idx])
            if(key not in filtered_candidates):
                filtered_candidates[key] = {}
            filtered_candidates[key]['de_translated'] = candidates[key]['de_translated']
            filtered_candidates[key]['de_retained_props'] = de_retained_props
    
    if('ga_translated' in candidates[key] and len(candidates[key]['ga_translated']) != 0):
        ga_similarity_mat = get_similarity(all_prop_strs, candidates[key]['ga_translated'])
        ga_above_thresh = np.where(ga_similarity_mat > 0.25)
        if(len(ga_above_thresh[1]) != 0):
            ga_retained_props = [[] for _ in range(len(candidates[key]['ga_translated']))]
            for sent_idx, prop_idx in zip(ga_above_thresh[1], ga_above_thresh[0]):
                ga_retained_props[sent_idx].append(props_filtered[prop_idx])
            if(key not in filtered_candidates):
                filtered_candidates[key] = {}
            filtered_candidates[key]['ga_translated'] = candidates[key]['ga_translated']
            filtered_candidates[key]['ga_retained_props'] = ga_retained_props

  0%|          | 329/512471 [00:33<35:48:01,  3.97it/s]

KeyboardInterrupt: 

In [1]:
import json 
import glob
import numpy as np 
import regex as re 

In [2]:
merged_dict = {} 
jsons = glob.glob('/home2/aditya_hari/gsoc/rdf-to-text/scraping/scripts/props/*')
for js in jsons:
    current_dict = json.load(open(js, 'r'))
    merged_dict.update(current_dict)

In [3]:
len(merged_dict)

512297

In [4]:
keys = list(merged_dict.keys())

In [5]:
merged_dict[keys[0]].keys()

dict_keys(['properties', 'de_text', 'en_text', 'de_translated', 'en_sim_mat', 'de_sim_mat'])

In [19]:
filtered_candidates = {}
for key, value in merged_dict.items(): 
    if('en_text' in value and 'en_sim_mat' in value):
        if(key not in filtered_candidates):
            filtered_candidates[key] = {}
        filtered_candidates[key]['en_text'] = value['en_text']
        en_sim_mat = np.array(value['en_sim_mat'])
        en_above_thresh = np.where(en_sim_mat > 0.25)
        if(len(en_above_thresh[1]) != 0):
            en_retained_props = [[] for _ in range(len(value['en_text']))]
            for sent_idx, prop_idx in zip(en_above_thresh[1], en_above_thresh[0]):
                if('' not in value['properties'] and value['properties'][prop_idx][0]!=value['properties'][prop_idx][2]):
                    en_retained_props[sent_idx].append(value['properties'][prop_idx])
            filtered_candidates[key]['en_retained_props'] = en_retained_props
    
    if('de_text' in value and 'de_sim_mat' in value):
        if(key not in filtered_candidates):
            filtered_candidates[key] = {}
        filtered_candidates[key]['de_text'] = value['de_text']
        de_sim_mat = np.array(value['de_sim_mat'])
        de_above_thresh = np.where(de_sim_mat > 0.25)
        if(len(de_above_thresh[1]) != 0):
            de_retained_props = [[] for _ in range(len(value['de_text']))]
            for sent_idx, prop_idx in zip(de_above_thresh[1], de_above_thresh[0]):
                if('' not in value['properties'] and value['properties'][prop_idx][0]!=value['properties'][prop_idx][2]):
                    de_retained_props[sent_idx].append(value['properties'][prop_idx])
            filtered_candidates[key]['de_retained_props'] = de_retained_props
    
    if('ga_text' in value and 'ga_sim_mat' in value):
        if(key not in filtered_candidates):
            filtered_candidates[key] = {}
        filtered_candidates[key]['ga_text'] = value['ga_text']
        ga_sim_mat = np.array(value['ga_sim_mat'])
        ga_above_thresh = np.where(ga_sim_mat > 0.25)
        if(len(ga_above_thresh[1]) != 0):
            ga_retained_props = [[] for _ in range(len(value['ga_text']))]
            for sent_idx, prop_idx in zip(ga_above_thresh[1], ga_above_thresh[0]):
                if('' not in value['properties'] and value['properties'][prop_idx][0]!=value['properties'][prop_idx][2]):
                    ga_retained_props[sent_idx].append(value['properties'][prop_idx])
            filtered_candidates[key]['ga_retained_props'] = ga_retained_props

In [20]:
keys = list(filtered_candidates.keys())

In [21]:
remove_spaces = lambda x: ' '.join(x.split())

In [22]:
sent_prop_pairs = [] 
sent_prop_src = [] 
for key in keys:
    if('en_retained_props' in filtered_candidates[key]):
        for sent_idx, props in enumerate(filtered_candidates[key]['en_retained_props']):
            for prop in props:
                sent_prop_pairs.append((filtered_candidates[key]['en_text'][sent_idx], ' | '.join(prop)))
                sent_prop_src.append((key, sent_idx, 'en'))
    
    if('de_retained_props' in filtered_candidates[key]):
        for sent_idx, props in enumerate(filtered_candidates[key]['de_retained_props']):
            for prop in props:
                sent_prop_pairs.append((filtered_candidates[key]['de_text'][sent_idx], ' | '.join(prop)))
                sent_prop_src.append((key, sent_idx, 'de'))
        
    if('ga_retained_props' in filtered_candidates[key]):
        for sent_idx, props in enumerate(filtered_candidates[key]['ga_retained_props']):
            for prop in props:
                sent_prop_pairs.append((filtered_candidates[key]['ga_text'][sent_idx], ' | '.join(prop)))
                sent_prop_src.append((key, sent_idx, 'ga'))

In [23]:
len(sent_prop_pairs)

5608734

In [24]:
sent_prop_src[5], sent_prop_pairs[5]

(('Maroua', 0, 'de'),
 ('"Maroua ist die Hauptstadt der kamerunischen Region Extrême-Nord und des Departements Diamaré.',
  'University of Maroua | city | Maroua'))

In [25]:
with(open('sent_prop_src.txt', 'w')) as f:
    for src in sent_prop_src:
        f.write('\t'.join([str(x) for x in src]) + '\n')

In [26]:
for i in range(4):
    with(open(f'sent_prop_pairs/sent_prop_pairs_{i}.txt', 'w')) as f:
        for pair in sent_prop_pairs[i*len(sent_prop_pairs)//4:(i+1)*len(sent_prop_pairs)//4]:
            f.write('\t'.join(pair) + '\n')

In [48]:
!zip -r sent_prop_pairs.zip sent_prop_pairs

  adding: sent_prop_pairs/ (stored 0%)
  adding: sent_prop_pairs/sent_prop_pairs_1.txt (deflated 82%)
  adding: sent_prop_pairs/sent_prop_pairs_4.txt (deflated 81%)
  adding: sent_prop_pairs/sent_prop_pairs_2.txt (deflated 81%)
  adding: sent_prop_pairs/sent_prop_pairs_3.txt (deflated 81%)
  adding: sent_prop_pairs/sent_prop_pairs_0.txt (deflated 82%)
