In [1]:
import requests 
import pickle 
import time 
import regex as re 
from SPARQLWrapper import SPARQLWrapper, JSON
from fuzzywuzzy import fuzz
import spacy

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch 

In [108]:
import signal
from contextlib import contextmanager

class TimedOut(Exception):
    """Raised inside a ``time_limit`` block when the allotted time has elapsed."""


@contextmanager
def time_limit(seconds):
    """Abort the enclosed block with ``TimedOut`` after ``seconds`` seconds.

    The limit is enforced with ``signal.alarm``/``SIGALRM``, so ``seconds``
    must be an integer. On exit the pending alarm is cancelled and the
    SIGALRM handler that was active before entering the block is put back,
    so the context manager leaves no handler of its own behind.

    Raises:
        TimedOut: from within the ``with`` body once the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimedOut("Timed out!")

    # signal.signal returns the handler it replaces; keep it for restoration.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # None means the old handler was not installed from Python and cannot
        # be re-installed through signal.signal, so it is left alone.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

In [3]:
# Load the scraped article dictionary. The context manager closes the file
# handle, which the bare open() call never did.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('all_content.pkl', 'rb') as content_file:
    all_content = pickle.load(content_file)

In [4]:
# Regex with two capture groups: the text of a <Label> element and the text of
# the <URI> element that immediately follows it.
pattern = r'<Label>([^<]*)</Label><URI>([^<]*)</URI>'

In [11]:
all_content['en']['A Man Asleep']

('written',
 'A Man Asleep',
 [['A Man Asleep (French: Un homme qui dort) is a 1967 novel by the French writer Georges Perec. It uses a second-person narrative, and follows a 25-year-old student, who one day decides to be indifferent about the world. A Man Asleep was adapted into a 1974 film, The Man Who Sleeps.'],
  ["The novel was published in France through Éditions Denoël in 1967. An English translation by Andrew Leak was published in 1990 through Collins Harvill in the United Kingdom and David R. Godine, Publisher in the United States, in a shared volume with Perec's first novel, Things: A Story of the Sixties."],
  ['Upon the American release, Richard Eder of the Los Angeles Times compared the two novels of the volume—Things and A Man Asleep—and wrote that Things was "the more engaging of the two, though less focused and ultimately, perhaps, less memorable." He wrote that in A Man Asleep, "Perec shows a beauty on the far side of the void; a humanity on the far side of refusal."']

In [13]:
# Resolve every article title through the DBpedia Lookup service and, when the
# best hit's label is a close fuzzy match, prepend its URI to the stored tuple.
# NOTE(review): running this cell twice prepends a second URI to entries that
# were already resolved — confirm it is only executed once per load.
for lang in all_content:
    print(lang)
    for title in all_content[lang]:
        entry = all_content[lang][title]
        # Passing the title via `params` lets requests URL-encode it, so
        # titles containing '&', '#', '+' or '?' are searched verbatim.
        r = requests.get('https://lookup.dbpedia.org/api/search', params={'query': title})
        if r.status_code != 200:
            continue
        matches = re.findall(pattern, r.text)
        if not matches:
            # No <Label>/<URI> pair in the response: leave the entry untouched
            # instead of failing with IndexError on matches[0].
            continue
        label, uri = matches[0]
        if fuzz.ratio(label, title) > 80:
            all_content[lang][title] = (uri, entry[0], entry[1], entry[2])

en
de
pt


In [14]:
# Persist the URI-annotated articles; `with` closes the file even if
# pickle.dump raises part-way through.
with open('all_content_uris.pkl', 'wb') as all_content_dump:
    pickle.dump(all_content, all_content_dump)

In [18]:
# One-off lookup request for a single hard-coded title; the response is kept
# in `r` for manual inspection.
r = requests.get(f'https://lookup.dbpedia.org/api/search?query=A Man Asleep')

In [116]:
# Reload the URI-annotated articles, closing the file handle afterwards.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('all_content_uris.pkl', 'rb') as uris_file:
    all_content = pickle.load(uris_file)

In [117]:
# Keep only the entries that carry four fields (URI, category, title, content)
# and turn each of them into a keyed dictionary. Entries whose first two
# fields are equal are dropped.
all_content_uris = {'en': {}, 'pt': {}, 'de': {}}
for lang, articles in all_content.items():
    for title, record in articles.items():
        if len(record) != 4:
            continue
        uri, category, article_title, content = record
        if uri == category:
            continue
        all_content_uris[lang][title] = {
            'uri': uri,
            'category': category,
            'title': article_title,
            'content': content,
        }

In [120]:
# Persist the keyed article dictionaries; `with` closes the file even if
# pickle.dump raises.
with open('all_content_uris_sub.pkl', 'wb') as final_dump:
    pickle.dump(all_content_uris, final_dump)

In [67]:
# SPARQL template for triples in which the entity is the triple's subject:
# it selects every `<ENTITY_URI> ?property ?subject` pair together with the
# optional English rdfs:label of the property and of the value. Note that the
# variable named ?subject is bound to the triple's *object* in this query.
# The FILTER drops the listed bookkeeping/layout properties and keeps only
# properties whose URI starts with http://dbpedia.org/.
# The literal ENTITY_URI placeholder must be replaced by a resource URI
# before the query is sent.
object_query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>

SELECT ?property ?propertyLabel ?subject ?subjectLabel
WHERE {
  <ENTITY_URI> ?property ?subject .
  
  OPTIONAL {
    ?property rdfs:label ?propertyLabel .
    FILTER (langMatches(lang(?propertyLabel), "en"))
  }
  
  OPTIONAL {
    ?subject rdfs:label ?subjectLabel .
    FILTER (langMatches(lang(?subjectLabel), "en"))
  }
  
  FILTER (
    ?property NOT IN (
      <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>,
      <http://purl.org/dc/terms/subject>, 
      <http://dbpedia.org/ontology/wikiPageWikiLink>, 
      <http://dbpedia.org/property/wikiPageUsesTemplate>,
      <http://dbpedia.org/ontology/wikiPageRedirects>,
      <http://dbpedia.org/property/align>,
      <http://dbpedia.org/property/caption>,
      <http://dbpedia.org/property/format>,
      <http://dbpedia.org/property/float>,
      <http://dbpedia.org/property/footer>,
      <http://dbpedia.org/property/image>,
      <http://dbpedia.org/property/width>,
      <http://dbpedia.org/property/totalWidth>,
      <http://dbpedia.org/property/imageCaption>,
      <http://dbpedia.org/property/filename>,
      <http://dbpedia.org/property/singleLine>,
      <http://dbpedia.org/ontology/wikiPageDisambiguates>
    )   &&
    REGEX(STR(?property), "^http://dbpedia.org/")
  )
}
"""


# SPARQL template for triples in which the entity is the triple's object:
# it selects every `?subject ?property <ENTITY_URI>` pair together with the
# optional English rdfs:label of the property and of the subject. The FILTER
# is the same exclusion list and http://dbpedia.org/ prefix check as in the
# template for the opposite direction.
# The literal ENTITY_URI placeholder must be replaced by a resource URI
# before the query is sent.
subject_query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>

SELECT ?property ?propertyLabel ?subject ?subjectLabel
WHERE {
  ?subject ?property <ENTITY_URI> .
  
  OPTIONAL {
    ?property rdfs:label ?propertyLabel .
    FILTER (langMatches(lang(?propertyLabel), "en"))
  }
  
  OPTIONAL {
    ?subject rdfs:label ?subjectLabel .
    FILTER (langMatches(lang(?subjectLabel), "en"))
  }
  
  FILTER (
    ?property NOT IN (
      <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>,
      <http://purl.org/dc/terms/subject>, 
      <http://dbpedia.org/ontology/wikiPageWikiLink>, 
      <http://dbpedia.org/property/wikiPageUsesTemplate>,
      <http://dbpedia.org/ontology/wikiPageRedirects>,
      <http://dbpedia.org/property/align>,
      <http://dbpedia.org/property/caption>,
      <http://dbpedia.org/property/format>,
      <http://dbpedia.org/property/float>,
      <http://dbpedia.org/property/footer>,
      <http://dbpedia.org/property/image>,
      <http://dbpedia.org/property/width>,
      <http://dbpedia.org/property/totalWidth>,
      <http://dbpedia.org/property/imageCaption>,
      <http://dbpedia.org/property/filename>,
      <http://dbpedia.org/property/singleLine>,
      <http://dbpedia.org/ontology/wikiPageDisambiguates>
    )   &&
    REGEX(STR(?property), "^http://dbpedia.org/")
  )
}
"""

# SPARQL client pointed at the https://dbpedia.org/sparql endpoint.
sparql = SPARQLWrapper("https://dbpedia.org/sparql")

In [68]:
# Build a sample query for one hard-coded resource by filling the ENTITY_URI
# placeholder of the object-direction template.
entity_uri = "http://dbpedia.org/resource/Darkness_Visible_(memoir)"
query = object_query.replace("ENTITY_URI", entity_uri)

In [69]:
# Hand the prepared query text to the client and ask for JSON results.
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [71]:
# Send the query currently set on `sparql` and convert the response.
results = sparql.query().convert()

In [77]:
results['results']['bindings']

[{'property': {'type': 'uri',
   'value': 'http://dbpedia.org/ontology/literaryGenre'},
  'propertyLabel': {'type': 'literal',
   'xml:lang': 'en',
   'value': 'literary genre'},
  'subject': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Memoir'},
  'subjectLabel': {'type': 'literal', 'xml:lang': 'en', 'value': 'Memoir'}},
 {'property': {'type': 'uri', 'value': 'http://dbpedia.org/property/genre'},
  'propertyLabel': {'type': 'literal', 'xml:lang': 'en', 'value': 'genre'},
  'subject': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Memoir'},
  'subjectLabel': {'type': 'literal', 'xml:lang': 'en', 'value': 'Memoir'}},
 {'property': {'type': 'uri',
   'value': 'http://dbpedia.org/ontology/literaryGenre'},
  'propertyLabel': {'type': 'literal',
   'xml:lang': 'en',
   'value': 'literary genre'},
  'subject': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Memoir'},
  'subjectLabel': {'type': 'literal', 'xml:lang': 'en', 'value': 'Memoir'}},
 {'property': {'type': 'uri

In [93]:
all_content_uris['en'].keys()

dict_keys(['Darkness Visible (memoir)', 'Verbal Behavior', 'The Devil to Pay in the Backlands', 'Parmenides (dialogue)', 'Crito', "Plato's unwritten doctrines", 'Airbus A350', 'Airbus A320 family', 'Transall C-160', 'Boeing KC-135 Stratotanker', 'Bradley Fighting Vehicle', 'Rockwell B-1 Lancer', 'Messerschmitt Bf 108 Taifun', 'Boeing B-52 Stratofortress', 'Landing Vehicle Tracked', 'Boeing 737', 'Soyuz (spacecraft)', 'Flakpanzer Gepard', 'Dassault Rafale', 'Canberra', 'Uplengen', 'Beetzsee (municipality)', 'U2 (Berlin U-Bahn)', 'Kleinmachnow', 'Bhaktapur', 'Südbrookmerland', 'São Paulo', 'Moormerland', 'Memmingen', 'Wetzlar', 'Bombing of Dresden in World War II', 'Norden, Lower Saxony', 'Toronto', 'Schwieberdingen', 'Eberswalde', 'Freiburg im Breisgau', 'Fortifications of Frankfurt', 'Leipzig', 'Hattusa', 'Limes Germanicus', 'St. Nicholas Church, Potsdam', 'Poverty Point', 'Altes Stadthaus, Berlin', 'Federal Palace of Switzerland', 'The Whale House', 'Buried Pyramid', 'Sophienkirche', 

In [103]:
# Fill the ENTITY_URI placeholder of the object-direction template with the
# URI stored for one specific English article.
query = re.sub("ENTITY_URI", all_content_uris['en']['Everything Tastes Better with Bacon']['uri'], object_query)

In [105]:
# Run the prepared query and keep a (property label, value label) pair for
# every binding that has both labels.
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
result_list_object = [
    (binding['propertyLabel']['value'], binding['subjectLabel']['value'])
    for binding in results['results']['bindings']
    if 'propertyLabel' in binding and 'subjectLabel' in binding
]

In [111]:
# Per-language containers for entities whose lookup did not complete, one for
# each query direction.
not_found_uris_subject = {lang: {} for lang in ('en', 'pt', 'de')}
not_found_uris_object = {lang: {} for lang in ('en', 'pt', 'de')}

In [114]:
# For every article, fetch the labelled facts in which the entity is the
# triple's subject and store them as (property label, value label) pairs under
# 'object_properties'. Entities whose query times out or fails are recorded in
# not_found_uris_object so they can be retried later.
sparql.setReturnFormat(JSON)  # loop-invariant, so set once
for lang in all_content_uris:
    print(lang)
    for i, entity in enumerate(all_content_uris[lang]):
        print(i, entity)
        time.sleep(3)  # throttle requests to the endpoint
        entry = all_content_uris[lang][entity]
        try:
            with time_limit(3):
                # str.replace inserts the URI literally; re.sub would treat
                # backslashes in the URI as replacement-template escapes.
                query = object_query.replace("ENTITY_URI", entry['uri'])
                sparql.setQuery(query)
                results = sparql.query().convert()
                result_list_object = [
                    (result['propertyLabel']['value'], result['subjectLabel']['value'])
                    for result in results['results']['bindings']
                    if 'propertyLabel' in result and 'subjectLabel' in result
                ]
                entry['object_properties'] = result_list_object
        except TimedOut:
            print("Timed out object!")
            not_found_uris_object[lang][entity] = entry
        except Exception as err:
            # An endpoint or transport error for one entity should not abort
            # the whole crawl; report it and queue the entity for a retry.
            # KeyboardInterrupt is not an Exception and still stops the loop.
            print(f"Query failed for {entity!r}: {err}")
            not_found_uris_object[lang][entity] = entry

        time.sleep(2)

en
0 Darkness Visible (memoir)
Timed out subject!
1 Verbal Behavior
2 The Devil to Pay in the Backlands
Timed out object!
Timed out subject!
3 Parmenides (dialogue)
4 Crito
5 Plato's unwritten doctrines
6 Airbus A350
Timed out object!
Timed out subject!
7 Airbus A320 family
Timed out object!
8 Transall C-160
9 Boeing KC-135 Stratotanker


KeyboardInterrupt: 

In [None]:
# Persist the articles together with the fetched relations; `with` closes the
# file even if pickle.dump raises.
with open('all_content_uris_relations.pkl', 'wb') as dump:
    pickle.dump(all_content_uris, dump)

In [None]:
# Persist both retry lists; each `with` closes its file even if pickle.dump
# raises.
with open('not_found_uris_subject.pkl', 'wb') as dump_not_found_subject:
    pickle.dump(not_found_uris_subject, dump_not_found_subject)

with open('not_found_uris_object.pkl', 'wb') as dump_not_found_object:
    pickle.dump(not_found_uris_object, dump_not_found_object)

In [10]:
# Load the two per-direction result files, closing each handle afterwards.
# NOTE(review): no visible cell writes 'all_content_uris_obj.pkl', and the
# visible write of 'all_content_uris_sub.pkl' happens before any relations are
# fetched — confirm how these two files are produced.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('all_content_uris_obj.pkl', 'rb') as obj_file:
    all_content_uris_obj = pickle.load(obj_file)
with open('all_content_uris_sub.pkl', 'rb') as sub_file:
    all_content_uris_sub = pickle.load(sub_file)

In [19]:
all_content_uris_obj['en'].keys()

dict_keys(['Darkness Visible (memoir)', 'Verbal Behavior', 'The Devil to Pay in the Backlands', 'Parmenides (dialogue)', 'Crito', "Plato's unwritten doctrines", 'Airbus A350', 'Airbus A320 family', 'Transall C-160', 'Boeing KC-135 Stratotanker', 'Bradley Fighting Vehicle', 'Rockwell B-1 Lancer', 'Messerschmitt Bf 108 Taifun', 'Boeing B-52 Stratofortress', 'Landing Vehicle Tracked', 'Boeing 737', 'Soyuz (spacecraft)', 'Flakpanzer Gepard', 'Dassault Rafale', 'Canberra', 'Uplengen', 'Beetzsee (municipality)', 'U2 (Berlin U-Bahn)', 'Kleinmachnow', 'Bhaktapur', 'Südbrookmerland', 'São Paulo', 'Moormerland', 'Memmingen', 'Wetzlar', 'Bombing of Dresden in World War II', 'Norden, Lower Saxony', 'Toronto', 'Schwieberdingen', 'Eberswalde', 'Freiburg im Breisgau', 'Fortifications of Frankfurt', 'Leipzig', 'Hattusa', 'Limes Germanicus', 'St. Nicholas Church, Potsdam', 'Poverty Point', 'Altes Stadthaus, Berlin', 'Federal Palace of Switzerland', 'The Whale House', 'Buried Pyramid', 'Sophienkirche', 

In [22]:
len(all_content_uris_obj['en']['Apollo 11']['object_properties'])

38

In [24]:
len(set(all_content_uris_sub['en']['Apollo 11']['object_properties']))

15

In [26]:
# De-duplicate the facts of each article and merge both directions into
# all_content_uris_obj: its own 'object_properties' stay under that key, and
# the 'object_properties' list of the matching all_content_uris_sub entry is
# stored as 'subject_properties'. A missing list becomes [].
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# result is the same on every run; list(set(...)) produced an ordering that
# depends on string hash randomisation.
for lang, articles in all_content_uris_obj.items():
    for title, article in articles.items():
        article['object_properties'] = list(dict.fromkeys(article.get('object_properties', [])))
        sub_article = all_content_uris_sub[lang][title]
        article['subject_properties'] = list(dict.fromkeys(sub_article.get('object_properties', [])))

In [28]:
all_content_uris_obj['en']['Apollo 11'].keys()

dict_keys(['uri', 'category', 'title', 'content', 'object_properties', 'subject_properties'])

In [37]:
# Split every paragraph of every article into sentences with the spaCy model
# of the article's language and store the per-paragraph sentence lists under
# 'sentences'.
SPACY_MODELS = {'en': 'en_core_web_sm', 'pt': 'pt_core_news_sm', 'de': 'de_core_news_sm'}
for lang in all_content_uris_obj:
    print(lang)
    if lang in SPACY_MODELS:
        nlp = spacy.load(SPACY_MODELS[lang])
    for title, article in all_content_uris_obj[lang].items():
        # Each paragraph is a list of strings; join it before parsing.
        article['sentences'] = [
            list(nlp(" ".join(para)).sents)
            for para in article['content']
        ]

en
pt
de


In [49]:
all_content_uris_obj['pt']['Alberto Henschel']['sentences']

[[Alberto Henschel (Berlim, 13 de Junho de 1827 — Rio de Janeiro(A), 30 de Junho de 1882) foi um fotógrafo teuto-brasileiro, considerado o mais diligente empresário da fotografia no Brasil do século XIX, com escritórios em Pernambuco, Bahia, Rio de Janeiro e São Paulo, Henschel foi também responsável pela vinda de outros fotógrafos profissionais ao país, como o seu compatriota Karl Ernest Papf — com quem trabalharia mais tarde — e seu filho, Jorge Henrique Papf, que sucederia ao pai no ramo da fotografia.,
  Henschel ficou conhecido por produzir belas imagens do Rio de Janeiro como fotógrafo paisagista e por ser um excelente retratista, o que lhe rendeu o título de Photographo da Casa Imperial, habilitando-o a retratar o cotidiano da monarquia brasileira durante o Segundo Reinado, inclusive fotografando o imperador Dom Pedro II e sua família.,
  Esse título valorizaria muito suas fotos, inclusive no preço.,
  Mas, certamente, sua principal contribuição à história da fotografia no Brasi

In [70]:
# Keep, per paragraph, the text of every sentence that contains at least one
# token tagged with one of the listed part-of-speech tags; the result is
# stored under 'filtered_sentences'.
CONTENT_POS = {'NOUN', 'PROPN', 'PRON', 'ADJ', 'VERB', 'ADV'}
SPACY_MODELS = {'en': 'en_core_web_sm', 'pt': 'pt_core_news_sm', 'de': 'de_core_news_sm'}
for lang in all_content_uris_obj:
    print(lang)
    if lang in SPACY_MODELS:
        nlp = spacy.load(SPACY_MODELS[lang])
    for title, article in all_content_uris_obj[lang].items():
        article['filtered_sentences'] = [
            [
                sent.text
                for sent in para
                # Each sentence is re-parsed on its own before its tags are read.
                if not CONTENT_POS.isdisjoint(token.pos_ for token in nlp(sent.text))
            ]
            for para in article['sentences']
        ]

en
pt
de


In [164]:
# Keep, per paragraph, only the sentences that split on single spaces into
# more than 5 and fewer than 100 pieces; the result is stored under
# 'actual_filtered_sentences'.
for lang in all_content_uris_obj:
    print(lang)
    for title, article in all_content_uris_obj[lang].items():
        kept_paras = []
        for para in article['filtered_sentences']:
            kept_paras.append([sent for sent in para if 5 < len(sent.split(" ")) < 100])
        article['actual_filtered_sentences'] = kept_paras

en
pt
de


In [165]:
# Assemble the final per-article records. An article is kept when its
# 'filtered_sentences' list is non-empty and it has at least one object or
# subject fact. The length-filtered sentences are stored under the key
# 'len_filtered_sentences'.
# NOTE(review): the first check counts paragraphs, not sentences, so an
# article whose paragraphs are all empty still passes — confirm this is intended.
final_content = {'en': {}, 'pt': {}, 'de': {}}
for lang, articles in all_content_uris_obj.items():
    for title, article in articles.items():
        if not len(article['filtered_sentences']) > 0:
            continue
        if not len(article['object_properties']) + len(article['subject_properties']) > 0:
            continue
        final_content[lang][title] = {
            'uri': article['uri'],
            'content': article['content'],
            'filtered_sentences': article['filtered_sentences'],
            'len_filtered_sentences': article['actual_filtered_sentences'],
            'object_properties': article['object_properties'],
            'subject_properties': article['subject_properties'],
        }

In [166]:
# Persist the sentence/fact records; `with` closes the file even if
# pickle.dump raises.
with open('sents_facts.pkl', 'wb') as final_dump:
    pickle.dump(final_content, final_dump)

In [5]:
# Reload the sentence/fact records, closing the file handle afterwards.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('sents_facts.pkl', 'rb') as sents_facts_file:
    final_content = pickle.load(sents_facts_file)

In [48]:
# One tokenizer per source language, all loaded from the same NLLB checkpoint
# and differing only in the `src_lang` code they are configured with.
NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"
de_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, src_lang="deu_Latn")
pt_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, src_lang="por_Latn")
en_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, src_lang="eng_Latn")

In [4]:
# Load the facebook/nllb-200-distilled-600M seq2seq checkpoint and move it to
# the CUDA device.
nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to('cuda')
# Switch the module to evaluation mode; as the last expression of the cell this
# also displays the model's repr.
nllb_model.eval()

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(256206, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(256206, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0): M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,)

In [8]:
# Flatten the length-filtered sentences of every non-English article into one
# list per language ('sents') and record, in parallel ('src'), the
# (title, paragraph index) each sentence came from so the translations can be
# regrouped afterwards. The unused `src_para` assignment has been dropped.
trans_candidates = {'de': {'sents':[], 'src':[]}, 'pt': {'sents':[], 'src':[]}}
for lang in final_content:
    if lang == 'en':
        continue
    for title in final_content[lang]:
        for para_idx, para_sents in enumerate(final_content[lang][title]['len_filtered_sentences']):
            trans_candidates[lang]['sents'].extend(para_sents)
            trans_candidates[lang]['src'].extend([(title, para_idx)] * len(para_sents))

In [9]:
len(trans_candidates['pt']['sents'])//128

209

In [10]:
# Per-language container for the translated sentences.
translated_sents = {lang: [] for lang in ('de', 'pt')}

In [11]:
# Translate the collected German and Portuguese sentences into English in
# batches of 32, using the tokenizer configured for the source language.
for lang in trans_candidates:
    if lang == 'de':
        tokenizer = de_tokenizer
    elif lang == 'pt':
        tokenizer = pt_tokenizer
    source_sents = trans_candidates[lang]['sents']
    outs = []
    for start in range(0, len(source_sents), 32):
        batch = source_sents[start:start + 32]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to('cuda')
        # forced_bos_token_id selects English as the generation language.
        translated_tokens = nllb_model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id['eng_Latn'],
            max_length=400,
        )
        outs.extend(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True))
    translated_sents[lang] = outs

In [12]:
# Persist the translated sentences; `with` closes the file even if
# pickle.dump raises.
with open('translated_sents.pkl', 'wb') as trans_dump:
    pickle.dump(translated_sents, trans_dump)

In [18]:
# Regroup the flat list of translations into title -> list of paragraphs ->
# list of sentences, using the (title, paragraph index) recorded for each
# source sentence.
trans_mapping = {'de': {}, 'pt': {}}
for lang in trans_candidates:
    for sent, (title, para_idx) in zip(translated_sents[lang], trans_candidates[lang]['src']):
        paragraphs = trans_mapping[lang].setdefault(title, [[]])
        # Paragraphs without any candidate sentence never appear in 'src', so
        # the index can jump by more than one. Pad with as many empty
        # paragraphs as needed; appending a single one raised IndexError on
        # such gaps.
        while para_idx >= len(paragraphs):
            paragraphs.append([])
        paragraphs[para_idx].append(sent)

In [24]:
len(trans_mapping['de']['Landing Vehicle Tracked'][-1])

1

In [29]:
# Attach the regrouped translations to every article. English articles get an
# empty list. So does a non-English article that contributed no sentence to
# translation and is therefore absent from trans_mapping; indexing it
# directly raised KeyError.
for lang in final_content:
    for title in final_content[lang]:
        if lang == 'en':
            final_content[lang][title]['translated_sents'] = []
        else:
            final_content[lang][title]['translated_sents'] = trans_mapping[lang].get(title, [])

In [2]:
# Reload the article records, closing the file handle afterwards.
# NOTE(review): no visible cell writes 'final_dump.pkl' — confirm which step
# produces it.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('final_dump.pkl', 'rb') as final_dump_file:
    final_content = pickle.load(final_dump_file)

In [49]:
# Collect, per non-English language, the facts to translate as 3-tuples plus a
# parallel 'src' list naming the article each fact belongs to. Object facts
# are laid out as (title, property, value), subject facts as
# (value, property, title).
obj_facts_candidates = {'pt': {'facts': [], 'src': []}, 'de': {'facts': [], 'src': []}}
sub_facts_candidates = {'pt': {'facts': [], 'src': []}, 'de': {'facts': [], 'src': []}}
for lang in final_content:
    if lang == 'en':
        continue
    for title, article in final_content[lang].items():
        object_facts = article['object_properties']
        obj_facts_candidates[lang]['facts'].extend((title, prop, value) for prop, value in object_facts)
        obj_facts_candidates[lang]['src'].extend([title] * len(object_facts))

        subject_facts = article['subject_properties']
        sub_facts_candidates[lang]['facts'].extend((value, prop, title) for prop, value in subject_facts)
        sub_facts_candidates[lang]['src'].extend([title] * len(subject_facts))

In [50]:
# Containers for the translated facts, keyed by direction and then language.
translated_facts = {split: {'de': [], 'pt': []} for split in ('sub', 'obj')}

In [57]:
# Translate the elements of every object fact from English into the article's
# language. Each fact triple is flattened into three separate strings, which
# are translated in batches of 32.
for lang in obj_facts_candidates:
    outs = []
    if(lang == 'de'):
        out_lang = 'deu_Latn'
    elif(lang == 'pt'):
        out_lang = 'por_Latn'
    # Take the target-language id from the tokenizer that encodes the input.
    # The original read it from the global `tokenizer`, which exists only if
    # the sentence-translation cell ran earlier in the same kernel.
    target_lang_id = en_tokenizer.lang_code_to_id[out_lang]
    fact_sents = [f for fact in obj_facts_candidates[lang]['facts'] for f in fact]
    sents_batched = [fact_sents[i:i+32] for i in range(0, len(fact_sents), 32)]
    for batch in sents_batched:
        inputs = en_tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to('cuda')
        translated_tokens = nllb_model.generate(**inputs, forced_bos_token_id=target_lang_id, max_length=400)
        out = en_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        outs.extend(out)
    translated_facts['obj'][lang] = outs

In [59]:
# Translate the elements of every subject fact from English into the article's
# language. Each fact triple is flattened into three separate strings, which
# are translated in batches of 32.
for lang in sub_facts_candidates:
    outs = []
    if(lang == 'de'):
        out_lang = 'deu_Latn'
    elif(lang == 'pt'):
        out_lang = 'por_Latn'
    # Take the target-language id from the tokenizer that encodes the input.
    # The original read it from the global `tokenizer`, which exists only if
    # the sentence-translation cell ran earlier in the same kernel.
    target_lang_id = en_tokenizer.lang_code_to_id[out_lang]
    fact_sents = [f for fact in sub_facts_candidates[lang]['facts'] for f in fact]
    sents_batched = [fact_sents[i:i+32] for i in range(0, len(fact_sents), 32)]
    for batch in sents_batched:
        inputs = en_tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to('cuda')
        translated_tokens = nllb_model.generate(**inputs, forced_bos_token_id=target_lang_id, max_length=400)
        out = en_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        outs.extend(out)
    translated_facts['sub'][lang] = outs

In [63]:
# Regroup the flat translated strings into consecutive chunks of three, one
# chunk per fact triple.
for split in translated_facts:
    for lang in translated_facts[split]:
        flat = translated_facts[split][lang]
        # Re-running the cell on already grouped data would nest the triples
        # once more; only regroup while the list still holds plain strings.
        if flat and not isinstance(flat[0], str):
            continue
        translated_facts[split][lang] = [flat[i:i+3] for i in range(0, len(flat), 3)]

In [28]:
# Persist the translated facts; `with` closes the file even if pickle.dump
# raises.
with open('translated_facts.pkl', 'wb') as translated_facts_dump:
    pickle.dump(translated_facts, translated_facts_dump)

In [71]:
# Group the translated facts by source article title, separately for each
# direction; both the facts and the titles are consumed in parallel order.
trans_obj_mapping = {'de': {}, 'pt': {}}
for lang in obj_facts_candidates:
    paired = zip(translated_facts['obj'][lang], obj_facts_candidates[lang]['src'])
    for fact, src_title in paired:
        trans_obj_mapping[lang].setdefault(src_title, []).append(fact)

trans_sub_mapping = {'de': {}, 'pt': {}}
for lang in sub_facts_candidates:
    paired = zip(translated_facts['sub'][lang], sub_facts_candidates[lang]['src'])
    for fact, src_title in paired:
        trans_sub_mapping[lang].setdefault(src_title, []).append(fact)

In [72]:
len(list(final_content['de'].keys()))

75

In [73]:
len(list(trans_obj_mapping['de'].keys()))

72

In [74]:
# Attach the translated object facts to every article. English articles and
# articles without an entry in trans_obj_mapping get an empty list.
for lang in final_content:
    for title, article in final_content[lang].items():
        if lang == 'en':
            article['translated_object_properties'] = []
        else:
            article['translated_object_properties'] = trans_obj_mapping[lang].get(title, [])

In [75]:
# Attach the translated subject facts to every article. English articles and
# articles without an entry in trans_sub_mapping get an empty list.
for lang in final_content:
    for title, article in final_content[lang].items():
        if lang == 'en':
            article['translated_subject_properties'] = []
        else:
            article['translated_subject_properties'] = trans_sub_mapping[lang].get(title, [])

In [92]:
# Persist the complete data set; `with` closes the file even if pickle.dump
# raises.
with open('all_data.pkl', 'wb') as final_dump:
    pickle.dump(final_content, final_dump)