# Extract KG triples from Hotpot QA dataset

This notebook demonstrates how to extract KG triples from `hotpot_qa`. To run as a script see `hotpot_qa_kgs.py`.

In [1]:
import getpass
import os
import sys

import nest_asyncio

# Jupyter already runs an asyncio event loop; nest_asyncio patches asyncio so
# that code which starts its own loop can run inside it. One call is enough.
nest_asyncio.apply()

# Do not paste a real key into the notebook. Prefer an OPENAI_API_KEY that is
# already exported in the environment; otherwise prompt for it without echoing.
open_ai_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')
os.environ['OPENAI_API_KEY'] = open_ai_key

# Root of the kg_uq checkout. Set KG_UQ_ROOT to point at your own copy; the
# default is the original author's location.
project_root = os.environ.get('KG_UQ_ROOT', '/Users/walder2/kg_uq/')

# Put the project root first on sys.path so the project-local packages imported
# below resolve. The membership guard keeps re-running this cell from adding
# duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Directory handed to write_hotpot_to_txt and extract_kg in the cells below.
path_to_data = os.path.join(project_root, 'hotpot_qa_data')

# Project-local imports: these must come after the sys.path change above.
from hotpot_qa_data.hotpot_data_load import write_hotpot_to_txt
from kg_extraction.kg_extraction import extract_kg

  from .autonotebook import tqdm as notebook_tqdm


## Write out Hotpot QA answers to txt files.

This makes reading the files back in as `Documents` with Llama-index easier. `ndocs` controls how many observations of the training set we consider. All context documents for each training observation are included.

In [2]:
# Write the context documents for the first `ndocs` training observations to
# txt files. The result is a mapping keyed by question id; the extraction cell
# below reads each entry's 'topics' and 'file_paths'.
hotpot_files = write_hotpot_to_txt(
    path_to_data=path_to_data,
    ndocs=20,
)

### Specify the entity types and relations.

Below you can specify information for entity types and relation types. Have a look at https://schema.org/ for details on the entities I defined below. 


In [3]:
# Entity label -> schema.org URL describing that kind of entity.
# Passed to extract_kg as `entity_types`.
entity_types = dict(
    [
        ("person", "https://schema.org/Person"),
        ("place", "https://schema.org/Place"),
        ("thing", "https://schema.org/Thing"),
        ("creativeWork", "https://schema.org/CreativeWork"),
        ("event", "https://schema.org/Event"),
        ("product", "https://schema.org/Product"),
    ]
)

# Relation label -> schema.org URL. Passed to extract_kg as `relation_types`.
# NOTE(review): the last six entries repeat the entity_types mapping verbatim
# and look like entity classes rather than relations -- confirm whether
# extract_kg actually needs them listed here.
relation_types = dict(
    [
        ("hasCharacteristic", "https://schema.org/additionalProperty"),
        ("hasColor", "https://schema.org/color"),
        ("hasMeasurement", "https://schema.org/hasMeasurement"),
        ("person", "https://schema.org/Person"),
        ("place", "https://schema.org/Place"),
        ("thing", "https://schema.org/Thing"),
        ("creativeWork", "https://schema.org/CreativeWork"),
        ("event", "https://schema.org/Event"),
        ("product", "https://schema.org/Product"),
    ]
)

# Extract KG triples

For each question, all context documents are scraped for triples. We will be able to keep track of which question they are associated with, and which document they are constructed from.

In [4]:
# Progress bookkeeping: `tot` is the number of topics summed over every
# question, `cnt` is how many of those have been handled so far.
tot = sum(len(entry['topics']) for entry in hotpot_files.values())
cnt = 0

# Run one extraction call per question id, passing that question's txt files.
for hid, entry in hotpot_files.items():
    n = len(entry['topics'])

    extract_kg(
        entity_types=entity_types,
        relation_types=relation_types,
        data_dir=path_to_data,
        txt_files=entry['file_paths'],
    )

    cnt += n
    print(f'Completed {cnt} out of {tot} kg extractions.')

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/103rd_grey_cup.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 103rd_grey_cup.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1959_illinois_fighting_illini_football_team.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1959_illinois_fighting_illini_football_team.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1986_grand_prix_german_open.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1986_grand_prix_german_open.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2009_serena_williams_tennis_season.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 2009_serena_williams_tennis_season.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2013_liqui_moly_bathurst_12_hour.json'
 - - - - - - 

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/103rd_grey_cup.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 103rd_grey_cup.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1959_illinois_fighting_illini_football_team.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1959_illinois_fighting_illini_football_team.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1986_grand_prix_german_open.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1986_grand_prix_german_open.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2009_serena_williams_tennis_season.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 2009_serena_williams_tennis_season.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2013_liqui_moly_bathurst_12_hour.json'
 - - - - - - 

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/103rd_grey_cup.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 103rd_grey_cup.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1959_illinois_fighting_illini_football_team.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1959_illinois_fighting_illini_football_team.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1986_grand_prix_german_open.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1986_grand_prix_german_open.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2009_serena_williams_tennis_season.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 2009_serena_williams_tennis_season.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2013_liqui_moly_bathurst_12_hour.json'
 - - - - - - 

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/103rd_grey_cup.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 103rd_grey_cup.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1959_illinois_fighting_illini_football_team.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1959_illinois_fighting_illini_football_team.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1986_grand_prix_german_open.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1986_grand_prix_german_open.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2009_serena_williams_tennis_season.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 2009_serena_williams_tennis_season.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2013_liqui_moly_bathurst_12_hour.json'
 - - - - - - 

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/103rd_grey_cup.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 103rd_grey_cup.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1959_illinois_fighting_illini_football_team.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1959_illinois_fighting_illini_football_team.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1986_grand_prix_german_open.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1986_grand_prix_german_open.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2009_serena_williams_tennis_season.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 2009_serena_williams_tennis_season.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2013_liqui_moly_bathurst_12_hour.json'
 - - - - - - 

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/103rd_grey_cup.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 103rd_grey_cup.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1959_illinois_fighting_illini_football_team.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1959_illinois_fighting_illini_football_team.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1986_grand_prix_german_open.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1986_grand_prix_german_open.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2009_serena_williams_tennis_season.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 2009_serena_williams_tennis_season.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2013_liqui_moly_bathurst_12_hour.json'
 - - - - - - 

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/103rd_grey_cup.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 103rd_grey_cup.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1959_illinois_fighting_illini_football_team.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1959_illinois_fighting_illini_football_team.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/1986_grand_prix_german_open.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 1986_grand_prix_german_open.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2009_serena_williams_tennis_season.json'
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Already extracted 2009_serena_williams_tennis_season.

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
'./kg_files/2013_liqui_moly_bathurst_12_hour.json'
 - - - - - - 

### Next up

Now that the triples are scraped, head over to `hotpot_kg_rag.ipynb` to see how we can use the extracted triples for RAG on `hotpot_qa`. 

You can run `scripts/hotpot_qa_kgs.py` for a scripted extraction call. 