In [1]:
import numpy as np
from collections import defaultdict, Counter, OrderedDict
import pickle

In [122]:
import yaml, json
from schema import Schema, And, Use, Optional, SchemaError

In [3]:
from tqdm.notebook import tqdm

In [4]:
#from typing import List, Optional
from pydantic import BaseModel

from pathlib import Path

In [86]:
# Pickled inputs to load, one per period label, in loading order.
file_paths = [
    f"../data/doc2yaml_{period}.pkl"
    for period in (
        "1660_1750",
        "1660_1810",
        "1820",
        "1830",
        "1840",
        "1850",
        "1860",
        "1870",
        "1880",
        "1890",
        "1900",
        "1910",
        "1920",
    )
]

In [87]:
# Merge every pickled dictionary into one mapping. Files are applied in
# list order, so a key that occurs in several files keeps the value from
# the file loaded last.
doc2text = {}

for file_path in tqdm(file_paths):
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(file_path, "rb") as file:
        doc2text_decade = pickle.load(file)
        doc2text.update(doc2text_decade)

  0%|          | 0/13 [00:00<?, ?it/s]

In [88]:
# Total number of entries collected in doc2text across all loaded files.
len(doc2text)


17520

In [89]:

# # Specify the file path
# file_path = "../data/doc2yaml_1660_1810.pkl"

# # Read the dictionary from the file
# with open(file_path, "rb") as file:
#     doc2text2 = pickle.load(file)

In [90]:
def get_key_value_pair(line_str):
    """
    Split a 'key: value' line at its first colon.

    Given a str as in 'revised_title: "Caution: This is title"',
    return a tuple ('revised_title', "Caution: This is title").

    Parameters
    ----------
    line_str : str
        A single line of YAML-like text.

    Returns
    -------
    tuple
        (key, value) with both parts stripped of surrounding whitespace.
        One pair of matching outer quotes (single or double) is removed
        from the value; an unquoted value such as `12345` is returned
        unchanged. A key with nothing after the colon yields (key, '').
        If the line contains no colon, (line_str, None) is returned with
        line_str untouched.
    """
    key_part, colon, value_part = line_str.partition(":")

    if not colon:
        # no colon in the line
        return (line_str, None)

    key_str = key_part.strip()
    value_str = value_part.strip()

    # Only drop the outer characters when they really are a matching pair
    # of quotes; slicing unconditionally would eat the first and last
    # character of unquoted values.
    if (
        len(value_str) >= 2
        and value_str[0] == value_str[-1]
        and value_str[0] in ('"', "'")
    ):
        value_str = value_str[1:-1]

    return key_str, value_str

# Smoke-check get_key_value_pair on a quoted value and on a bare key.
quoted_pair = get_key_value_pair('revised_title: "Caution: This is title"')
bare_key_pair = get_key_value_pair('topics:')

print(quoted_pair)
print(bare_key_pair)

assert quoted_pair == ('revised_title', 'Caution: This is title')
assert bare_key_pair == ('topics', '')

('revised_title', 'Caution: This is title')
('topics', '')


In [91]:
# Table-driven checks for get_key_value_pair: (input line, expected result).
# A 'key value' line without any colon is not exercised on its own; the
# list-item case at the end covers the no-colon path.
test_cases = [
    # Valid input with a colon
    ('revised_title: "Caution: This is title"',
     ('revised_title', 'Caution: This is title')),
    # Empty input: no colon, so the line itself is returned with None
    ('', ('', None)),
    # Leading/trailing spaces around the whole line
    ('  revised_title: "Caution: This is title"  ',
     ('revised_title', 'Caution: This is title')),
    # Multiple colons: only the first one separates key from value
    ('revised_title: "Caution: This: is: title"',
     ('revised_title', 'Caution: This: is: title')),
    # No colon, but a topic list item
    ('- "Geometry"', ('- "Geometry"', None)),
]

for line_str, expected_output in test_cases:
    assert get_key_value_pair(line_str) == expected_output

print("All test cases passed!")

All test cases passed!


In [102]:
# Parse each document's YAML-like text into a dictionary.
#
# Scalar lines ('key: "value"') become string entries; the lines following
# the 'topics' key that start with '-' are collected into a list.


def _unquote(text):
    """Strip surrounding whitespace and one pair of matching outer quotes, if any."""
    text = text.strip()
    if len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'"):
        return text[1:-1]
    return text


def _parse_topic_item(line):
    """
    Return the topic text of a list item line such as '- "Electro-magnetism"',
    or None when the line is not a list item (does not start with '-').
    """
    stripped = line.strip()
    if not stripped.startswith("-"):
        return None
    # Drop only the leading dash; splitting on every '-' would cut
    # hyphenated topics short.
    return _unquote(stripped[1:])


doc2dict = {}

for doc in tqdm(doc2text, desc="Parsing yaml string into dictionary", total=len(doc2text)):

    doc2dict[doc] = {} # perhaps consider OrderedDict() instead

    # zeroth case: sometimes there are empty lines; get rid of them
    lines = [l for l in doc2text[doc].split("\n") if len(l) > 0]

    # More lines than usual means some cleaning may be needed.
    if len(lines) > 11:

        if lines[0].startswith("article_id: "):

            if lines[-1].startswith("scientific_subdiscipline: "):
                # first case: there are more than 5 topics
                # this is OK, as long as first & last line
                # behave as expected
                pass # do nothing here for now

            # Not handled (previously tried and disabled): outputs where the
            # LLM appends extra categories such as scientific_discipline_2 /
            # scientific_subdiscipline_2. Those documents are left as-is here.

            elif lines[-1].startswith("```"):
                # sometimes the LLM outputs the yaml twice, with an
                # additional ```yaml ... ``` at the end; keep only the
                # lines before the first fence
                for i, l in enumerate(lines):
                    if l.startswith("```"):
                        lines = lines[:i]
                        break

    # Tracks whether the 'topics' key has been seen for this document.
    # It is switched on at the 'topics' line and stays on afterwards.
    isTopicList = False

    for line in lines:

        # Once inside the topic list, list items are recognised first so
        # that a topic containing a colon (e.g. '- "Optics: lenses"') is not
        # mistaken for a key-value pair.
        if isTopicList:
            topic_item = _parse_topic_item(line)
            if topic_item is not None:
                doc2dict[doc][topic_key_str].append(topic_item)
                continue # move to the next line

        key_str, value_str = get_key_value_pair(line)

        # a key-value pair (e.g., 'article_id: 12345'):
        # simply store key and value
        if key_str and value_str:
            doc2dict[doc][key_str] = str(value_str)

        # the key that introduces the list items
        elif key_str == 'topics' and not isTopicList:
            topic_key_str = key_str
            doc2dict[doc][topic_key_str] = []
            isTopicList = True

        # anything else after 'topics' that is neither a list item nor a
        # key-value pair (e.g. a repeated 'topics:' line) is reported
        elif isTopicList:
            print(f"{doc} did not parse {line}, {len(lines)},  {lines[0]}")


Parsing yaml string into dictionary:   0%|          | 0/17520 [00:00<?, ?it/s]

109700 did not parse topics:, 15,  article_id: "109700"
108426 did not parse topics:, 13,  article_id: "108426"
rspl_1873_0058 did not parse topics:, 12,  article_id: "rspl_1873_0058"
rspl_1879_0038 did not parse topics:, 12,  article_id: "rspl_1879_0038"
rspl_1879_0070 did not parse topics:, 12,  article_id: "rspl_1879_0070"
rstl_1872_0010 did not parse topics:, 12,  article_id: "rstl_1872_0010"
rspl_1893_0034 did not parse topics:, 12,  article_id: "rspl_1893_0034"
rspl_1895_0105 did not parse topics:, 12,  article_id: "rspl_1895_0105"
rspa_1906_0055 did not parse topics:, 12,  article_id: "rspa_1906_0055"
rspb_1906_0049 did not parse topics:, 13,  article_id: "rspb_1906_0049"
rspb_1909_0042 did not parse topics_for_search:, 17,  article_id: "rspb_1909_0042"
rstb_1904_0010 did not parse topics:, 13,  article_id: "rstb_1904_0010"
rspa_1916_0041 did not parse topics:, 12,  article_id: "rspa_1916_0041"
rsta_1912_0012 did not parse topics_short:, 17,  article_id: "rsta_1912_0012"


In [107]:
# Validator for required text fields: the value must be a str and must not
# be empty. (A `len(s) >= 0` check would accept every string, including '',
# so the comparison has to be strict.)
_non_empty_str = And(str, lambda s: len(s) > 0)

# Expected shape of one parsed document: five non-empty text fields plus a
# list of topic strings. Keys not listed here are not part of the schema.
schema = Schema(
    {
        'article_id': _non_empty_str,
        'revised_title': _non_empty_str,
        'topics': And(list, lambda topics: all(isinstance(topic, str) for topic in topics)),
        'tldr': _non_empty_str,
        'scientific_discipline': _non_empty_str,
        'scientific_subdiscipline': _non_empty_str,
        #Optional('scientific_discipline_2'): And(str, lambda s: len(s) >= 0),
        #Optional('scientific_subdiscipline_2'): And(str, lambda s: len(s) >= 0)
    }
)


def is_valid_yaml(yaml_str):
    """
    Check one parsed document against the module-level `schema`.

    Parameters
    ----------
    yaml_str : dict
        Despite the name, this is the parsed dictionary for a document
        (the values of `doc2dict`), not raw YAML text.

    Returns
    -------
    bool
        True if `schema.validate` accepts the data, False if it raises
        SchemaError.

    Only SchemaError is treated as "invalid data"; any other exception
    (for example a NameError from a missing `schema`) propagates instead
    of being silently reported as an invalid document.
    """
    try:
        schema.validate(yaml_str)
    except SchemaError:
        return False

    return True

# Keep only the documents whose parsed dictionary passes validation and
# report every document that does not.
doc2yaml = {}
for doc, yaml_str in doc2dict.items():

    if is_valid_yaml(yaml_str):
        doc2yaml[doc] = yaml_str
        continue

    print(f"{doc} is not a valid YAML!")

100943 is not a valid YAML!
101568 is not a valid YAML!
101704 is not a valid YAML!
102297 is not a valid YAML!
102437 is not a valid YAML!
102947 is not a valid YAML!
106125 is not a valid YAML!
106284 is not a valid YAML!
106670 is not a valid YAML!
106691 is not a valid YAML!
107010 is not a valid YAML!
109700 is not a valid YAML!
rspl_1873_0058 is not a valid YAML!
rspl_1879_0038 is not a valid YAML!
rspl_1879_0070 is not a valid YAML!
rstl_1872_0010 is not a valid YAML!
rstl_1875_0002 is not a valid YAML!
rstl_1876_0005 is not a valid YAML!
rspl_1881_0128 is not a valid YAML!
rspl_1891_0028 is not a valid YAML!
rspl_1893_0034 is not a valid YAML!
rspl_1894_0070 is not a valid YAML!
rspl_1894_0138 is not a valid YAML!
rspl_1894_0168 is not a valid YAML!
rspl_1895_0105 is not a valid YAML!
rspl_1897_0043 is not a valid YAML!
rspl_1899_0103 is not a valid YAML!
rspa_1906_0055 is not a valid YAML!
rspb_1905_0009 is not a valid YAML!
rspb_1906_0049 is not a valid YAML!
rspl_1902_0122 i

In [109]:
# Summary: documents loaded, documents that passed validation, and the pass rate.
(
    len(doc2text),
    len(doc2yaml),
    f"{(len(doc2yaml)/len(doc2text))*100:.2f}% valid YAML.",
)

(17520, 17486, '99.81% valid YAML.')

In [127]:
# save as yaml files: one '<doc>.yaml' file per validated document

# Create the output directory up front so that a fresh checkout (or a
# cleaned data folder) does not fail with FileNotFoundError on the first
# open() call.
Path("../data/yaml_files").mkdir(parents=True, exist_ok=True)

for doc, yaml_dict in tqdm(doc2yaml.items()):

    file_path = f"../data/yaml_files/{doc}.yaml"

    with open(file_path, "w", encoding="utf-8") as file:
        yaml.safe_dump(yaml_dict,
            file,
            allow_unicode=True,
            encoding='utf-8',
            default_flow_style=False
        )

  0%|          | 0/17486 [00:00<?, ?it/s]

In [126]:
# save as json files: one '<doc>.json' file per validated document

# Create the output directory up front so that a fresh checkout (or a
# cleaned data folder) does not fail with FileNotFoundError on the first
# open() call.
Path("../data/json_files").mkdir(parents=True, exist_ok=True)

for doc, yaml_dict in tqdm(doc2yaml.items()):

    file_path = f"../data/json_files/{doc}.json"

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(yaml_dict, file)

  0%|          | 0/17486 [00:00<?, ?it/s]