In [1]:
import numpy as np
from collections import defaultdict, Counter, OrderedDict
import pickle
import yaml, json
from schema import Schema, And, Use, Optional, SchemaError

from tqdm.notebook import tqdm

In [2]:
#from typing import List, Optional
#from pydantic import BaseModel

from pathlib import Path

In [3]:
# Paths of the per-decade pickle files: one path for every decade start year
# from 1660 through 1990 (the range end, 2000, is exclusive).
decade_start_years = range(1660, 2000, 10)
file_paths = list(
    f"../data/dicts/doc2yaml_{year}.pkl" for year in decade_start_years
)

In [4]:
# Sanity check: how many per-decade pickle paths were generated.
len(file_paths)

34

In [5]:
# Merge the dictionaries stored in all pickle files into one mapping.
# NOTE: pickle.load can execute arbitrary code while loading; only run this
# on files that come from a trusted source.
doc2text = {}

for file_path in tqdm(file_paths):

    # read the dictionary stored in this file ...
    with open(file_path, "rb") as file:
        doc2text_decade = pickle.load(file)

    # ... and fold it into the combined dictionary; if a key occurs in more
    # than one file, the entry loaded last wins
    doc2text.update(doc2text_decade)

  0%|          | 0/34 [00:00<?, ?it/s]

In [6]:
# Number of entries in the merged dictionary.
len(doc2text)

47837

In [7]:
# Show the stored text of one example document.
print(doc2text['101533'])

article_id: 101533
revised_title: 'Kermes Grain: A Natural Dye for Wool'
semantic_tags:
  - Dyeing
  - Natural dyes
  - Wool processing
  - Textile arts
  - Botany
tldr: The author describes the process of extracting a red dye from the grain of kermes, a substance found on a shrub in Languedock. The dye is obtained by spreading the ripe grain on linen, separating the red powder, and exposing it to the sun. The resulting dye is used for coloring wool. The author also notes that if the powder is not treated with an acid, it forms small flies that change color before dying.
scientific_discipline: Chemistry > Materials Science
```


In [8]:
def get_key_value_pair(line_str):
    """
    Split one line of YAML-like text at its first colon.

    Given a str as in 'revised_title: "Caution: This is title"',
    return a tuple ('revised_title', "Caution: This is title").

    Parameters
    ----------
    line_str : str
        A single line of text.

    Returns
    -------
    tuple
        (key, value), both stripped of surrounding whitespace. One pair of
        matching outer quotes (single or double) around the value is removed.
        The value is '' when only whitespace (or nothing) follows the colon.
        If the line contains no colon at all, (line_str, None) is returned
        with the line left unchanged.
    """
    key_part, colon, value_part = line_str.partition(":")

    if not colon:
        # no colon in the line
        return (line_str, None)

    key_str = key_part.strip()

    # remove leading and trailing whitespaces if any
    value_str = value_part.strip()

    # 'key:' and 'key:   ' both carry no value. Deciding this on the stripped
    # remainder, not on the last character of the raw line, avoids indexing
    # into an empty string when whitespace trails the colon, and keeps values
    # that merely happen to end with a colon.
    if not value_str:
        return key_str, ''

    # when " '...' ",  we don't need the outer double quotes, and
    # when ' "..." ',  we don't need the outer single quotes
    # so we take everything except first and last char via [1:-1]
    # but only if necessary
    leading_char = value_str[0]
    trailing_char = value_str[-1]
    if leading_char == trailing_char and leading_char in ('"', "'"):
        value_str = value_str[1:-1].strip()

    return key_str, value_str

# Smoke test for get_key_value_pair: show the parsed pairs, then check them
# against the expected tuples.
quoted_title_pair = get_key_value_pair('revised_title: "Caution: This is a title"')
bare_key_pair = get_key_value_pair('topics:')

print(quoted_title_pair)
print(bare_key_pair)

assert quoted_title_pair == ('revised_title', 'Caution: This is a title')
assert bare_key_pair == ('topics', '')

('revised_title', 'Caution: This is a title')
('topics', '')


In [9]:
# (input line, expected result) pairs for get_key_value_pair.
# A line without any colon is expected back unchanged, paired with None.
test_cases = [
    # valid input with a colon and a double-quoted value
    ('revised_title: "Caution: This is title"',
     ('revised_title', 'Caution: This is title')),
    # empty input
    ('', ('', None)),
    # input with leading/trailing spaces
    ('  revised_title: Caution: This is title  ',
     ('revised_title', 'Caution: This is title')),
    # input with multiple colons
    ('revised_title: Caution: This: is: title',
     ('revised_title', 'Caution: This: is: title')),
    # input without a colon, holding a topic list item
    ('- "Geometry"', ('- "Geometry"', None)),
]

for line_str, expected_output in test_cases:
    assert get_key_value_pair(line_str) == expected_output

print("All test cases passed!")

All test cases passed!


In [10]:
# Write each string in doc2text to its own text file on disk, named after
# its key. Files are written as UTF-8 so the text is stored as-is.
plain_text_dir = Path("../data/llm_plain_texts")

# create the output folder if it is missing, so that a run on a fresh
# checkout does not stop with FileNotFoundError at the first open()
plain_text_dir.mkdir(parents=True, exist_ok=True)

for doc_id, text in tqdm(doc2text.items()):
    with open(plain_text_dir / f"{doc_id}.txt", "w", encoding='utf-8') as file:
        file.write(text)

  0%|          | 0/47837 [00:00<?, ?it/s]

In [11]:
# Parse each document's YAML-like string into a dictionary.
#
# The text is read line by line with get_key_value_pair instead of a YAML
# loader: every 'key: value' line becomes a string entry, and the items
# listed under the 'semantic_tags' key are collected into a list without
# duplicates.
doc2dict = {}

for doc in tqdm(
    doc2text, desc="Parsing yaml string into dictionary", 
    total=len(doc2text)
    ):

    doc2dict[doc] = {} # perhaps consider OrderedDict() instead

    # zeroth case: sometimes there are empty lines; get rid of them
    lines = [l for l in doc2text[doc].split("\n") if len(l) > 0]

    # Longer-than-usual outputs may need cleaning, but only when the first
    # line looks as expected:
    #   * last line is 'scientific_subdiscipline: ...': there are simply more
    #     tags than usual, which is fine -- nothing to do.
    #   * last line is a '```' fence: the YAML was sometimes emitted twice,
    #     with an additional ```yaml ... ``` block at the end; keep only the
    #     lines before the first fence.
    # Outputs carrying extra keys such as scientific_discipline_2 or
    # scientific_subdiscipline_2 are not cleaned here.
    if (
        len(lines) > 11
        and lines[0].startswith("article_id: ")
        and lines[-1].startswith("```")
    ):
        first_fence = next(
            i for i, l in enumerate(lines) if l.startswith("```")
        )
        lines = lines[:first_fence]

    # keeps track of whether the tag list has started; it is toggled to
    # True when the key 'semantic_tags' is seen and stays True afterwards
    isTopicList = False

    for line in lines:

        key_str, value_str = get_key_value_pair(line)

        # skip code-fence lines starting with '```' or the look-alike '´´´'
        if key_str[:3] in ('```', '´´´'):
            continue

        stripped_line = line.strip()

        # Once the tag list has started, a line beginning with '- ' is a list
        # item. This is checked before the key/value case so that a tag whose
        # text contains a colon is not stored as a bogus key/value entry, and
        # the whole remainder of the line is kept so that a tag containing
        # ' - ' is not cut short.
        if isTopicList and stripped_line.startswith("- "):
            topic_item = stripped_line[2:].strip()

            # to avoid duplicates, only add if not already in the list
            if topic_item and topic_item not in doc2dict[doc][topic_key_str]:
                doc2dict[doc][topic_key_str].append(topic_item)

            continue # move to the next line

        # if a key-value pair (e.g., 'article_id: 12345')
        # simply read as key and value
        if key_str and value_str:
            doc2dict[doc][key_str] = str(value_str)

        # if the key that is followed by list items, start an empty list;
        # the items themselves arrive on the following lines
        elif key_str == 'semantic_tags' and not isTopicList:
            topic_key_str = key_str
            doc2dict[doc][topic_key_str] = []
            isTopicList = True

        # Anything else is ignored. In some cases the 'semantic_tags' key is
        # repeated more than once; such a repeated line ends up here and is
        # skipped, so the tags collected so far are kept.


Parsing yaml string into dictionary:   0%|          | 0/47837 [00:00<?, ?it/s]

In [23]:
# Spot-check the parsed dictionary of a single document.
doc2dict['rstb_1911_0004']

{'article_id': 'rstb_1911_0004',
 'revised_title': 'The Lignite of Bovey Tracey: A Reassessment of Age and Flora',
 'semantic_tags': ['Paleontology',
  'Geology',
  'Botany',
  'Oligocene period',
  'Lignite'],
 'scientific_discipline': 'Earth Sciences > Geology'}

In [21]:
# A value that must be a string with at least one character.
# (The check has to be `> 0`: `len(s) >= 0` holds for every string and
# therefore validates nothing.)
non_empty_str = And(str, lambda s: len(s) > 0)

# Expected structure of one parsed document: which keys are required, which
# are optional, and what type each value must have.
schema = Schema(
    {
        Optional('article_id'): non_empty_str,
        'revised_title': non_empty_str,
        # a list whose items are all strings (an empty list passes)
        'semantic_tags': And(
            list, 
            lambda tags: all(
                isinstance(tag, str) for tag in tags
            )
        ),
        'tldr': non_empty_str,
        'scientific_discipline': non_empty_str,
        # NOTE(review): the parsing cell looks for a line starting with
        # 'scientific_subdiscipline: ' -- confirm that 'sub_discipline' is
        # the key name intended here.
        Optional('sub_discipline'): non_empty_str,
        #Optional('scientific_subdiscipline_2'): non_empty_str
    }
)


def is_valid_yaml(yaml_str):
    """
    Report whether a parsed document satisfies the module-level `schema`.

    Parameters
    ----------
    yaml_str : dict
        Despite the name, this notebook passes the dictionary that was
        parsed from a document's text, not a raw string.

    Returns
    -------
    bool
        True if `schema.validate` accepts the data, False if it raises
        SchemaError.
    """
    try:
        schema.validate(yaml_str)
    except SchemaError:
        # only validation failures mean "invalid"; any other exception is a
        # programming error and is allowed to surface
        return False

    return True

# Keep only the documents whose parsed dictionary passes validation and
# report the id of every document that does not.
doc2yaml = {}
for doc, yaml_str in doc2dict.items():

    if is_valid_yaml(yaml_str):
        doc2yaml[doc] = yaml_str
        continue

    print(f"{doc} is not a valid YAML!")

101889 is not a valid YAML!
101839 is not a valid YAML!
102506 is not a valid YAML!
101992 is not a valid YAML!
105795 is not a valid YAML!
107230 is not a valid YAML!
108200 is not a valid YAML!
108278 is not a valid YAML!
112723 is not a valid YAML!
108771 is not a valid YAML!
112791 is not a valid YAML!
112368 is not a valid YAML!
rspl_1872_0003 is not a valid YAML!
rspl_1873_0018 is not a valid YAML!
rspl_1878_0002 is not a valid YAML!
rspl_1873_0004 is not a valid YAML!
rspl_1887_0177 is not a valid YAML!
rspl_1883_0002 is not a valid YAML!
rspl_1887_0172 is not a valid YAML!
rspl_1887_0109 is not a valid YAML!
rspl_1887_0141 is not a valid YAML!
rspl_1886_0079 is not a valid YAML!
rspl_1885_0002 is not a valid YAML!
rspl_1880_0079 is not a valid YAML!
rspl_1886_0083 is not a valid YAML!
rsta_1897_0015 is not a valid YAML!
rspl_1892_0068 is not a valid YAML!
rspl_1897_0042 is not a valid YAML!
rspa_1906_0001 is not a valid YAML!
rspl_1900_0039 is not a valid YAML!
rsta_1906_0004 i

In [22]:
# Entry counts before and after validation, and the share that passed.
len(doc2text), len(doc2yaml), f"{(len(doc2yaml)/len(doc2text))*100:.2f}% valid YAML."

(47837, 47558, '99.42% valid YAML.')

In [29]:
# Spot-check one record that passed validation.
doc2yaml['112311']

{'article_id': '112311',
 'revised_title': 'Thermo-Electric Currents of Ritterian Species: A Study on Electromotive Force and Metal Tensions',
 'topics': ['Thermo-electricity',
  'Electromotive Force',
  'Ritterian Species',
  'Metal Tensions',
  'Thermo-electric Inversions'],
 'tldr': 'The author, C.K. Akin, investigates the electromotive force of thermo-electric couples and the role of metal tensions in their function. He demonstrates that the electromotive force can be expressed as the difference between two quantities, dependent on the temperature and the nature of the circuit. Akin also explores the influence of the texture and chemical nature of substances on thermo-electric function and discusses the possibility of thermo-electric inversions.',
 'scientific_discipline': 'Physics',
 'scientific_subdiscipline': 'Electromagnetism & Electromechanical Systems'}

In [30]:
# Save every validated document as its own YAML file.
yaml_dir = Path("../data/yaml_files")

# create the output folder if it is missing, so that a run on a fresh
# checkout does not stop with FileNotFoundError at the first open()
yaml_dir.mkdir(parents=True, exist_ok=True)

for doc, yaml_dict in tqdm(doc2yaml.items()):

    file_path = yaml_dir / f"{doc}.yaml"
    
    with open(file_path, "w", encoding="utf-8") as file:
        # allow_unicode=True writes non-ASCII characters as-is instead of
        # escaping them; default_flow_style=False selects block style
        yaml.safe_dump(yaml_dict, 
            file,
            allow_unicode=True,
            encoding='utf-8',
            default_flow_style=False
        )

  0%|          | 0/17486 [00:00<?, ?it/s]

In [31]:
# Save every validated document as its own JSON file.
json_dir = Path("../data/json_files")

# create the output folder if it is missing, so that a run on a fresh
# checkout does not stop with FileNotFoundError at the first open()
json_dir.mkdir(parents=True, exist_ok=True)

for doc, yaml_dict in tqdm(doc2yaml.items()):

    file_path = json_dir / f"{doc}.json"
    
    with open(file_path, "w", encoding="utf-8") as file:
        # ensure_ascii=False writes non-ASCII characters as UTF-8 text
        # instead of \uXXXX escapes, consistent with allow_unicode=True in
        # the YAML export; the parsed content is the same either way
        json.dump(yaml_dict, file, ensure_ascii=False)

  0%|          | 0/17486 [00:00<?, ?it/s]