# Summarise Blood Pressure and Arterial Function Laboratory studies

This code reads the study metadata files (JSON format) in the current folder and writes a Markdown file containing a tabular summary of those studies.

In [5]:
# import required libraries
import json
import pandas as pd
import os

In [6]:
# Load the text of the template markdown file; it holds the top-level
# information for the markdown file that is generated
with open("summarise_studies_template.md") as template_file:
    content = template_file.read()

In [7]:
# Metadata file that is excluded from the summary (presumably the blank
# template that new study files are copied from)
TEMPLATE_FILENAME = "000000TPLT.json"

# Summary-table column name -> key in the study metadata JSON
STUDY_FIELDS = {
    'study title': 'study_title',
    'study id': 'study_id',
    'study location': 'study_location',
    'folder': 'folder_location',
    'ethics': 'ethics_location',
    'Pure record': 'pure_record',
    'start date': 'study_start_date',
    'end date': 'study_end_date',
    'description': 'study_description',
    'principal investigator(s)': 'principal_investigator',
    'other investigators': 'personnel',
}


def read_study_record(path):
    """Read one study metadata JSON file and return its summary-table row.

    Args:
        path: Path of the JSON metadata file.

    Returns:
        A dict keyed by summary-table column name. 'sample size' is the
        number of entries in the file's 'id_list'.

    Raises:
        KeyError: If one of the expected keys is missing from the file.
    """
    with open(path, 'r') as f:
        data = json.load(f)
    record = {column: data[key] for column, key in STUDY_FIELDS.items()}
    record['sample size'] = len(data['id_list'])
    return record


def build_study_table(filenames):
    """Build the study summary table, one row per metadata file.

    Files whose base name is TEMPLATE_FILENAME are skipped. Rows are
    collected in a list and the DataFrame is built once, so the result does
    not depend on where (or whether) the template appears in `filenames`,
    and each row gets its own index label (0..n-1).

    Args:
        filenames: Iterable of paths to study metadata JSON files.

    Returns:
        A DataFrame with the STUDY_FIELDS columns plus 'sample size'. It is
        empty (but still has those columns) when no study files are given.
    """
    records = []
    for filename in filenames:
        if os.path.basename(filename) == TEMPLATE_FILENAME:
            continue
        records.append(read_study_record(filename))
        print(filename, "read.")
    return pd.DataFrame(records, columns=[*STUDY_FIELDS, 'sample size'])


# Get list of files with extension ".json" in the current folder.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# row order of the table reproducible.
json_files = sorted(f for f in os.listdir('.') if f.endswith('.json'))

# Create table with top level study information from json files
df = build_study_table(json_files)

198200BJNG.json read.
198501AORT.json read.
200901PCNG.json read.
201201PCHM.json read.
201809MSTR.json read.
201905MMO.json read.
202001BLMO.json read.
202007FLDG.json read.
202111OSAB.json read.
202201ORTH.json read.
202209APPG.json read.
202303IABD.json read.
202303QNTS.json read.
202304IABP.json read.
202305BACU.json read.


In [8]:
# Collapse the start and end dates into one "date" column of the form
# "<start>-<end>", kept in the position of the old 'start date' column
df['start date'] = df['start date'] + '-' + df['end date']
df = df.drop(columns=['end date']).rename(columns={'start date': 'date'})

# Put linked text instead of full URLs in the table
def create_markdown_link(url, text):
    """Return a markdown inline link that shows `text` and points at `url`."""
    return '[{}]({})'.format(text, url)

# Replace each url with a markdown link that shows a short fixed label
for column, label in (('folder', 'data'), ('ethics', 'ethics'), ('Pure record', 'Pure')):
    df[column] = df[column].apply(create_markdown_link, args=(label,))

# Merge the three link columns into a single "links" column
df['folder'] = df['folder'] + ';<br>' + df['ethics'] + ';<br>' + df['Pure record']
df = df.drop(columns=['ethics', 'Pure record']).rename(columns={'folder': 'links'})

# Fold the study ID (before) and the study location (after) into the title column
df['study title'] = df['study id'] + '<br>' + df['study title'] + '<br>' + df['study location']
df = df.drop(columns=['study id', 'study location'])

# Write the output file in markdown format: the template text, a newline,
# then the summary table rendered with tablefmt="github"
with open("README.md", "w") as readme:
    print(content, file=readme)  # writes the template text followed by "\n"
    readme.write(df.to_markdown(tablefmt="github"))
