In [5]:
# Import the json module, used below to load the "data.json" file
import json
# Clean the data (lowercasing, removing "-\n" line-break hyphens and newlines).
# Two cleaned copies are kept because the different information-extraction
# methods used for the CIOMS form need different text:
#   original - casing preserved
#   data     - lowercased, and with every "’s" removed
def _unwrap_lines(text):
    """Drop hyphen+newline pairs, then replace every remaining newline with a space."""
    return text.replace('-\n', '').replace('\n', ' ')

# The file is read once and both copies are derived from the same parsed
# object. The encoding is pinned to UTF-8 so that the curly apostrophe matched
# below decodes the same way regardless of the platform's default encoding.
with open('./data.json', encoding='utf-8') as data_file:
    raw_fields = json.load(data_file)

original = {prop: _unwrap_lines(text) for prop, text in raw_fields.items()}
data = {
    # Lowercase first, then unwrap lines, then strip the possessive suffix.
    attribute: _unwrap_lines(text.lower()).replace('’s', '')
    for attribute, text in raw_fields.items()
}
data


{'file_name': 'literature_300.pdf',
 'title': 'rasmussen encephalitis presenting as progressive parietal dysfunction sans seizures',
 'header': 'divyani garg a , ayush agarwal a , deepa dash a, ⁎ , swati mahajan b , rajesh kumar singh a , rohit bhatia a , manjari tripathi a a department of neurology, all india institute of medical sciences, new delhi, india b department of neuropathology, all india institute of medical sciences, new delhi, india article info keywords: encephalitis autoimmune rasmussen encephalitis parietal lobe apraxia hemiatrophy 1. ',
 'abstract': ' rasmussen encephalitis (re) is a rare immune-mediated condition that classically presents in children with focal epilepsy or epilepsia partialis continua, progressive hemiplegia associated with cognitive deterioration and chronic unilateral cortical inflammation and hemiatrophy [1]. adults may have atypical manifestations including a prolonged prodromal phase, poorly defined residual period and slower progression [2]. adu

In [6]:
# Blank CIOMS form template: every field starts out as an empty string and the
# two date fields are separate day/month/year records.
# NOTE: Need more properties
cioms_form = dict(
    patient_initials="",
    country="",
    date_of_birth=dict(day="", month="", year=""),
    age_years="",
    sex="",
    reaction_onset=dict(day="", month="", year=""),
    reaction_description="",
)


In [8]:
import re 
import pycountry
import pandas as pd

# Read in the excel file containing all disease names listed in the International Council for Harmonisation of Technical Requirements for Pharmaceuticals for Human Use, MedDRA
data_df = pd.read_excel('medra-3.xlsx', sheet_name = '23.1 IME List')
# Converted file into json for convenience; `medra` is a list with one dict per row.
# Format of each row: {
# MedDRA Code: Disease iD Number
# PT Name: Disease name
# SOC Name: Class of Disease
# }
json_str = data_df.to_json(orient = 'records')
medra = json.loads(json_str)

# disease_counts maps each disease name (PT Name) found in the article to the
# number of times it occurs across all fields of `data`; meddra_codes maps the
# same names to their MedDRA Code. (The counts dict was previously called
# `map`, which hid the builtin of the same name.)
disease_counts = {}
meddra_codes = {}
for term in medra:
    pt_name = term["PT Name"]
    # Only multi-word names are considered; rows without a name are skipped.
    if not pt_name or ' ' not in pt_name:
        continue
    needle = pt_name.lower()
    # `data` is already lowercased, so this is a case-insensitive substring count.
    occurrences = sum(text.count(needle) for text in data.values())
    # Keep the name only if it appears at least once.
    if occurrences > 0:
        disease_counts[pt_name] = occurrences
        meddra_codes[pt_name] = term["MedDRA Code"]

# Find disease name associated with the highest value to obtain the disease name stated most in the text
if not disease_counts:
    raise ValueError("No multi-word MedDRA PT Name from 'medra-3.xlsx' was found in the article text")
disease = max(disease_counts, key = disease_counts.get)
iD = meddra_codes[disease]

# Fill out CIOMS Form for 7+13 DESCRIBE REACTION(S) (including relevant tests/lab data)
# NOT FULLY DONE, still need to extract date received and suspect drug
coded_term = "{} ({})".format(disease, iD)
cioms_form["reaction_description"] = (
    "{} ({}, {})".format(disease.upper(), coded_term, coded_term)
    + "\n\nThis case, derived from a full text scientific literature study article, was received on"
    + "\n"
    # The space before "while" was missing, which glued the disease name to it.
    + "This case refers to a patient who experienced the event " + disease.lower() + " while on therapy with "
    + "\n"
    + "Case report:" + original["case"]
)
# Fill out 1a.COUNTRY by searching for a country name in header with pycountry which provides the ISO database for countries
# A name only counts when it stands alone as whole words, so that e.g. "oman"
# is not found inside an author name such as "norman". When several names
# match, the longest one is kept so that a more specific name is not replaced
# by a shorter name contained in it.
header_text = data["header"]
matched_country = ""
for country in pycountry.countries:
    # Lookarounds instead of \b so names ending in punctuation still match.
    name_pattern = r"(?<!\w)" + re.escape(country.name.lower()) + r"(?!\w)"
    if re.search(name_pattern, header_text) and len(country.name) >= len(matched_country):
        matched_country = country.name
# Leave the form field untouched when no country name is found.
if matched_country:
    cioms_form["country"] = matched_country
# Find the age of the patient by searching the case report section of the case study for
# "<number>-year-old" (also accepting "<number> years old" style spellings) and capturing the number.
# The pattern is a raw string and requires at least one digit, so a spelled-out
# age such as "five-year-old" no longer produces an empty match.
age = re.findall(r"(\d+)[- ]years?[- ]old", data["case"])
# Use the first age mentioned; stay empty instead of failing when none is found.
ageYears = age[0] if age else ""
cioms_form["age_years"] = ageYears
# Placed general synonyms of male/female in 2 separate sets to extract the gender of the patient
males_set = {"male", "man", "gentleman", "boy", "guy", "gent", "bloke", "chap", "lad", "hombre"}
females_set = {"female", "woman", "lady", "girl", "colleen", "lass", "lassie"}
# Compare whole words rather than substrings: "man" must not match "woman" or
# "management", and "male" must not match "female". `data` is already
# lowercased, so runs of a-z are the words; punctuation and digits are dropped.
case_words = re.findall(r"[a-z]+", data["case"])
# The first gendered word in the case report decides; the field is left
# untouched when none is found.
for word in case_words:
    if word in males_set:
        cioms_form["sex"] = 'M'
        break
    if word in females_set:
        cioms_form["sex"] = 'F'
        break
cioms_form

{'patient_initials': '',
 'country': 'India',
 'date_of_birth': {'day': '', 'month': '', 'year': ''},
 'age_years': '29',
 'sex': 'M',
 'reaction_onset': {'day': '', 'month': '', 'year': ''},
 'reaction_description': 'RASMUSSEN ENCEPHALITIS (Rasmussen encephalitis (10071141), Rasmussen encephalitis (10071141))\n\nThis case, derived from a full text scientific literature study article, was received on\nThis case refers to a patient who experienced the event rasmussen encephalitiswhile on therapy with \nCase report: A 29-year-old male presented with 1 1/2 years of difficulty in perceiving the shape and texture of objects with his right hand associated with numbness. He developed problems with typing without looking at the keyboard. He had inability in performing complex calculations and word-finding and sentence construction errors. Three months after onset, he developed gradual right grasp weakness. Over the next six months, he developed right-left confusion hampering driving. One year 