In [14]:
# Python code to illustrate parsing of XML files
# importing the required modules
import csv
import requests
import xml.etree.ElementTree as ET
import os
import pandas as pd


def loadRSS():
    """Download the Hindustan Times top-news RSS feed to ``topnewsfeed.xml``.

    The response body is written verbatim (as bytes) to ``topnewsfeed.xml``
    in the current working directory, overwriting any existing file.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    # url of rss feed
    url = 'http://www.hindustantimes.com/rss/topnews/rssfeed.xml'

    # A timeout keeps a stalled server from hanging the notebook kernel.
    resp = requests.get(url, timeout=30)

    # Fail loudly on HTTP errors instead of saving an error page as the feed.
    resp.raise_for_status()

    # saving the xml file
    with open('topnewsfeed.xml', 'wb') as f:
        f.write(resp.content)


# Parse and get the Report text - single node. No need to iterate
def parseXML(xmlfile):
    """Return the text of the root's direct ``TEXT`` child in an XML document.

    Args:
        xmlfile: Anything ``xml.etree.ElementTree.parse`` accepts (a path or
            a file object).

    Returns:
        The ``.text`` of the first ``TEXT`` element directly under the root
        (``None`` when that element carries no text).

    Raises:
        AttributeError: if the root has no direct ``TEXT`` child.
    """
    document_root = ET.parse(xmlfile).getroot()

    # A single node is read, so there is nothing to iterate over.
    text_node = document_root.find('TEXT')
    return text_node.text


def savetoCSV(newsitems, filename):
    """Write rows to ``filename`` as CSV, preceded by a header line.

    Args:
        newsitems: Iterable of dicts keyed by ``'filename'`` and
            ``'report_text'``. With the ``csv.DictWriter`` defaults used
            here, a missing key is written as an empty cell and an
            unexpected key raises ``ValueError``.
        filename: Destination path; an existing file is overwritten.
    """
    # specifying the fields for csv file
    fields = ['filename', 'report_text']

    # newline='' lets the csv module control line endings itself. Without it,
    # Windows text mode turns the writer's '\r\n' into '\r\r\n', which shows
    # up as a blank row after every record.
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)

        # header first, then every data row
        writer.writeheader()
        writer.writerows(newsitems)


def get_report_text(filename):
    """Return the report text stored in the XML file ``filename``.

    Thin wrapper: the work is delegated to ``parseXML`` and its result is
    returned unchanged.
    """
    return parseXML(filename)

def split_reports(report, separator="****************************************************************************************************"):
    """Split a block of text into segments on a separator string.

    Args:
        report: The text to split.
        separator: The delimiter between segments. Defaults to the long run
            of asterisks this notebook has always split on, so existing
            one-argument calls behave exactly as before.

    Returns:
        A list of segments in their original order. The list always has at
        least one element; text that does not contain the separator comes
        back as a single-element list.

    Raises:
        ValueError: if ``separator`` is an empty string (``str.split``).
    """
    return report.split(separator)

def get_latest_report(full_reports):
    """Return the first segment of ``full_reports``.

    The text is split with ``split_reports`` and the segment before the
    first separator is returned. Treating that segment as the "latest"
    report presumably relies on the records being ordered newest-first;
    verify against the data.
    """
    return split_reports(full_reports)[0]


def get_report_from_xml_files(path, output_csv='full_report_all_files.csv'):
    """Extract the first report of every ``.xml`` file in a directory.

    Each matching file is read with ``get_report_text``, reduced to its
    first segment with ``get_latest_report``, and the results are saved as
    a two-column CSV (``filename``, ``report_text``).

    Args:
        path: Directory to scan; only entries ending in ``.xml`` are used.
        output_csv: Where the CSV is written. Defaults to
            ``'full_report_all_files.csv'`` in the current working
            directory, which is what this function has always produced.

    Returns:
        The extracted report texts as a list, in sorted filename order.
    """
    file_names = []
    all_reports = []

    # os.listdir returns entries in arbitrary order; sorting makes the CSV
    # rows and the returned list reproducible from run to run.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.xml'):
            continue
        fullname = os.path.join(path, filename)
        print(fullname)
        all_reports.append(get_latest_report(get_report_text(fullname)))
        file_names.append(filename)

    # Building the frame from named columns works for a directory with no
    # XML files as well (header-only CSV) and needs no renaming afterwards.
    out = pd.DataFrame({'filename': file_names, 'report_text': all_reports})
    out.to_csv(output_csv, index=False)

    return all_reports


In [15]:
# Folder holding the XML records to process (absolute, machine-specific path).
path = 'D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/'
# Bare last expression: the returned list of reports becomes the cell output.
get_report_from_xml_files(path=path)

D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/100.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/101.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/102.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/103.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/104.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/105.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/106.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/107.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/109.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/110.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/111.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/112.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/113.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/train/train/114.xml
D:/Work/DataSets_NLP/n2c2_medical/ClinicalTrial/

["\n\nRecord date: 2106-02-12\n\nCampbell Orthopedic Associates\n4 Madera Circle\nOmak, GA 28172\n \nHabib Valenzuela, M.D.\n \n \n                                             Valdez, Harlan Jr.  \n                                           845-41-54-4\n                                             February 12, 2106 \nHar is a 43 year old 6' 214 pound gentleman who is referred for\nconsultation by Dr. Harlan Oneil.  About a week ago he slipped on\nthe driveway at home and sustained an injury to his left ankle. \nHe was seen at Tri-City Hospital and was told he had a\nfracture.  He was placed in an air splint and advised to be\npartial weight bearing, and he is using a cane.  He is here for\nroutine follow-up. \nPast medical history is notable for no ankle injuries previously. \nHe has a history of diabetes and sleep apnea.  He takes Prozac,\nCardizem, Glucophage and Amaryl.  He is also followed by Dr. Harold\nNutter for an arrhythmia.  He does not smoke.  He drinks\nminimally.  He is a 