# Prerequisites

* Annotations: https://drive.google.com/open?id=19ILZxJIkSnOI5a93TyIjyeUATDQn61X5
* Relevance score: http://www.trec-cds.org/qrels-treceval-clinical_trials.2017.txt
* Clinical Trials XML collection: http://trec-cds.appspot.com/2018.html#documents
* 2017 Topics: http://trec-cds.appspot.com/topics2017.xml

In [None]:
from os import listdir
from os.path import isfile, isdir, join
from lxml import etree
import pandas as pd
import tarfile
import gzip
import time
import csv
import re

In [None]:
# Decompress Files
def decompress(myPath):
    """Extract every ``.tar.gz`` archive found directly inside ``myPath``.

    Each archive ``<name>.tar.gz`` is unpacked into the directory
    ``<name>`` under ``myPath``. Sub-directories of ``myPath`` are not
    searched for archives.

    NOTE(review): ``extractall`` writes whatever member paths the archive
    declares, so this should only be run on archives from a trusted source.
    """
    suffix = ".tar.gz"
    archiveNames = [f for f in listdir(myPath)
                    if isfile(join(myPath, f)) and f.endswith(suffix)]
    for archiveName in archiveNames:
        print("Extracting from: ", archiveName)
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(join(myPath, archiveName), "r:gz") as tar:
            tar.extractall(join(myPath, archiveName[:-len(suffix)]))
        print("Done")
        
if __name__ == "__main__":
    # Directory that holds the downloaded .tar.gz archives of the XML collection.
    # NOTE(review): this is an absolute path, whereas the extraction step further
    # down the file uses a relative "TREC/..." path -- confirm which is intended.
    clinicalTrials = "/TREC/XML-Collection/clinical-trials"

    # Unpack every archive found in that directory.
    decompress(myPath=clinicalTrials)

In [None]:
# Extract Ids from the Gold-Standard CSV file
def extractDocIDs(filePath):
    """Return the distinct values of the ``trec_doc_id`` column of a CSV file.

    The file at ``filePath`` is read with pandas' default CSV settings and
    the column is collapsed into a ``set``.
    """
    goldStandard = pd.read_csv(filePath)
    idColumn = goldStandard['trec_doc_id']
    return set(idColumn)

def getFolderNames(myPath):
    """Return the names of the directories located directly inside ``myPath``."""
    dirNames = []
    for entry in listdir(myPath):
        if isdir(join(myPath, entry)):
            dirNames.append(entry)
    return dirNames

# Get the name of the xml files
def getFileNames(myPath):
    """Return the names of regular files directly inside ``myPath`` ending in ``.xml``."""
    xmlNames = []
    for entry in listdir(myPath):
        if not isfile(join(myPath, entry)):
            continue
        if entry.endswith(".xml"):
            xmlNames.append(entry)
    return xmlNames
        
# Extract relevant information from the XML files that match the gold-standard

# Header of the output TSV; every data row is written in this column order.
_OUTPUT_COLUMNS = ["trec_doc_id", "brief_title", "official_title", "brief_summary",
                   "detailed_description", "phase", "study_type", "study_design_info",
                   "outcomes", "conditions", "arm_groups", "drug_interventions",
                   "other_interventions", "inclusion_criteria",
                   "mesh_terms_conditions", "mesh_terms_interventions"]

# Group 2 captures the text up to the first exclusion marker (or the end of
# the text), after an optional leading "Inclusion Criteria" heading.
_INCLUSION_CRITERIA_RE = re.compile(
    r"([Ii]nclusion [Cc]riteria)?(.+?)([Ee]xclusion [Cc]riteria|[Ee]xcluded if|$)")


def _nodeText(node):
    """Return all text below ``node`` concatenated, or "" when ``node`` is None."""
    if node is None:
        return ""
    return ''.join(node.itertext())


def _cleanText(node, dropTabs=True):
    """Return the stripped text of ``node`` with newlines (and optionally tabs) removed."""
    text = _nodeText(node).strip().replace("\n", "")
    if dropTabs:
        text = text.replace("\t", "")
    return text


def _joinCleaned(nodes, separator):
    """Join the cleaned text of each node in ``nodes`` with ``separator``."""
    return separator.join(_cleanText(node) for node in nodes)


def _splitInterventions(ct):
    """Return ``(drug_interventions, other_interventions)`` as ";"-joined strings.

    An intervention whose ``intervention_type`` text is exactly "Drug" goes to
    the first string; every other intervention (including one without a type
    element) goes to the second. Names and ``other_name`` aliases with no text
    are skipped instead of raising.
    """
    drugNames = []
    otherNames = []
    for intervention in ct.findall("intervention"):
        typeNode = intervention.find('intervention_type')
        isDrug = typeNode is not None and typeNode.text == "Drug"
        target = drugNames if isDrug else otherNames
        nameNode = intervention.find('intervention_name')
        if nameNode is not None and nameNode.text is not None:
            target.append(nameNode.text)
        for otherName in intervention.findall('other_name'):
            if otherName.text is not None:
                target.append(otherName.text.strip().replace("\n", "").replace("\t", ""))
    return ';'.join(drugNames), ';'.join(otherNames)


def _inclusionCriteria(ct):
    """Return the inclusion part of each ``eligibility/criteria`` text, ";"-joined."""
    found = []
    for criteriaNode in ct.findall("eligibility/criteria"):
        match = _INCLUSION_CRITERIA_RE.search(_cleanText(criteriaNode))
        if match:
            found.append(match.group(2))
    return ';'.join(found)


def _buildTrialRow(ct, docId):
    """Build one output row (ordered like ``_OUTPUT_COLUMNS``) from a trial root element.

    Every field is computed from ``ct`` alone, so an element missing from the
    document yields an empty string rather than a value left over from a
    previously processed trial.
    """
    outcomeNodes = ct.findall("primary_outcome") + ct.findall("secondary_outcome")
    drugInterventions, otherInterventions = _splitInterventions(ct)
    return [
        docId,
        _nodeText(ct.find("brief_title")),
        _nodeText(ct.find("official_title")),
        _joinCleaned(ct.findall("brief_summary/textblock"), ' '),
        _joinCleaned(ct.findall("detailed_description/textblock"), ' '),
        _nodeText(ct.find("phase")),
        _nodeText(ct.find("study_type")),
        # Tabs are left in place for this field, only newlines are removed.
        _cleanText(ct.find("study_design_info"), dropTabs=False),
        _joinCleaned(outcomeNodes, ';'),
        _joinCleaned(ct.findall("condition"), ';'),
        _joinCleaned(ct.findall("arm_group"), ';'),
        drugInterventions,
        otherInterventions,
        _inclusionCriteria(ct),
        _joinCleaned(ct.findall("condition_browse/mesh_term"), ';'),
        _joinCleaned(ct.findall("intervention_browse/mesh_term"), ';'),
    ]


def extractFeatures(folderPath, docIDsPath, outputPath):
    """Write a TSV with selected fields of every gold-standard clinical trial.

    Walks ``folderPath/<folder>/<subfolder>/*.xml``. For each file whose name
    (without ``.xml``) is one of the ids returned by ``extractDocIDs`` for
    ``docIDsPath``, the XML is parsed and one fully quoted, tab-separated row
    is written to ``outputPath``. The output file is overwritten and starts
    with a header row. Progress, the number of extracted files and the run
    time are printed.

    Args:
        folderPath: root directory of the two-level XML collection.
        docIDsPath: CSV file with a ``trec_doc_id`` column.
        outputPath: destination TSV file.
    """
    st = time.time()

    # Get Ids from the Gold-Standard
    ids = extractDocIDs(docIDsPath)
    print("Nr of IDs in the Gold-Standard:", len(ids))
    # Recover the names of the first-level folders of the collection
    folderNames = getFolderNames(folderPath)
    print("Internal folders: ", len(folderNames))
    nrExtractedXMLs = 0

    parser = etree.XMLParser(encoding='utf-8', recover=True)

    # The output stays open for the whole run instead of being reopened per
    # row; newline='' is what the csv module documentation asks for so that
    # the writer controls line endings itself.
    with open(outputPath, 'w', encoding='utf-8', newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        wr.writerow(_OUTPUT_COLUMNS)

        for folderName in folderNames:
            print("Looking into files from folder: ", folderName)
            firstLevelPath = join(folderPath, folderName)
            for secondLevelFolder in getFolderNames(firstLevelPath):
                print("Analyzing information from folder: ", secondLevelFolder)
                secondLevelPath = join(firstLevelPath, secondLevelFolder)
                for fileName in getFileNames(secondLevelPath):
                    docId = fileName[:-4]
                    if docId not in ids:
                        continue
                    print("Extracting info from the file: ", fileName)
                    # Close the XML file as soon as it has been parsed.
                    with open(join(secondLevelPath, fileName), 'rt', encoding='utf-8') as xmlFile:
                        ct = etree.parse(xmlFile, parser=parser).getroot()
                    wr.writerow(_buildTrialRow(ct, docId))
                    # Count the number of extracted papers
                    nrExtractedXMLs += 1

    print("Number of papers with information extracted: ", nrExtractedXMLs)
    end = time.time()
    print("Run time: ", end-st)

if __name__ == "__main__":
    # Root folder of the decompressed XML collection
    clinicalTrialsPath = "TREC/XML-Collection/clinical-trials/clinicaltrials_xml/clinicaltrials_xml"
    # Annotated gold-standard CSV (provides the trial ids to keep)
    docIDPath = "../resources/clinical_trials.judgments.2017.csv"
    # TSV that receives the extracted fields
    outputPath = "../resources/relevant-clinical-trials-XML.tsv"

    # Run the extraction over the whole collection
    extractFeatures(
        folderPath=clinicalTrialsPath,
        docIDsPath=docIDPath,
        outputPath=outputPath,
    )

# Read GS Clinical Trials

In [None]:
# Load the TSV of extracted trial fields; trec_doc_id is read with object dtype.
clinicalTrials = pd.read_csv(
    "../resources/relevant-clinical-trials-XML.tsv",
    sep='\t',
    encoding="utf-8",
    dtype={'trec_doc_id': object},
)
clinicalTrials.shape

# Read Annotations

In [None]:
# Load the annotation CSV; trec_topic_number is read with object dtype.
annotations = pd.read_csv(
    "../resources/clinical_trials.judgments.2017.csv",
    sep=',',
    encoding="utf-8",
    dtype={'trec_topic_number': object},
)
annotations.shape

# Read Relevance Score

In [None]:
# Load the space-separated, header-less relevance file and name its four columns;
# trec_topic_number is read with object dtype.
relevance = pd.read_csv(
    "../resources/qrels-treceval-clinical_trials.2017.txt",
    sep=' ',
    encoding="utf-8",
    header=None,
    names=["trec_topic_number", "x", "trec_doc_id", "relevance_score"],
    dtype={'trec_topic_number': object},
)
relevance.shape

# Read Topics

## 2017

In [None]:
# Build one row per <topic> of the 2017 topics file.
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex', 'trec_topic_other1', 'trec_topic_other2', 'trec_topic_other3']
topicsXML = etree.parse("../resources/topics2017.xml")
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
topicRows = []
for topic in topicsXML.getroot():
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    # The demographic text is split on single spaces: first token -> age, second -> sex.
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    # Up to three comma-separated "other" entries are kept; missing ones stay None.
    other = topic.find('other').text.split(',')
    other1 = other[0]
    # other2 is filled whenever a second entry exists (previously it was left
    # as None for topics with three or more entries).
    other2 = other[1] if len(other) > 1 else None
    other3 = other[2] if len(other) > 2 else None
    topicRows.append([topicNumber, disease, age, sex, other1, other2, other3])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
topics.shape

## 2018

In [None]:
# Build one row per <topic> of the 2018 topics file.
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex']
topicsXML = etree.parse("../resources/topics2018.xml")
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
topicRows = []
for topic in topicsXML.getroot():
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    # The demographic text is split on single spaces: first token -> age, second -> sex.
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    topicRows.append([topicNumber, disease, age, sex])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
topics.shape

# Merge Annotations and Relevance

In [None]:
# Left-join the relevance scores onto the annotations by topic number and
# document id, then discard the unused "x" column of the relevance file.
annotationsRelevance = annotations.merge(
    relevance,
    how='left',
    left_on=['trec_topic_number', 'trec_doc_id'],
    right_on=['trec_topic_number', 'trec_doc_id'],
).drop(["x"], axis=1)
annotationsRelevance.shape

# Merge Annotations and Relevance with CT from the GS

In [None]:
# Left-join the extracted trial fields onto each annotated (topic, document) row.
processedClinicalTrials = annotationsRelevance.merge(
    clinicalTrials,
    how='left',
    left_on=['trec_doc_id'],
    right_on=['trec_doc_id'],
)
processedClinicalTrials.shape

# Add 2017 Topics Information

In [None]:
# Left-join the topic fields onto every row by topic number.
processedGoldStandard = processedClinicalTrials.merge(
    topics,
    how='left',
    left_on=['trec_topic_number'],
    right_on=['trec_topic_number'],
)
processedGoldStandard.shape

# Save the Output

In [None]:
# Prefix the output file name with today's date (YYYYMMDD) and write a TSV.
date = time.strftime("%Y%m%d")
processedGoldStandard.to_csv(
    '../resources/' + date + 'processed-goldstandard-CT.tsv',
    sep='\t',
)