# Prerequisites

* Annotations: https://drive.google.com/file/d/1IH4dL4OKG7bv57K8DreOeSAfJgkgC4sd/view
* Relevance score: http://www.trec-cds.org/qrels-treceval-abstracts.2017.txt
* Pubmed XML collection: http://trec-cds.appspot.com/2018.html#documents
* 2017 Topics: http://trec-cds.appspot.com/topics2017.xml
* Extra abstracts (ASCO and AACR), extracted from the TREC TXT files; see [PreProcess_GoldStandard_TXT](PreProcess_GoldStandard_TXT.ipynb)

In [None]:
from os import listdir
from os.path import isfile, isdir, join
from lxml import etree
import pandas as pd
import tarfile
import gzip
import time
import csv

# Decompress _.tar.gz_ Files

In [None]:
# Decompress Files
def decompress(myPath):
    """Extract every ``.tar.gz`` archive located directly inside ``myPath``.

    Each archive ``<name>.tar.gz`` is unpacked into the directory
    ``<myPath>/<name>``. Entries that are not regular files, or whose name
    does not end in ``.tar.gz``, are ignored.

    Args:
        myPath: Directory that contains the ``.tar.gz`` archives.
    """
    suffix = ".tar.gz"
    fileNames = [f for f in listdir(myPath) if isfile(join(myPath, f)) and f.endswith(suffix)]
    for file in fileNames:
        print("Extracting from: ", file)
        # The context manager closes the archive even when extraction fails
        with tarfile.open(join(myPath, file), "r:gz") as tar:
            # NOTE: extractall() trusts the member paths stored in the archive;
            # only run this on archives from a trusted source.
            tar.extractall(join(myPath, file[:-len(suffix)]))
        print("Done")
        
# Driver for the decompression step: unpacks all archives of the Pubmed XML collection.
if __name__ == "__main__":
    # Directory holding the medline_xml.part[x].tar.gz files (Pubmed XML collection)
    pubMedAbstracts = "../TREC/XML-Collection"

    # Unpack each archive into a folder named after it, next to the archive
    decompress(pubMedAbstracts)

# Extract Information from Pubmed XML Files

In [None]:
# Extract Pubmed Ids from the Gold-Standard CSV file
def extractDocIDs(filePath):
    """Return the set of document ids listed in the gold-standard CSV.

    The ``trec_doc_id`` column is read as text so that purely numeric ids
    (PMIDs) are not converted to integers; the ids are later compared with
    the PMID text taken from the XML files, which is always a string.

    Args:
        filePath: Path of the gold-standard CSV; it must contain a
            ``trec_doc_id`` column.

    Returns:
        A set of id strings. Rows with an empty id are ignored.
    """
    goldStandard = pd.read_csv(filePath, dtype={'trec_doc_id': str})
    return set(goldStandard['trec_doc_id'].dropna())

# Get the name of the folders containing xml.gz files
def getFolderNames(myPath):
    """Return the names of the sub-directories located directly in ``myPath``."""
    subFolders = []
    for entry in listdir(myPath):
        # Keep directories only; regular files are skipped
        if isdir(join(myPath, entry)):
            subFolders.append(entry)
    return subFolders

# Get the name of the xml.gz files
def getGzFileNames(myPath):
    """Return the names of the regular files in ``myPath`` ending in ``.xml.gz``."""
    return [
        name
        for name in listdir(myPath)
        if name.endswith(".xml.gz") and isfile(join(myPath, name))
    ]
        
# Extract relevant information from the papers inside the XML files that match the gold-standard
def _collectMeshTerms(medlineCitation):
    """Split the MeSH headings of one MedlineCitation into major and minor terms.

    A heading without qualifiers contributes its descriptor name; a heading
    with qualifiers contributes one ``descriptor/qualifier`` entry per
    qualifier. Major vs. minor is decided by the descriptor's
    ``MajorTopicYN`` attribute only; the attribute on qualifiers is ignored.

    Returns:
        A ``(majorMeshTerms, minorMeshTerms)`` tuple of lists of strings.
    """
    majorMeshTerms = []
    minorMeshTerms = []
    for meshTerm in medlineCitation.findall("MeshHeadingList/MeshHeading"):
        descriptor = meshTerm.find("DescriptorName")
        qualifiers = [descriptor.text + "/" + qualifier.text
                      for qualifier in meshTerm.findall("QualifierName")]
        terms = qualifiers if qualifiers else [descriptor.text]
        if descriptor.get("MajorTopicYN") == "Y":
            majorMeshTerms.extend(terms)
        else:
            minorMeshTerms.extend(terms)
    return majorMeshTerms, minorMeshTerms


def extractFeatures(folderPath, docIDsPath, outputPath):
    """Export title, abstract and MeSH terms of the gold-standard papers to a CSV.

    Every ``.xml.gz`` file found in the sub-folders of ``folderPath`` is
    parsed, and each ``PubmedArticle/MedlineCitation`` whose PMID is listed in
    the gold standard is written as one row of the output file. A PMID is
    exported at most once (its first occurrence). Progress and the total run
    time are printed.

    Args:
        folderPath: Directory whose sub-folders contain the ``.xml.gz`` files.
        docIDsPath: Path of the gold-standard CSV (needs a ``trec_doc_id`` column).
        outputPath: Path of the output file. It is overwritten and written as
            tab-separated, fully quoted UTF-8 with the header
            ``trec_doc_id, title, abstract, major_mesh, minor_mesh``.
    """
    st = time.time()

    # Get Pubmed Ids from the Gold-Standard
    ids = extractDocIDs(docIDsPath)
    print("Nr of PMIDs in the Gold-Standard:", len(ids))
    # Recover the names of each folder containing xml.gz files
    folderNames = getFolderNames(folderPath)

    nrExtractedXMLs = 0

    # The output stays open for the whole run instead of being reopened per
    # row; newline='' lets the csv module control the line endings itself.
    with open(outputPath, 'w', encoding='utf-8', newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        wr.writerow(["trec_doc_id","title","abstract","major_mesh","minor_mesh"])

        # Iterate through the folders with the xml.gz files
        for folderName in folderNames:
            print("Looking into files from folder: ", folderName)
            currentFolder = join(folderPath, folderName)
            for gzFileName in getGzFileNames(currentFolder):
                print("Analyzing information from: ", gzFileName)
                gzFilePath = join(currentFolder, gzFileName)
                parser = etree.XMLParser(encoding='utf-8', recover=True)
                # Close the compressed file as soon as the tree is built
                with gzip.open(gzFilePath, 'rt', encoding='utf-8') as xmlFile:
                    pubMedArticleSet = etree.parse(xmlFile, parser=parser).getroot()

                for mc in pubMedArticleSet.iterfind('PubmedArticle/MedlineCitation'):
                    pmidNode = mc.find("PMID")
                    # Skip citations without a PMID or not in the gold standard
                    if pmidNode is None or pmidNode.text not in ids:
                        continue
                    pmid = pmidNode.text
                    # Remove found pmid from ids set so it is exported only once
                    ids.remove(pmid)

                    print("Extracting info from the PMID: ", pmid)
                    # Title and abstract default to an empty string, so a
                    # paper lacking one never inherits the previous paper's text
                    titleNode = mc.find("Article/ArticleTitle")
                    title = ''.join(titleNode.itertext()) if titleNode is not None else ''
                    # Structured abstracts have several AbstractText pieces
                    abstract = ' '.join(
                        ''.join(abstractPiece.itertext())
                        for abstractPiece in mc.findall("Article/Abstract/AbstractText")
                    )

                    majorMeshTerms, minorMeshTerms = _collectMeshTerms(mc)
                    majorMeshList = ";".join(majorMeshTerms)
                    minorMeshList = ";".join(minorMeshTerms)

                    # Write the result to CSV
                    wr.writerow([pmid, title, abstract, majorMeshList, minorMeshList])

                    # Count the number of extracted papers
                    nrExtractedXMLs += 1

    print("Number of papers with information extracted: ", nrExtractedXMLs)
    end = time.time()
    print("Run time: ", end-st)

# Driver for the extraction step: builds the CSV of gold-standard abstracts from the XML collection.
if __name__ == "__main__":
    # Path containing the medline_xml.partx folders - they need to be extracted first
    pubMedAbstracts = "../TREC/XML-Collection"
    # Path containing the Annotated Gold-Standard File
    # NOTE(review): the annotation cell further down reads this file from
    # "../resources/" instead of "../TREC/" - confirm which location is intended.
    docIDPath = "../TREC/gold-docs-annotations.csv"
    # Output file (tab-separated; read back by the "Read Relevant Pubmed Abstracts Output" cell)
    outputPath = "../resources/relevant-abstracts-from-XML.csv"
    
    # Extract relevant information from the XML files
    extractFeatures(pubMedAbstracts, docIDPath, outputPath)

# Read Relevant Pubmed Abstracts Output

In [None]:
# Load the tab-separated abstracts file produced by extractFeatures.
# trec_doc_id is kept as object so the ids are not converted to numbers.
abstracts = pd.read_csv("../resources/relevant-abstracts-from-XML.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
# Display (rows, columns) as the cell output
abstracts.shape

# Read Extra Abstracts

In [None]:
# Load the abstracts extracted from the TXT files; the file has no header row,
# so the three column names are supplied here.
txtAbstracts = pd.read_csv("../resources/relevant-abstracts.csv", sep='\t', header=None, names=["trec_doc_id", "title", "abstract"], dtype={'trec_doc_id':object})
# Keep only the rows whose id contains "ASCO" or "AACR"
extraAbstracts = txtAbstracts[txtAbstracts['trec_doc_id'].str.contains('ASCO|AACR', regex=True)]
# Display (rows, columns) as the cell output
extraAbstracts.shape

# Merge Pubmed with Extra Abstracts

In [None]:
# Stack the Pubmed rows and the extra rows. extraAbstracts is built without
# MeSH columns, so those columns are missing (NaN) for the extra rows.
# sort=False leaves the column order unsorted.
fullAbstracts = pd.concat([abstracts, extraAbstracts], sort=False)
# Display (rows, columns) as the cell output
fullAbstracts.shape

# Read Gold-Standard Annotation

In [None]:
# Load the comma-separated gold-standard annotations.
# trec_topic_number is kept as object so it is not converted to a number.
annotations = pd.read_csv("../resources/gold-docs-annotations.csv", sep=',', encoding="utf-8", dtype={'trec_topic_number':object})
# Display (rows, columns) as the cell output
annotations.shape

# Read Relevance Score

In [None]:
# Load the space-separated relevance file; it has no header row, so the column
# names are supplied here. Column "x" is a placeholder name for the second
# field, which is dropped again after the merge with the annotations.
# trec_topic_number is kept as object so it is not converted to a number.
relevance = pd.read_csv("../resources/qrels-treceval-abstracts.2017.txt", sep=' ', encoding="utf-8", header=None, 
                        names=["trec_topic_number", "x", "trec_doc_id", "relevance_score"], dtype={'trec_topic_number':object})
# Display (rows, columns) as the cell output
relevance.shape

# Read Information from Topics

## 2017

In [None]:
# Build one row per 2017 topic: number, disease, age, sex and up to three
# comma-separated "other" entries.
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex', 'trec_topic_other1', 'trec_topic_other2', 'trec_topic_other3']
topicsXML = etree.parse("../resources/topics2017.xml")
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
topicRows = []
for topic in topicsXML.getroot():
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    # The demographic text is split on spaces: first token is the age, second the sex
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    other = topic.find('other').text.split(',')
    other1 = other[0]
    # The second entry is kept whenever it exists, including for topics that
    # also have a third one (the previous "== 2" test dropped it in that case).
    other2 = other[1] if len(other) >= 2 else None
    other3 = other[2] if len(other) > 2 else None
    topicRows.append([topicNumber, disease, age, sex, other1, other2, other3])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
# Display (rows, columns) as the cell output
topics.shape

## 2018

In [None]:
# Build one row per 2018 topic: number, disease, age and sex.
# NOTE: this cell assigns the same `topics` variable as the 2017 cell, so the
# later merge uses whichever of the two cells was executed last.
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex']
topicsXML = etree.parse("../resources/topics2018.xml")
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
topicRows = []
for topic in topicsXML.getroot():
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    # The demographic text is split on spaces: first token is the age, second the sex
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    topicRows.append([topicNumber, disease, age, sex])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
# Display (rows, columns) as the cell output
topics.shape

# Merge Relevance Score and Annotations

In [None]:
# Left-join the relevance scores onto the annotations by topic number and
# document id, then discard the placeholder column "x".
annotationsRelevance = annotations.merge(
    relevance,
    how='left',
    on=['trec_topic_number', 'trec_doc_id'],
).drop(columns=["x"])
annotationsRelevance.shape

# Merge Abstracts with Relevance Score and Annotations

In [None]:
# Left-join title, abstract and MeSH columns onto every annotated document
processedAbstracts = pd.merge(
    annotationsRelevance,
    fullAbstracts,
    how='left',
    on='trec_doc_id',
)
processedAbstracts.shape

# Add Topics Information (uses whichever `topics` cell, 2017 or 2018, was run last)

In [None]:
# Left-join the topic columns onto every row by topic number
processedGoldStandard = pd.merge(
    processedAbstracts,
    topics,
    how='left',
    on='trec_topic_number',
)
processedGoldStandard.shape

# Save the Result into a new _.csv_

In [None]:
# Today's date as YYYYMMDD, used as a prefix of the output file name
date = time.strftime("%Y%m%d")
# Write the merged table as a tab-separated file. The date is joined to the
# name without a separator, e.g. "<YYYYMMDD>processed-goldstandard.csv".
processedGoldStandard.to_csv(path_or_buf='../resources/'+ date + 'processed-goldstandard.csv', sep='\t')