# Prerequisites

* Any Run in the TREC Format
* Pubmed XML collection: http://trec-cds.appspot.com/2018.html#documents
* 2018 Topics: http://trec-cds.appspot.com/topics2018.xml
* Extra Abstracts TXT collection: http://trec-cds.appspot.com/2018.html#documents

In [None]:
from os import listdir
from os.path import isfile, isdir, join
from lxml import etree
import pandas as pd
import tarfile
import gzip
import time
import csv
import csv
import re

# Decompress _.tar.gz_ Files

In [None]:
# Decompress Files
def decompress(myPath):
    """Extract every ``.tar.gz`` archive located directly inside ``myPath``.

    Each archive ``<name>.tar.gz`` is unpacked into the folder
    ``<myPath>/<name>``. Sub-directories of ``myPath`` are not searched.

    Args:
        myPath: Directory that contains the ``.tar.gz`` archives.
    """
    suffix = ".tar.gz"
    fileNames = [f for f in listdir(myPath) if isfile(join(myPath, f)) and f.endswith(suffix)]
    for fileName in fileNames:
        print("Extracting from: ", fileName)
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(join(myPath, fileName), "r:gz") as tar:
            # Target folder is the archive name without its ".tar.gz" suffix.
            tar.extractall(join(myPath, fileName[:-len(suffix)]))
        print("Done")
        
# Decompression step: only runs when this code is executed as the main module.
if __name__ == "__main__":
    # Path containing the medline_xml.part[x].tar.gz files (Pubmed XML collection)
    pubMedAbstracts = "../TREC/XML-Collection"

    # Unpack every .tar.gz archive into a folder named after the archive
    decompress(pubMedAbstracts)

# Extract Information from Pubmed XML and Extra Abstracts Text Files

In [None]:
# Extract the document ids (trec_doc_id column) from the tab-separated, header-less run file
def extractDocIDs(filePath):
    """Return the set of distinct document ids listed in a tab-separated file.

    The file is read without a header row as six named columns; the
    ``trec_doc_id`` column is loaded as ``object`` so the ids stay text.
    """
    columnNames = ["trec_topic_number", "x", "trec_doc_id", "order", "relevance", "name"]
    table = pd.read_csv(
        filePath,
        sep="\t",
        encoding="utf-8",
        names=columnNames,
        dtype={'trec_doc_id': object},
    )
    return set(table['trec_doc_id'])

# Get the name of the folders containing xml.gz files
def getFolderNames(myPath):
    """Return the names of the directories located directly inside ``myPath``."""
    dirNames = []
    for entry in listdir(myPath):
        if isdir(join(myPath, entry)):
            dirNames.append(entry)
    return dirNames

# Get the name of the xml.gz files
def getGzFileNames(myPath):
    """Return the names of the regular files in ``myPath`` that end in ``.xml.gz``."""
    return [
        name
        for name in listdir(myPath)
        if name.endswith(".xml.gz") and isfile(join(myPath, name))
    ]

def getTarFileNames(myPath):
    """Return the names of the regular files in ``myPath`` that end in ``.tar``."""
    tarNames = []
    for name in listdir(myPath):
        if not name.endswith(".tar"):
            continue
        if isfile(join(myPath, name)):
            tarNames.append(name)
    return tarNames

def unzipTar(folderPath, docIDsPath, targetFolder=''):
    """Extract from the ``.tar`` archives in ``folderPath`` the members whose id is wanted.

    A member's id is the part of its archive path between the first ``/`` and
    the last ``.``; members whose name has no such part are skipped. A member
    is extracted only when that id is in the id set read from ``docIDsPath``,
    and each id is extracted at most once.

    Args:
        folderPath: Directory containing the ``.tar`` archives.
        docIDsPath: Tab-separated file whose ``trec_doc_id`` column lists the
            wanted ids (read with ``extractDocIDs``).
        targetFolder: Optional sub-folder of ``folderPath`` to extract into;
            when empty, members are extracted into ``folderPath`` itself.
    """
    ids = extractDocIDs(docIDsPath)
    print("Gold Standard Ids:", len(ids))
    tarFiles = getTarFileNames(folderPath)
    print(tarFiles)

    outputPath = join(folderPath, targetFolder) if targetFolder else folderPath

    # Compiled once: the same pattern is applied to every archive member.
    idPattern = re.compile(r'\/(.*)\.')
    txtCounter = 0
    for tarFileName in tarFiles:
        print("Searching through:", tarFileName)
        # The context manager closes the archive even if an extraction fails.
        with tarfile.open(join(folderPath, tarFileName), 'r:') as tar:
            for txtFile in tar:
                # Extract ID from full path
                match = idPattern.search(txtFile.name)
                if match is None:
                    continue
                docID = match.group(1)
                if docID in ids:
                    txtCounter += 1
                    # Forget the id so the same document is not extracted twice.
                    ids.remove(docID)
                    tar.extract(txtFile, path=outputPath)
    print("Matched files:", txtCounter)
        
# Extract relevant information from the papers inside the XML files that match the gold-standard
def extractFeatures(folderPath, docIDsPath, outputPath):
    """Write title, abstract and MeSH terms of the wanted PubMed records to a TSV file.

    Every ``.xml.gz`` file inside each sub-folder of ``folderPath`` is parsed;
    for each ``PubmedArticle/MedlineCitation`` whose PMID is in the id set read
    from ``docIDsPath`` one row is written. Each PMID is written at most once.

    Args:
        folderPath: Directory whose sub-folders contain the ``.xml.gz`` files.
        docIDsPath: Tab-separated file whose ``trec_doc_id`` column lists the
            wanted ids (read with ``extractDocIDs``).
        outputPath: File to create (overwritten if present). It is
            tab-delimited with all fields quoted and has the header
            ``trec_doc_id, title, abstract, major_mesh, minor_mesh``.
    """
    st = time.time()

    # Get Pubmed Ids from the Run
    ids = extractDocIDs(docIDsPath)
    print("Nr of PMIDs in the Gold-Standard:", len(ids))

    # Recover the names of each folder containing xml.gz files
    folderNames = getFolderNames(folderPath)

    nrExtractedXMLs = 0

    # The output is opened once for the whole run; newline='' is what the csv
    # module expects so it controls the line endings itself.
    with open(outputPath, 'w', encoding='utf-8', newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        wr.writerow(["trec_doc_id","title","abstract","major_mesh","minor_mesh"])

        # Iterate through the folders with the xml.gz files
        for folderName in folderNames:
            print("Looking into files from folder: ", folderName)
            currentFolder = join(folderPath, folderName)
            for gzFileName in getGzFileNames(currentFolder):
                print("Analyzing information from: ", gzFileName)
                parser = etree.XMLParser(encoding='utf-8', recover=True)
                # Close the gzip stream as soon as the tree has been built.
                with gzip.open(join(currentFolder, gzFileName), 'rt', encoding='utf-8') as xmlFile:
                    pubMedArticleSet = etree.parse(xmlFile, parser=parser).getroot()
                for mc in pubMedArticleSet.iterfind('PubmedArticle/MedlineCitation'):
                    pmid = mc.find("PMID").text
                    # Verify if the PMID is in the list of IDs from the Run
                    if pmid not in ids:
                        continue
                    # Remove found pmid from ids set so it is only written once
                    ids.remove(pmid)

                    print("Extracting info from the PMID: ", pmid)
                    wr.writerow(_citationRow(mc, pmid))

                    # Count the number of extracted papers
                    nrExtractedXMLs += 1

    print("Number of papers with information extracted: ", nrExtractedXMLs)
    end = time.time()
    print("Run time: ", end-st)


def _citationRow(mc, pmid):
    """Build the row [pmid, title, abstract, major_mesh, minor_mesh] for one MedlineCitation."""
    # A missing title or abstract yields an empty field instead of reusing the
    # previous record's value or failing on an unassigned variable.
    titleElement = mc.find("Article/ArticleTitle")
    title = ''.join(titleElement.itertext()) if titleElement is not None else ''
    # Structured abstracts have several AbstractText pieces; join them with spaces.
    abstract = ' '.join(
        ''.join(abstractPiece.itertext())
        for abstractPiece in mc.findall("Article/Abstract/AbstractText")
    )
    majorMeshTerms, minorMeshTerms = _meshTerms(mc)
    return [pmid, title, abstract, ";".join(majorMeshTerms), ";".join(minorMeshTerms)]


def _meshTerms(mc):
    """Return the (major, minor) lists of MeSH terms of one MedlineCitation."""
    majorMeshTerms = []
    minorMeshTerms = []
    for meshTerm in mc.findall("MeshHeadingList/MeshHeading"):
        descriptor = meshTerm.find("DescriptorName")
        # With qualifiers, one "descriptor/qualifier" term is emitted per qualifier;
        # otherwise the bare descriptor is the term.
        terms = [descriptor.text + "/" + qualifier.text
                 for qualifier in meshTerm.findall("QualifierName")]
        if not terms:
            terms = [descriptor.text]
        # Major/minor is decided by the descriptor only - the qualifiers' own
        # major and minor attributes are not taken into account.
        if descriptor.get("MajorTopicYN") == "Y":
            majorMeshTerms.extend(terms)
        else:
            minorMeshTerms.extend(terms)
    return majorMeshTerms, minorMeshTerms
    
def extractExtraFeatures(extraAbstracts, extractedFeaturesFile):
    """Append id, title and abstract of every file in ``extraAbstracts`` to a TSV file.

    For each regular file the id is the file name without its last four
    characters (the extension, e.g. ``.txt``), the title is taken from the
    second line (the text after ``Title: `` when that prefix is present,
    otherwise the whole stripped line) and the abstract is made of all
    non-blank lines from the third line on, each followed by one space.

    Args:
        extraAbstracts: Directory containing the abstract text files.
        extractedFeaturesFile: Tab-delimited, fully quoted file the rows
            ``[id, title, abstract]`` are appended to.
    """
    files = [fi for fi in listdir(extraAbstracts) if isfile(join(extraAbstracts, fi))]
    titlePattern = re.compile(r'(Title:) (.*)')

    # Open the output once instead of once per input file; newline='' lets the
    # csv module control the line endings itself.
    with open(extractedFeaturesFile, 'a', encoding="utf8", newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        for fi in files:
            # The context manager closes each input file right after reading it.
            with open(join(extraAbstracts, fi), encoding="utf8") as fiObj:
                lines = fiObj.readlines()
            fId = fi[:-4]
            # Files with fewer than two lines get an empty title instead of failing.
            fullTitle = lines[1].strip() if len(lines) > 1 else ""
            titleMatch = titlePattern.search(fullTitle)
            # Without the "Title: " prefix fall back to the raw line.
            title = titleMatch.group(2) if titleMatch else fullTitle
            abstract = "".join(line.strip() + " " for line in lines[2:] if line.strip())
            wr.writerow([fId, title, abstract])
    print("Extracted files:", len(files))

In [None]:
# Path containing the Run File; it is passed to extractFeatures and unzipTar,
# which read the document ids to extract from it
docIDPath = "../results/runs/run.trec_results"

In [None]:
# Extraction step: only runs when this code is executed as the main module.
if __name__ == "__main__":
    # Path containing the medline_xml.partx folders - they need to be extracted first
    pubMedAbstracts = "../TREC/XML-Collection"

    # Output file (tab-delimited despite the .csv extension; overwritten by extractFeatures)
    outputPath = "../resources/relevant-abstracts-pubrun.csv"

    # Extract relevant information from the XML files
    extractFeatures(pubMedAbstracts, docIDPath, outputPath)

# Read Relevant Pubmed Abstracts Output

In [None]:
# Load the extracted abstracts (tab-separated); trec_doc_id is read as object so ids stay text
abstracts = pd.read_csv("../resources/relevant-abstracts-pubrun.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
# Preview the first five rows
abstracts.head(5)

# Extract and Read Extra Abstracts

In [None]:
# subprocess is needed for the gunzip call below and is not imported at the top of the file
import subprocess

# Path containing the downloaded txt collection (extra abstracts)
extraAbstractsPath = "../TXT-Collection/extra_abstracts"
# Path containing the extracted txt collection (extra abstracts)
extraAbstractsNewPath = join(extraAbstractsPath,"extra_abstracts")
# NOTE(review): getGzFileNames only returns names ending in ".xml.gz" - confirm the
# downloaded extra-abstracts archive is named that way, otherwise this loop does nothing.
abstractsGzFiles = getGzFileNames(extraAbstractsPath)
# Same file the Pubmed abstracts were written to; the extra abstracts are appended to it
extractedFeaturesFile = "../resources/relevant-abstracts-pubrun.csv"

for abstractsGzFile in abstractsGzFiles:
    print("Extracting: ", abstractsGzFile)
    # The archives live in extraAbstractsPath (the name used before was undefined)
    subprocess.call(['gunzip', '-d', join(extraAbstractsPath, abstractsGzFile)])
    print("Done")

# Pull the wanted documents out of the .tar files, then append their title/abstract rows
unzipTar(extraAbstractsPath, docIDPath)
extractExtraFeatures(extraAbstractsNewPath, extractedFeaturesFile)

In [None]:
# Reload the abstracts file, which now also holds the rows appended for the extra abstracts;
# trec_doc_id is read as object so ids stay text
abstracts = pd.read_csv("../resources/relevant-abstracts-pubrun.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
# Preview the first rows
abstracts.head()

# Read Run File

In [None]:
# Read the header-less, tab-separated run file. trec_doc_id is read as text
# (like trec_topic_number) so it has the same type as the trec_doc_id column of
# `abstracts`; otherwise an all-numeric run would be parsed as integers and the
# merge on that column would mix integer and text keys.
run = pd.read_csv("../results/runs/run.trec_results", sep='\t', encoding="utf-8", header=None,
                  names=["trec_topic_number", "x", "trec_doc_id", "order", "relevance_score", "run_name"],
                  dtype={'trec_topic_number': object, 'trec_doc_id': object})
run.head()

# Merge Run Info with Abstract, Title, etc.

In [None]:
# Left-join the abstract columns onto every run row by document id, then
# discard the "order" and "x" columns of the run.
abstractsRun = run.merge(abstracts, how='left', on='trec_doc_id')
abstractsRun = abstractsRun.drop(columns=["order", "x"])
abstractsRun.head(5)

In [None]:
# Distinct values of the run_name column; the first one is used when naming the output file
runName = run["run_name"].unique()
runName

# Read Information from Topics

In [None]:
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex']
# NOTE(review): this reads topics2017.xml while the prerequisites list the 2018 topics - confirm the file.
topicsXML = etree.parse("../resources/topics2017.xml")

# Collect one row per topic and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
topicRows = []
for topic in topicsXML.getroot():
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    # The demographic text is split on spaces: first token is the age, second the sex
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    topicRows.append([topicNumber, disease, age, sex])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
topics.head(1)

# Add topics Information

In [None]:
# Left-join the topic fields onto every run row by topic number and
# overwrite relevance_score with 0 on every row.
processedRun = abstractsRun.merge(topics, how='left', on='trec_topic_number').assign(relevance_score=0)
processedRun.tail()

# Save the Result into a new _.csv_

In [None]:
# Output name: <YYYYMMDD>processed<first run name>.tsv inside ../results/runs/
date = time.strftime("%Y%m%d")
processedRun.to_csv(
    sep='\t',
    index=False,
    path_or_buf="".join(('../results/runs/', date, 'processed', runName[0], '.tsv')),
)