# Prerequisites

* Annotations: https://drive.google.com/file/d/1IH4dL4OKG7bv57K8DreOeSAfJgkgC4sd/view
* Relevance score: http://www.trec-cds.org/qrels-treceval-abstracts.2017.txt
* PubMed TXT collections: http://trec-cds.appspot.com/2018.html#documents
* 2017 Topics: http://trec-cds.appspot.com/topics2017.xml

In [None]:
from os import listdir, walk
from os.path import isfile, isdir, join, dirname
from collections import OrderedDict
from lxml import etree
import pandas as pd
import subprocess
import tarfile
import time
import csv
import re

In [None]:
def getGzFileNames(myPath):
    """Return the names of the regular files directly inside ``myPath`` ending in ``.gz``.

    Only bare file names (not full paths) are returned, in the order in which
    ``listdir`` yields them. Directories are ignored even if their name ends
    in ``.gz``.
    """
    gzNames = []
    for entry in listdir(myPath):
        if entry.endswith(".gz") and isfile(join(myPath, entry)):
            gzNames.append(entry)
    return gzNames

def decompressTarGz(myPath):
    """Unpack every ``.tar.gz`` archive found directly inside ``myPath``.

    Each archive ``<name>.tar.gz`` is extracted into the directory
    ``<myPath>/<name>`` (the archive name with the ``.tar.gz`` suffix removed).
    Progress messages are printed to stdout; nothing is returned.
    """
    archiveNames = [f for f in listdir(myPath) if isfile(join(myPath, f)) and f.endswith(".tar.gz")]
    for archiveName in archiveNames:
        print("Extracting from:", archiveName)
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(join(myPath, archiveName), "r:gz") as tar:
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; only run this on archives from a trusted source.
            tar.extractall(join(myPath, archiveName[:-7]))
        print("Done")
        
# Extract Ids from the Gold-Standard CSV File
def extractDocIDs(filePath):
    """Return the set of document IDs listed in the gold-standard CSV.

    Args:
        filePath: Path of a CSV file that has a ``trec_doc_id`` column.

    Returns:
        set: The distinct values of the ``trec_doc_id`` column, read as strings.
    """
    # Force string IDs: a purely numeric column would otherwise be parsed as
    # integers (dropping leading zeros) and could never equal the string IDs
    # that unzipTar parses out of archive member names.
    goldDocs = pd.read_csv(filePath, dtype={'trec_doc_id': str})
    return set(goldDocs['trec_doc_id'])

def unzipTar(folderPath, docIDsPath, targetFolder=''):
    """Extract gold-standard documents from the ``.tar`` archives in ``folderPath``.

    Every ``.tar`` file directly inside ``folderPath`` is scanned. A member is
    extracted when the text between the first ``/`` and the last ``.`` of its
    name is one of the gold-standard document IDs. Each ID is extracted at
    most once: it is removed from the pending set after its first match.

    Args:
        folderPath: Directory containing the ``.tar`` archives.
        docIDsPath: Path of the gold-standard CSV passed to ``extractDocIDs``.
        targetFolder: Optional sub-folder of ``folderPath`` to extract into.
            When empty, members are extracted into ``folderPath`` itself.
    """
    ids = extractDocIDs(docIDsPath)
    print("Gold Standard Ids:", len(ids))
    tarFiles = getTarFileNames(folderPath)

    outputPath = join(folderPath, targetFolder) if targetFolder else folderPath

    # Compiled once instead of on every archive member.
    idPattern = re.compile(r'\/(.*)\.')
    txtCounter = 0
    for tarFileName in tarFiles:
        print("Searching through:", tarFileName)
        # The context manager closes the archive even if an extraction fails.
        with tarfile.open(join(folderPath, tarFileName), 'r:') as tar:
            for txtFile in tar:
                # Extract ID from full path
                docID = idPattern.search(txtFile.name)
                if not docID or docID.group(1) not in ids:
                    continue
                # Extract file only when there is a match
                txtCounter += 1
                ids.remove(docID.group(1))
                tar.extract(txtFile, path=outputPath)
    print("Matched files:", txtCounter)

def getTarFileNames(myPath):
    """Return the names of the regular files directly inside ``myPath`` ending in ``.tar``.

    Only bare file names (not full paths) are returned, in the order in which
    ``listdir`` yields them. Directories are ignored.
    """
    tarNames = []
    for entry in listdir(myPath):
        if entry.endswith(".tar") and isfile(join(myPath, entry)):
            tarNames.append(entry)
    return tarNames

def extractFeatures(folderPath, extractName):
    """Write the ID, title and abstract of every extracted document to a TSV file.

    ``folderPath`` is scanned for sub-folders, and every regular file inside
    each sub-folder becomes one output row. The first line of a file is its
    title; the remaining non-blank lines are stripped and concatenated (each
    followed by a single space) into the abstract. The ID is the file name
    without its last four characters (e.g. the ``.txt`` suffix). An empty
    file yields an empty title and abstract.

    Args:
        folderPath: Directory whose sub-folders contain the text files.
        extractName: Path of the output file. It is overwritten and receives
            a header row (``id``, ``title``, ``abstract``) followed by one
            tab-delimited, fully quoted row per document.
    """
    st = time.time()

    folders = [fo for fo in listdir(folderPath) if isdir(join(folderPath, fo))]
    fCount = 0
    # Open the output once (instead of once per document) with an explicit
    # encoding; newline='' lets the csv module control line endings itself.
    with open(extractName, 'w', encoding="utf8", newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        wr.writerow(["id", "title", "abstract"])
        for fo in folders:
            print("fo: ", fo)
            foPath = join(folderPath, fo)
            filesInFo = [fi for fi in listdir(foPath) if isfile(join(foPath, fi))]
            for fi in filesInFo:
                fCount += 1
                # Close each input promptly; large collections would otherwise
                # keep many file handles open.
                with open(join(foPath, fi), encoding="utf8") as fiObj:
                    lines = fiObj.readlines()
                title = lines[0].strip() if lines else ""
                abstract = "".join(line.strip() + " " for line in lines[1:] if line.strip())
                wr.writerow([fi[:-4], title, abstract])
    print("Extracted files:", fCount)

    end = time.time()
    print("time: ", end-st)

def extractExtraFeatures(extraAbstracts, extractedFeaturesFile):
    """Append the ID, title and abstract of every extra abstract to a TSV file.

    Every regular file directly inside ``extraAbstracts`` becomes one row.
    The title is taken from the second line of the file: the text following
    ``Title: `` when that prefix is present, otherwise the whole stripped
    line. The non-blank lines from the third line onwards are stripped and
    concatenated (each followed by a single space) into the abstract. The ID
    is the file name without its last four characters (e.g. ``.txt``).

    Args:
        extraAbstracts: Directory containing the extra-abstract text files.
        extractedFeaturesFile: Path of the output file, opened in append mode.
            Rows are tab-delimited and fully quoted; no header is written.
    """
    files = [fi for fi in listdir(extraAbstracts) if isfile(join(extraAbstracts, fi))]
    titlePattern = re.compile(r'(Title:) (.*)')
    fCount = 0
    # Open the output once instead of once per document; newline='' lets the
    # csv module control line endings itself.
    with open(extractedFeaturesFile, 'a', encoding="utf8", newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        for fi in files:
            fCount += 1
            with open(join(extraAbstracts, fi), encoding="utf8") as fiObj:
                lines = fiObj.readlines()
            # Files shorter than two lines have no title line.
            fullTitle = lines[1].strip() if len(lines) > 1 else ""
            titleMatch = titlePattern.search(fullTitle)
            # Fall back to the raw line rather than failing when the
            # "Title:" prefix is missing.
            title = titleMatch.group(2) if titleMatch else fullTitle
            abstract = "".join(line.strip() + " " for line in lines[2:] if line.strip())
            wr.writerow([fi[:-4], title, abstract])
    print("Extracted files:", fCount)

# Extract Pubmed Abstracts

In [None]:
# PubMed extraction step: unpack the downloaded collection, keep only the
# gold-standard documents, and write their titles/abstracts to a TSV file.
# goldIDsPath and extractedFeaturesFile are module-level names that the
# extra-abstracts cell below reuses.

# Path containing the downloaded txt collection
pubmedAbstractsPath = "/TREC/TXT-Collection/pubmed"
# Path containing the extracted txt collection
pubmedAbstractsNewPath = join(pubmedAbstractsPath,"medline_txt/medline_txt")
# Path containing the Annotated Gold-Standard File
# NOTE(review): this path has no file extension, while the annotations cell
# reads "../resources/gold-doc-IDs.csv" — confirm this is the intended file.
goldIDsPath = "../resources/gold-docs-annotations"
# Output file
extractedFeaturesFile = "../resources/relevant-abstracts-TXT.csv"

# Decompress files
# Step 1: unpack every .tar.gz in the download folder into a sibling folder.
decompressTarGz(pubmedAbstractsPath)
# Step 2: list the .gz files inside the extracted collection.
abstractsGzFiles = getGzFileNames(pubmedAbstractsNewPath)

# Step 3: run gunzip on each .gz file. The exit status is not checked, so
# "Done" is printed even when gunzip fails.
for abstractsGzFile in abstractsGzFiles:
    print("Extracting: ", abstractsGzFile)
    subprocess.call(['gunzip', '-d', join(pubmedAbstractsNewPath, abstractsGzFile)])
    print("Done")

# Step 4: extract only the gold-standard documents from the .tar files into
# the "Gold" sub-folder, then write their features to the output file
# (extractFeatures overwrites it and writes the header row).
unzipTar(pubmedAbstractsNewPath, goldIDsPath, "Gold")
extractFeatures(pubmedAbstractsNewPath + "/Gold", extractedFeaturesFile)

# Extract Extra Abstracts

In [None]:
# Extra-abstracts extraction step: decompress the downloaded archives, keep
# only the gold-standard documents, and append their titles/abstracts to the
# features file. Relies on goldIDsPath and extractedFeaturesFile being
# defined by the PubMed extraction cell.

# Path containing the downloaded txt collection (extra abstracts)
extraAbstractsPath = "/TREC/TXT-Collection/extra_abstracts"
# Path containing the extracted txt collection (extra abstracts)
extraAbstractsNewPath = join(extraAbstractsPath,"extra_abstracts")
abstractsGzFiles = getGzFileNames(extraAbstractsPath)

# The .gz names were listed from extraAbstractsPath, so they must be joined
# with that same directory (the previously used name `abstractsPath` is not
# defined anywhere and raised a NameError).
# The exit status of gunzip is not checked, so "Done" is printed regardless.
for abstractsGzFile in abstractsGzFiles:
    print("Extracting: ", abstractsGzFile)
    subprocess.call(['gunzip', '-d', join(extraAbstractsPath, abstractsGzFile)])
    print("Done")

# Extract the gold-standard documents next to the archives, then append their
# features (no header row) to the file started by the PubMed step.
unzipTar(extraAbstractsPath, goldIDsPath)
extractExtraFeatures(extraAbstractsNewPath, extractedFeaturesFile)

# Read Abstracts

In [None]:
# Load the extracted abstracts. The file starts with the header row written by
# extractFeatures ("id", "title", "abstract"); header=0 consumes that row and
# replaces it with the column names below, so it is not read as a document.
# Document IDs are read with object dtype so they are not converted to numbers.
txtAbstracts = pd.read_csv("../resources/relevant-abstracts-TXT.csv", sep='\t', header=0, names=["trec_doc_id", "title", "abstract"], dtype={'trec_doc_id':object})
txtAbstracts.shape

# Read Annotations

In [None]:
# Load the gold-standard annotations; topic numbers are read with object
# dtype so they are not converted to numbers.
annotations = pd.read_csv(
    "../resources/gold-doc-IDs.csv",
    sep=',',
    encoding="utf-8",
    dtype={'trec_topic_number': object},
)
annotations.shape

# Read Relevance Score

In [None]:
# Load the relevance judgements: a space-separated file without a header row
# whose four columns are named below. Topic numbers are read with object
# dtype so they are not converted to numbers.
relevance = pd.read_csv(
    "../resources/qrels-treceval-abstracts.2017.txt",
    sep=' ',
    encoding="utf-8",
    header=None,
    names=["trec_topic_number", "x", "trec_doc_id", "relevance_score"],
    dtype={'trec_topic_number': object},
)
relevance.shape

# Read Topics

## 2017

In [None]:
# Parse the 2017 topics XML into one row per topic. The demographic text is
# split on spaces into age and sex; the "other" text is split on commas into
# up to three separate columns (missing entries are None).
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex', 'trec_topic_other1', 'trec_topic_other2', 'trec_topic_other3']
topicsXML = etree.parse("../resources/topics2017.xml")
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
topicRows = []
for topic in topicsXML.getroot():
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    other = topic.find('other').text.split(',')
    other1 = other[0]
    # Use >= so the second entry is kept when a topic lists three or more
    # (the previous `== 2` test left other2 empty in that case).
    other2 = other[1] if len(other) >= 2 else None
    other3 = other[2] if len(other) >= 3 else None
    topicRows.append([topicNumber, disease, age, sex, other1, other2, other3])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
topics.shape

## 2018

In [None]:
# Parse the 2018 topics XML into one row per topic. The demographic text is
# split on spaces into age and sex. Running this cell replaces the `topics`
# frame built from the 2017 file.
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex']
topicsXML = etree.parse("../resources/topics2018.xml")
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
topicRows = []
for topic in topicsXML.getroot():
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    topicRows.append([topicNumber, disease, age, sex])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
topics.shape

# Merge Relevance Score and Annotations

In [None]:
# Left-join the relevance judgements onto the annotations by topic and
# document, then discard the unused "x" column of the relevance file.
annotationsRelevance = annotations.merge(
    relevance,
    on=['trec_topic_number', 'trec_doc_id'],
    how='left',
).drop(columns=["x"])
annotationsRelevance.shape

# Merge Abstracts with Relevance Score and Annotations

In [None]:
# Left-join the extracted titles/abstracts onto every annotated document.
processedAbstracts = annotationsRelevance.merge(
    txtAbstracts,
    on=['trec_doc_id'],
    how='left',
)
processedAbstracts.shape

# Add Topics Information (2017 or 2018, whichever topics cell was run last)

In [None]:
# Left-join the topic details onto every row by topic number.
processedGoldStandard = processedAbstracts.merge(
    topics,
    on=['trec_topic_number'],
    how='left',
)
processedGoldStandard.shape

# Save the Result into a new _.csv_

In [None]:
# Write the merged table as a tab-separated file whose name starts with
# today's date (YYYYMMDD), directly followed by the fixed file name.
date = time.strftime("%Y%m%d")
processedGoldStandard.to_csv(
    path_or_buf='../resources/' + date + 'processed-goldstandard-TXT.csv',
    sep='\t',
)