In [1]:
'''In this notebook we will classify the prescence/abscence of HPO terms for 183,000 PubMed Abstracts using Facebook's
FastText algorithm. We need the following files:

articles_hpo.txt : a map of articles to the appropriate HPO terms 
pmid_abstract.txt : a map of pmids and abstracts 

NOTE: pmid_abstract.txt includes articles that do not have an HPO term, so we 
will use the list of articles in articles_hpo.txt 


Using this API: https://pypi.python.org/pypi/fasttext
'''

import fasttext as ft 

import numpy as np 
import re 
import os 
import pandas as pd 

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

#if the text files are somewhere else, change this 
data_path = os.getcwd()+'/data/'

articles_hpo_file = open(data_path+'articles_hpo.txt')
pmid_abstracts_file = open(data_path+'pmid_abstract.txt')

#make an array of pmids and set of HPO terms 
print('Parsing files...')
articles_hpo = {}
pmids = set() 
hpo_terms = set() 
for line in articles_hpo_file:
    parse = line.rstrip().split('\t')
    parse[0] = parse[0].replace('"','')
    articles_hpo[int(parse[0])] = '\t'.join(parse[1:])
    pmids.add(int(parse[0]))
    hpo_terms |= set(parse[1:])

#make a dictionary of abstracts 
pmid_abstracts = {}
for line in pmid_abstracts_file:
    parse = line.rstrip().split('\t')
    pmid_abstracts[int(parse[0])] = pmid_abstracts.get(parse[0],'')+'\t'.join(parse[1:])
    
print('Creating DataFrame...')
#make a matrix of articles and hpo terms using Pandas DataFrame
df = pd.DataFrame(index=list(pmids), columns = list(hpo_terms))
df = df.fillna(0)
#fill in our dataframe 
for pmid in pmids: 
    for hpo_term in articles_hpo[pmid].split('\t'):
        df.loc[pmid,hpo_term] = 1
    
#make sure we are only dealing with articles that have abstracts and HPO terms 
pmids = pmids.intersection(set(pmid_abstracts.keys()))
print('Working with {} articles and {} hpo terms'.format(len(pmids),len(hpo_terms)))

Parsing files...
Creating DataFrame...
Working with 157473 articles and 1307 hpo terms


In [2]:
#Function to clean and add an abstract to our list of clean abstracts as a single line 
def abstract_to_line(abstract, hpo_terms, pmid, THRESHOLD=100):
    #remove non words 
    clean_abstract = re.sub("[^a-zA-Z0-9]"," ", abstract)
    #create the label string
    hpo_terms = hpo_terms.split('\t')
    #check whether we have enough example documents for HPO terms 
    #df is the dataframe of HPO terms and documents 
    for hpo in hpo_terms: 
        if np.sum(df.loc[:,hpo].values)<THRESHOLD:
            hpo_terms.remove(hpo)
    #if there are no HPO terms left, return nothing
    if len(hpo_terms)==0:
        return
    label_string = '__label__'+ ' , __label__'.join(hpo_terms)
    return(label_string+' , {} , '.format(pmid)+clean_abstract.lower().rstrip()+'\n')

#count the number of unique labels in a text file
def count_hpo_terms(file_path,label_prefix='__label__'):
    hpo_terms = set()
    for line in open(file_path):
        for word in line.split():
            if word.startswith(label_prefix):
                hpo_terms.add(word.split(label_prefix)[-1])
    return len(hpo_terms),hpo_terms


In [12]:
#array that we will shuffle for test/train splits 
indices = np.arange(len(pmids))

#generate train/test split 
train_index, test_index = train_test_split(indices, test_size=.2)
pmids = np.array(list(pmids))
train_pmids = pmids[train_index]
test_pmids = pmids[test_index]

#delete the train/test file if it exists 
try: 
    os.remove(data_path+'train.txt')
    print('Generating train file...')
except:
    print('Generating train file...')
try: 
    os.remove(data_path+'test.txt')
    print('Generating test file...')
except:
    print('Generating test file...')

train_file = open(data_path+'train.txt','a')
test_file = open(data_path+'test.txt','a')


#populate the test/train files
THRESHOLD = 5000
for pmid in train_pmids:
    line = abstract_to_line(pmid_abstracts[pmid],articles_hpo[pmid],pmid, THRESHOLD)
    if line is not None:
        train_file.write(line)

for pmid in test_pmids:
    line = abstract_to_line(pmid_abstracts[pmid],articles_hpo[pmid],pmid, THRESHOLD)
    if line is not None:
        test_file.write(line)

test_file.close()
train_file.close()



Generating train file...
Generating test file...


In [13]:
#train the classifier 
classifier = ft.supervised(data_path+'train.txt', 'model', label_prefix='__label__')

In [14]:
#test the classifier 
result = classifier.test(data_path+'test.txt')
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of abstracts:', result.nexamples)
print('Number of HPO terms:', count_hpo_terms(data_path+'test.txt')[0])

P@1: 0.4849689261453967
R@1: 0.44194929206453737
Number of abstracts: 13838
Number of HPO terms: 795
