In [148]:
import os.path
import random
import re
import sys
import xml.etree.ElementTree as ET

import trustedanalytics as ia

# Connect the trustedanalytics client to its server; the recorded output
# below reports the server URL and the user of the active connection.
ia.connect()

Already connected.  This client instance connected to server http://localhost:9099/v1 (version=0.4.3-201511049238) as user test_api_key_1 at 2015-11-16 14:03:11.171843.


In [133]:
# CONSTANTS...
# HDFS directory holding the tutorial data; joined with MEDLINEDIR to build
# the path that is handed to parse_xml_to_frame.
HDFS_DATADIR_PATH = "data/TAPfest"
# Sub-directory of HDFS_DATADIR_PATH that is parsed as XML input.
MEDLINEDIR = "PubMed"
# XML tag passed to ia.XmlFile when the input is turned into a frame.
ARTICLE_SEPARATION_TAG = "PubmedArticle"
# NOTE(review): DEFAULT_MONTH and DEFAULT_DAY are not referenced anywhere in
# the visible part of this notebook -- confirm whether they are still needed.
DEFAULT_MONTH = 1
DEFAULT_DAY = 1
# Sentinel returned by get_publication_year when no year can be extracted.
DEFAULT_MISSING_YEAR = 1

In [None]:
def parse_xml_to_frame(path, tag, name):
    """
    Helper function to convert an xml file on the hdfs into a data frame.

    If a frame called `name` already exists it is dropped first (a notice is
    written to stderr), then a new frame is built under that name.

    Parameters:
        path: location of the XML input; passed straight to ia.XmlFile.
        tag:  XML tag passed to ia.XmlFile (presumably the element that
              delimits one record -- confirm against the XmlFile docs).
        name: name given to the newly created frame.

    Returns:
        The ia.Frame created from the XML input.
    """
    xml = ia.XmlFile(path, tag)

    # Check that the frame doesn't already exist. Drop it, if it does...
    if name in ia.get_frame_names():
        sys.stderr.write("Dropping existing frame named {NAME}...\n".format(NAME=name))
        ia.drop_frames(name)
    return ia.Frame(xml, name=name)

In [None]:
# Create Frame...
# NOTE(review): parse_xml_to_frame already drops an existing frame with the
# requested name, so this explicit drop looks redundant -- confirm before removing.
ia.drop_frames("tutorial_pubmed_frame")
# Parse the XML found under HDFS_DATADIR_PATH/MEDLINEDIR into a frame, using
# ARTICLE_SEPARATION_TAG as the tag handed to ia.XmlFile.
tutorial_pubmed_frame = parse_xml_to_frame(path=os.path.join(HDFS_DATADIR_PATH, MEDLINEDIR), tag=ARTICLE_SEPARATION_TAG, name="tutorial_pubmed_frame")

In [None]:
# List the column names of the frame that was just created.
tutorial_pubmed_frame.column_names

In [170]:
# Let's extract some data!
def get_pmid(row):
    """Function to extract the publication-unique identifier.

    Reads the raw XML text from the first column of `row` and returns the
    digits of the first `<PMID Version="1">` element as a string.

    Returns the sentinel 0 when no such element is present or when the cell
    does not hold text.
    """
    xml = row[0]
    try:
        return re.findall(r'\<PMID Version\=\"1\"\>(\d+)\</PMID\>', xml)[0]
    except (IndexError, TypeError):
        # IndexError: nothing matched; TypeError: the cell is not text (e.g. None).
        # Anything else is a genuine error and is allowed to propagate.
        return 0

def get_mesh_terms(row):
    """Function to extract subject headings.

    Looks at the first `<KeywordList Owner="NLM">` element in the raw XML
    (first column of `row`) and collects every keyword flagged
    `MajorTopicYN="Y"`.

    Returns:
        The keywords joined with "|" (a single keyword is returned as-is),
        or '' when there is no keyword list, no major-topic keyword, or the
        cell does not hold text.  The result is always a string.
    """
    xml = row[0]
    try:
        keywords = re.findall(r'\<KeywordList Owner\=\"NLM\"\>(.*?)\</KeywordList\>', xml, re.DOTALL)[0]
    except (IndexError, TypeError):
        # No NLM keyword list (IndexError) or a non-text cell (TypeError).
        return ''
    kws = re.findall(r'<Keyword MajorTopicYN\=\"Y\">(.*?)\</Keyword\>', keywords)
    # join() covers all cases: '' for no keywords, the bare keyword for one,
    # and a "|"-separated list for several -- never None.
    return "|".join(kws)

def get_article_title(row):
    """Function to extract article title.

    The raw XML (first column of `row`) must contain an
    `<Article PubModel="Print">` element; the first `<ArticleTitle>` found
    in the XML is then returned with a leading "[" and a trailing "]" or
    "]." removed.

    Returns:
        The cleaned title, or the placeholder string 'None' when the article
        element is absent, the title is absent or empty, or the cell does
        not hold text.
    """
    xml = row[0]
    try:
        has_print_article = re.search(r'\<Article PubModel\=\"Print\"\>(.*?)\</Article\>', xml, re.DOTALL) is not None
    except TypeError:
        # The cell is not text (e.g. None).
        has_print_article = False
    if not has_print_article:
        return 'None'

    titles = re.findall(r'\<ArticleTitle\>\[?(.*?)\]?\</ArticleTitle\>', xml)
    if not titles:
        return 'None'

    title = titles[0]
    # The lazy capture can still end in a bracket; strip it here.
    if title.endswith("]."):
        title = title[:-2]
    elif title.endswith("]"):
        title = title[:-1]
    # An empty title is treated like a missing one.
    return title or 'None'

def get_publication_year(row):
    """Function to extract publication year.

    Takes the first `<Year>` element inside the first `<Journal>` element of
    the raw XML (first column of `row`) and returns it as an integer, to
    match the integer column this function populates.

    Returns:
        The year as an int, or DEFAULT_MISSING_YEAR when there is no journal
        or year element, the year is not numeric, or the cell does not hold
        text.
    """
    xml = row[0]
    try:
        journal = re.findall(r'\<Journal\>(.*?)\</Journal\>', xml, re.DOTALL)[0]
        year = re.findall(r'\<Year\>(.*?)\</Year\>', journal)[0]
        return int(year)
    except (IndexError, TypeError, ValueError):
        # IndexError: element missing; TypeError: non-text cell;
        # ValueError: <Year> content is not an integer.
        return DEFAULT_MISSING_YEAR

# Add one column per extractor; each extractor reads the raw XML from the
# first column of the row. PMID, MeSH and TITLE are declared as str,
# PUBYEAR as ia.int64.
tutorial_pubmed_frame.add_columns(get_pmid, ("PMID", str))
tutorial_pubmed_frame.add_columns(get_mesh_terms, ("MeSH", str))
tutorial_pubmed_frame.add_columns(get_article_title, ("TITLE", str))
tutorial_pubmed_frame.add_columns(get_publication_year, ("PUBYEAR", ia.int64))



In [146]:
# Let's confirm that all columns were added...
# NOTE(review): the recorded output already lists a 'GS' column, which this
# notebook only adds in a later cell -- the cells were run out of order.
tutorial_pubmed_frame.column_names

[u'data_lines', u'PMID', u'MeSH', u'TITLE', u'GS', u'PUBYEAR']

In [172]:
# Let's generate a table describing the number of publications x year...
# Count the rows for each distinct PUBYEAR value.
tmp_count = tutorial_pubmed_frame.group_by('PUBYEAR', ia.agg.count)
# Order the summary by descending count.
tmp_count.sort(columns="count", ascending=False)
# Print every row of the summary (n is set to its full row count).
# A PUBYEAR of 1 is the DEFAULT_MISSING_YEAR sentinel, not a real year.
tmp_count.inspect(n=tmp_count.row_count)



[##]  PUBYEAR  count
[0]      1948  57887
[1]      1947  57159
[2]      1949  54133
[3]      1946  46673
[4]         1  20857
[5]      1945  14162
[6]      1950  12103
[7]      1944     13
[8]      1942      7
[9]      1940      3
[10]     1930      1

In [173]:
# Let's filter out nonsense data...
# First record the row count before any filter is applied, for comparison.
tutorial_pubmed_frame.row_count

262998

In [175]:
def filter_mesh(row):
    """Row predicate: True only when the row has a non-empty MeSH value.

    A missing value (None) is rejected as well as the empty string, because
    add_gs later calls .lower() on this field.
    """
    return row['MeSH'] not in (None, '')

def filter_title(row):
    """Row predicate: False when TITLE is the 'None' placeholder, True otherwise."""
    title = row['TITLE']
    return title != 'None'

def filter_pubyear(row):
    """Row predicate: True when PUBYEAR holds a real year.

    Compares against DEFAULT_MISSING_YEAR, the same sentinel that
    get_publication_year returns when no year could be extracted, so the
    two stay in sync if the sentinel is ever changed.
    """
    return row['PUBYEAR'] != DEFAULT_MISSING_YEAR

# Filter out any rows without subject headings, title, or publication year...
# NOTE: filter() changes tutorial_pubmed_frame itself -- the row count
# reported in the next cell is lower than the one recorded before this cell.
tutorial_pubmed_frame.filter(filter_mesh)
tutorial_pubmed_frame.filter(filter_title)
tutorial_pubmed_frame.filter(filter_pubyear)



In [176]:
# Confirm that rows were dropped (compare with the row count recorded
# before the filters were applied)...
tutorial_pubmed_frame.row_count

242141

In [177]:
# Let's inspect the frame schema...
# The schema is shown as a list of (column name, data type) pairs.
tutorial_pubmed_frame.schema

[(u'data_lines', unicode),
 (u'PMID', unicode),
 (u'MeSH', unicode),
 (u'TITLE', unicode),
 (u'GS', numpy.int32),
 (u'PUBYEAR', numpy.int64)]

In [126]:
# We can extract a single row...
tmp = tutorial_pubmed_frame.take(1)
# Show the first returned row without its leading column ('data_lines',
# which holds the raw XML the extractors read).
tmp[0][1:]

[u'15425272',
 u'SKIN',
 u"So-called Gougerot's trisymptomatic disease; a clinical case",
 1,
 u'0005-01-01T00:00:00.000-07:52:58']

In [178]:
# Let's inspect the frame...
# Preview 15 rows, showing every column except the raw 'data_lines' one.
tutorial_pubmed_frame.inspect(n=15, columns=[i for i in tutorial_pubmed_frame.column_names if i != 'data_lines'])

[##]  PMID      MeSH                                         
[0]   15425272  SKIN
[1]   15425271  CURARE|GASTROINTESTINAL DISEASE|PHLEBITIS
[2]   15425270  MENINGES
[3]   15425269  DERMATITIS|STREPTOMYCIN
[4]   15425268  LAURENCE-MOON-BIEDL SYNDROME
[5]   15425267  ERYSIPELAS|HERPES
[6]   15425266  CORNEA
[7]   15425265  ROENTGEN RAYS|THORAX
[8]   15425264  ARTERY|HEART
[9]   15418895  TISSUE
[10]  15418894  ARTERY|BLOOD VESSELS
[11]  15418893  ABNORMALITIES AND DEFORMITIES|FETUS|MONSTERS
[12]  15418892  EMBRYOLOGY
[13]  15418891  CARTILAGE
[14]  15418890  BRAIN|EMBRYOLOGY|FISH|NERVOUS SYSTEM, CENTRAL

[##]  TITLE                                                                     
[0]   So-called Gougerot's trisymptomatic disease; a clinical case
[1]   Curare in gastric disorders and in phlebitis
[2]   Tuberculous meningitis clinically cured
[3]   Dermatosis caused by streptomycin
[4]   Laurence-Moon-Biedl syndrome
[5]   Herpes zoster of the VIII beginning with erysipelas; Ramsay Hun

In [None]:
# Let's add some gold-standard labels...
def add_gs(row):
    """Function to add gold-standard labels based on whether the term skin appears in subject headings.

    The check is a case-insensitive substring match on the row's MeSH value.

    Returns:
        1 when 'skin' occurs in the MeSH value, otherwise 0.  A missing
        (None) or empty MeSH value yields 0 instead of raising.
    """
    mesh = row['MeSH'] or ''
    return 1 if 'skin' in mesh.lower() else 0

# Rebuild the GS label column. Only drop it when a previous run left one
# behind: on a fresh run no earlier cell has created a GS column.
if "GS" in tutorial_pubmed_frame.column_names:
    tutorial_pubmed_frame.drop_columns("GS")
tutorial_pubmed_frame.add_columns(add_gs, ("GS", ia.int32))

In [None]:
# Confirm that it worked ('GS' should now be among the column names)...
tutorial_pubmed_frame.column_names

In [None]:
# Get a summary of the labels...
# GS is the 0/1 label produced by add_gs.
tutorial_pubmed_frame.categorical_summary("GS")

In [None]:
# Generate the frame needed to perform topic modeling...
# Create a working copy...
lts = tutorial_pubmed_frame.copy()

# Flatten on the separating character...
# ("|" is the delimiter get_mesh_terms uses when joining several headings.)
lts.flatten_column("MeSH", "|")

# We need to create a table of PMID x MeSH term x Count...
lts_lda_input = lts.group_by(["PMID", "MeSH"], ia.agg.count)
# Drop a frame left over from a previous run so the name can be reused.
if "lts_lda_input_renamed" in ia.get_frame_names():
    ia.drop_frames("lts_lda_input_renamed")
lts_lda_input_renamed = lts_lda_input.copy(name = "lts_lda_input_renamed")

In [None]:
# Set up LDA model...
# Set the parameters for the LDA model...
# nTopics is reused later to name the topic_1 .. topic_N feature columns.
nTopics = 20
# NOTE(review): only 2 iterations -- presumably kept low so the tutorial
# runs quickly; confirm before relying on the resulting topics.
max_iterations = 2

# Clean up any old attempts...
ia.drop_models(['lts_lda_model'])

# Create LDA model..
lda_model =  ia.LdaModel(name='lts_lda_model')
# Train on the PMID x MeSH x count table. Despite its name, lts_lda_model
# holds the value returned by train(), not the model object itself.
lts_lda_model = lda_model.train(lts_lda_input_renamed, "PMID", "MeSH", 'count', num_topics=nTopics, max_iterations=max_iterations)

# We can extract the per-id topic distributions...
doc_mixes = lts_lda_model['topics_given_doc']

In [None]:
# Clean up old approaches...
ia.drop_frames(['lts_lda_feature_frame'])

# Copy the document x topic frame...
lts_lda_feature_frame = doc_mixes.copy(name='lts_lda_feature_frame')

# Create schema...
# One ('topic_<i>', ia.float64) entry for i = 1 .. nTopics.
schema = map(lambda i: ('topic' + "_" + str(i), ia.float64), range(1, nTopics + 1))

# Create features frame...
# The row's 'topic_probabilities' value is returned as-is to fill the new
# columns -- assumes it holds exactly nTopics values; TODO confirm.
lts_lda_feature_frame.add_columns(lambda row: (row['topic_probabilities']), schema)
# The original column is no longer needed once the topic_<i> columns exist.
lts_lda_feature_frame.drop_columns(['topic_probabilities'])

In [None]:
# Preview the per-document topic feature frame.
lts_lda_feature_frame.inspect()

In [None]:
# Join the article frame with the per-document topic features on PMID.
tutorial_pubmed_lda = tutorial_pubmed_frame.join(lts_lda_feature_frame, 'PMID')

In [None]:
# Preview the joined frame, showing every column except the raw 'data_lines' one.
tutorial_pubmed_lda.inspect(columns=[i for i in tutorial_pubmed_lda.column_names if i != 'data_lines'])

In [None]:
# Partition data set into train and test collections...
TEST_FRACTION = 0.10  # share of the unique PMIDs held out for testing
SPLIT_SEED = 2015     # fixed so the split is identical on every run

pmids = tutorial_pubmed_lda.download(n=tutorial_pubmed_lda.row_count, columns='PMID')
# Sort before shuffling so the seeded shuffle starts from a deterministic order.
unique_pmids = sorted(set(pmids['PMID']))
random.seed(SPLIT_SEED)
random.shuffle(unique_pmids)
n_test = int(len(unique_pmids) * TEST_FRACTION)
test_pmids = unique_pmids[:n_test]
train_pmids = unique_pmids[n_test:]


def label_train_test(row, test_list=frozenset(test_pmids), train_list=train_pmids):
    """Simple function to partition into train/test collections based on look-up.

    Parameters:
        row:        frame row; only its 'PMID' value is read.
        test_list:  collection of PMIDs that belong to the test set.  The
                    default is a frozenset so the per-row membership test
                    is a hash look-up rather than a list scan.
        train_list: accepted for symmetry; not consulted, because every PMID
                    that is not in `test_list` is labelled as training data.

    Returns:
        "TEST" when the row's PMID is in `test_list`, otherwise "TRAIN".
    """
    if row['PMID'] in test_list:
        return "TEST"
    return "TRAIN"

tutorial_pubmed_lda.add_columns(label_train_test, ("DATASET", str))

In [None]:
# Build separate frames for the two partitions. Each starts as a copy of the
# joined frame so that filtering does not alter tutorial_pubmed_lda itself.
trainframe = tutorial_pubmed_lda.copy()
trainframe.filter(lambda row: row['DATASET'] == "TRAIN")
testframe = tutorial_pubmed_lda.copy()
testframe.filter(lambda row: row['DATASET'] == "TEST")

In [None]:
# Instantiate the random forest classifier model; the string argument is
# presumably the model's name on the server -- confirm against the API docs.
rf_model_tutorial = ia.RandomForestClassifierModel('rf_model_intel_tutorial1')

In [None]:
# Train the classifier on the TRAIN partition to predict the GS label.
# NOTE(review): only topic_1 and topic_2 are used as observation columns,
# although one column per topic was created -- confirm this is intended.
rf_model_tutorial_results = rf_model_tutorial.train(frame=trainframe,
                                                    label_column="GS",
                                                    observation_columns=["topic_1", "topic_2"],
                                                    num_classes=2,
                                                    impurity='gini',
                                                    max_depth=4,
                                                    # Decimal form of the original literal 01001000, which
                                                    # Python 2 read as octal (and Python 3 rejects outright).
                                                    seed=262656
                                                   )

In [None]:
# test the model
# Evaluate the trained classifier on the TEST partition against the GS label.
x = rf_model_tutorial.test(testframe, "GS")

In [179]:
# Display the test metrics.
# NOTE(review): in the recorded output the model predicts no positives at all
# (precision and recall are 0.0), so the high accuracy only reflects how rare
# the positive class is.
x

Precision: 0.0
Recall: 0.0
Accuracy: 0.990798129206
FMeasure: 0.0
Confusion Matrix: 
            Predicted_Pos  Predicted_Neg
Actual_Pos              0            242
Actual_Neg              0          26057

In [182]:
# Publish the trained model; the recorded output is the HDFS path of the
# resulting tar archive.
rf_model_tutorial.publish()



u'hdfs://master.poc-mtsinai.gao.cluster:8020/user/iauser/models_7e6588c9ba6d490bba5811484a76861a.tar'