In [60]:
import os.path
import random
import re
import sys

import trustedanalytics as ia

# Connect this client session to the trustedanalytics server.
ia.connect()

Already connected.  This client instance connected to server http://localhost:9099/v1 (version=0.4.3-201511049238) as user test_api_key_1 at 2015-11-16 14:03:11.171843.


In [61]:
# Constants describing where the input data lives.
# Base data directory; joined with MEDLINEDIR to form the XML input path.
HDFS_DATADIR_PATH = "data/TAPfest"
# Sub-directory of HDFS_DATADIR_PATH that holds the PubMed XML.
MEDLINEDIR = "PubMed"
# NOTE(review): "/PubmedArticle" presumably refers to the record tag later
# passed to parse_xml_to_frame (tag="PubmedArticle") -- confirm or remove.

In [62]:
def parse_xml_to_frame(path, tag, name):
    """
    Helper function to convert an xml file on the hdfs into a data frame.

    Args:
        path: Location of the XML data, handed to ``ia.XmlFile``.
        tag: XML element name handed to ``ia.XmlFile`` (e.g. "PubmedArticle").
        name: Name for the new frame. If a frame with this name already
            exists it is dropped first and a notice is written to stderr.

    Returns:
        The newly created ``ia.Frame``.
    """
    xml = ia.XmlFile(path, tag)

    # Frame names must not clash: drop a stale frame of the same name first.
    # (Requires ``sys`` to be imported at the top of the notebook.)
    if name in ia.get_frame_names():
        sys.stderr.write("Dropping existing frame named {NAME}...\n".format(NAME=name))
        ia.drop_frames(name)
    return ia.Frame(xml, name=name)

In [63]:
# Remove any frame left behind by an earlier run, then load the PubMed XML
# (one row per <PubmedArticle> element) into a fresh named frame.
ia.drop_frames("tutorial_pubmed_frame")
tutorial_pubmed_frame = parse_xml_to_frame(
    path=os.path.join(HDFS_DATADIR_PATH, MEDLINEDIR),
    tag="PubmedArticle",
    name="tutorial_pubmed_frame"
)



In [64]:
# List the columns of the freshly loaded frame.
tutorial_pubmed_frame.column_names

[u'data_lines']

In [None]:
# Pull a single row from the frame so a sample record can be examined.
tmp = tutorial_pubmed_frame.take(1)

In [None]:
# First column of the first fetched row (the 'data_lines' value).
tmp[0][0]

In [66]:
def get_pmid(row):
    """
    Extract the PubMed ID from a row whose first column holds the article XML.

    Args:
        row: Indexable row; ``row[0]`` is the XML text of one article.

    Returns:
        The digits of the first ``<PMID Version="1">`` element as a string,
        or the integer 0 when no such element is found or ``row[0]`` is not
        text.
    """
    xml = row[0]
    try:
        match = re.search(r'\<PMID Version\=\"1\"\>(\d+)\</PMID\>', xml)
    except TypeError:
        # row[0] is not a string (e.g. None): treat as "no PMID".
        return 0
    # NOTE(review): the fallback is the int 0 although the column is declared
    # as str where this function is registered -- kept for compatibility.
    return match.group(1) if match else 0

def get_mesh_terms(row):
    """
    Collect the major-topic keywords of an article as one '|'-joined string.

    Looks at the first ``<KeywordList Owner="NLM">`` element in ``row[0]`` and
    gathers every ``<Keyword MajorTopicYN="Y">`` inside it.

    Args:
        row: Indexable row; ``row[0]`` is the XML text of one article.

    Returns:
        The keywords joined with "|" (a single keyword is returned as is), or
        '' when there is no NLM keyword list, no major-topic keyword in it,
        or ``row[0]`` is not text.
    """
    xml = row[0]
    try:
        keyword_list = re.search(
            r'\<KeywordList Owner\=\"NLM\"\>(.*?)\</KeywordList\>', xml, re.DOTALL)
    except TypeError:
        # row[0] is not a string (e.g. None): treat as "no keywords".
        return ''
    if keyword_list is None:
        return ''
    kws = re.findall(r'<Keyword MajorTopicYN\=\"Y\">(.*?)\</Keyword\>', keyword_list.group(1))
    # join() covers zero, one and many keywords alike; the empty case used to
    # fall off the end of the function and return None instead of ''.
    return "|".join(kws)

def get_article_title(row):
    """
    Extract the article title from a row whose first column holds article XML.

    Only articles containing an ``<Article PubModel="Print">`` element are
    considered. A leading "[" and a trailing "]" or "]." around the title are
    stripped.

    Args:
        row: Indexable row; ``row[0]`` is the XML text of one article.

    Returns:
        The title text, or the placeholder string 'None' when the article is
        not a print article, has no single-line ``<ArticleTitle>`` element,
        has an empty title, or ``row[0]`` is not text.
    """
    xml = row[0]
    try:
        is_print_article = re.search(
            r'\<Article PubModel\=\"Print\"\>(.*?)\</Article\>', xml, re.DOTALL)
        if is_print_article is None:
            return 'None'
        # The title is searched in the whole record, without DOTALL, so a
        # title spanning several lines does not match.
        match = re.search(r'\<ArticleTitle\>\[?(.*?)\]?\</ArticleTitle\>', xml)
    except TypeError:
        # row[0] is not a string (e.g. None).
        return 'None'
    if match is None:
        # Previously this path assigned to a misspelled name and then failed
        # on ``return title``.
        return 'None'

    title = match.group(1)
    # The lazy group can still end in "]" or "]." when the closing bracket is
    # followed by more text; trim that remainder.
    if title.endswith("]"):
        title = title[:-1]
    elif title.endswith("]."):
        title = title[:-2]
    return title or 'None'
    
# Derive one string column per extractor function.
# To re-run this cell, the derived columns presumably need dropping first:
#     tutorial_pubmed_frame.drop_columns(["PMID", "MeSH", "TITLE"])
for extractor, column_schema in (
        (get_pmid, ("PMID", str)),
        (get_mesh_terms, ("MeSH", str)),
        (get_article_title, ("TITLE", str))):
    tutorial_pubmed_frame.add_columns(extractor, column_schema)



In [67]:
# Column list after adding the PMID, MeSH and TITLE columns.
tutorial_pubmed_frame.column_names

[u'data_lines', u'PMID', u'MeSH', u'TITLE']

In [68]:
# Number of rows before any filtering.
# Optional preview of the derived columns (disabled):
# tutorial_pubmed_frame.inspect(n=100, columns=[i for i in tutorial_pubmed_frame.column_names if i != 'data_lines'])
tutorial_pubmed_frame.row_count

493191

In [69]:
def filter_mesh(row):
    """
    Row predicate: True when the row carries at least one MeSH term.

    Args:
        row: Mapping-like row with a 'MeSH' entry.

    Returns:
        False when 'MeSH' is the empty string or None, True otherwise. None is
        rejected as well because later steps call string methods on the value.
    """
    mesh = row['MeSH']
    return mesh is not None and mesh != ''

def filter_title(row):
    """Row predicate: True unless the row's TITLE is the 'None' placeholder."""
    placeholder = 'None'
    return row['TITLE'] != placeholder

# Filter out any rows without MeSH terms or without a title.
# filter() changes the frame itself: row_count is lower after this cell.
tutorial_pubmed_frame.filter(filter_mesh)
tutorial_pubmed_frame.filter(filter_title)



In [70]:
# Number of rows remaining after the MeSH and title filters.
tutorial_pubmed_frame.row_count

262998

In [86]:
# Preview 15 rows, showing every column except 'data_lines'.
tutorial_pubmed_frame.inspect(n=15, columns=[i for i in tutorial_pubmed_frame.column_names if i != 'data_lines'])

[##]  PMID      MeSH                                         
[0]   15425272  SKIN
[1]   15425271  CURARE|GASTROINTESTINAL DISEASE|PHLEBITIS
[2]   15425270  MENINGES
[3]   15425269  DERMATITIS|STREPTOMYCIN
[4]   15425268  LAURENCE-MOON-BIEDL SYNDROME
[5]   15425267  ERYSIPELAS|HERPES
[6]   15425266  CORNEA
[7]   15425265  ROENTGEN RAYS|THORAX
[8]   15425264  ARTERY|HEART
[9]   15418895  TISSUE
[10]  15418894  ARTERY|BLOOD VESSELS
[11]  15418893  ABNORMALITIES AND DEFORMITIES|FETUS|MONSTERS
[12]  15418892  EMBRYOLOGY
[13]  15418891  CARTILAGE
[14]  15418890  BRAIN|EMBRYOLOGY|FISH|NERVOUS SYSTEM, CENTRAL

[##]  TITLE                                                                     
[0]   So-called Gougerot's trisymptomatic disease; a clinical case
[1]   Curare in gastric disorders and in phlebitis
[2]   Tuberculous meningitis clinically cured
[3]   Dermatosis caused by streptomycin
[4]   Laurence-Moon-Biedl syndrome
[5]   Herpes zoster of the VIII beginning with erysipelas; Ramsay Hun

In [78]:
# Re-check the column list before adding the GS label column.
tutorial_pubmed_frame.column_names

[u'data_lines', u'PMID', u'MeSH', u'TITLE']

In [84]:
def add_gs(row):
    """
    Compute the GS label for a row.

    Returns 1 when the lower-cased 'MeSH' string contains the substring
    'skin', and 0 otherwise.
    """
    mentions_skin = 'skin' in row['MeSH'].lower()
    return 1 if mentions_skin else 0

# Keep the cell re-runnable without depending on earlier kernel state: drop
# "GS" only when a previous run already added it.
# NOTE(review): an unconditional drop presumably fails when the column does
# not exist yet -- confirm against the trustedanalytics API.
if "GS" in tutorial_pubmed_frame.column_names:
    tutorial_pubmed_frame.drop_columns("GS")
tutorial_pubmed_frame.add_columns(add_gs, ("GS", ia.int32))



In [80]:
# Column list after adding the GS label column.
tutorial_pubmed_frame.column_names

[u'data_lines', u'PMID', u'MeSH', u'TITLE', u'GS']

In [85]:
# Frequency and share of each GS label value.
tutorial_pubmed_frame.categorical_summary("GS")



{u'categorical_summary': [{u'column': u'GS',
   u'levels': [{u'frequency': 260474,
     u'level': u'0',
     u'percentage': 0.9904029688438696},
    {u'frequency': 2524, u'level': u'1', u'percentage': 0.009597031156130465},
    {u'frequency': 0, u'level': u'Missing', u'percentage': 0.0},
    {u'frequency': 0, u'level': u'Other', u'percentage': 0.0}]}]}

In [87]:
# Work on a copy so the article frame itself is left untouched.
lts = tutorial_pubmed_frame.copy()

# Split the "|"-joined MeSH string on its separator.
lts.flatten_column("MeSH", "|")

# LDA input: a table of PMID x MeSH term x count.
lts_lda_input = lts.group_by(
    ["PMID", "MeSH"],
    ia.agg.count)

# Frame names must be unique, so drop a same-named frame from an earlier run.
if "lts_lda_input_renamed" in ia.get_frame_names():
    ia.drop_frames("lts_lda_input_renamed")
lts_lda_input_renamed = lts_lda_input.copy(name="lts_lda_input_renamed")



In [88]:
# LDA hyper-parameters.
nTopics = 20
max_iterations = 2

# Remove any model left over from an earlier attempt.
ia.drop_models(['lts_lda_model'])

# Build the LDA model and train it on the (PMID, MeSH, count) table.
lda_model = ia.LdaModel(name='lts_lda_model')
lts_lda_model = lda_model.train(
    lts_lda_input_renamed,
    "PMID",
    "MeSH",
    'count',
    num_topics=nTopics,
    max_iterations=max_iterations)

# The training result exposes the per-document topic distributions.
doc_mixes = lts_lda_model['topics_given_doc']



In [89]:
# Remove the feature frame of an earlier run, if any.
ia.drop_frames(['lts_lda_feature_frame'])

# Start from a named copy of the document x topic frame.
lts_lda_feature_frame = doc_mixes.copy(name='lts_lda_feature_frame')

# One float64 column per topic: topic_1 ... topic_<nTopics>.
schema = [('topic' + "_" + str(topic_index), ia.float64)
          for topic_index in range(1, nTopics + 1)]

# Spread 'topic_probabilities' over the per-topic columns, then drop the
# original column.
lts_lda_feature_frame.add_columns(lambda row: row['topic_probabilities'], schema)
lts_lda_feature_frame.drop_columns(['topic_probabilities'])



In [None]:
# Preview the per-document topic feature frame.
lts_lda_feature_frame.inspect()

In [90]:
# Join the article frame with the topic feature frame on PMID.
tutorial_pubmed_lda = tutorial_pubmed_frame.join(lts_lda_feature_frame, 'PMID')



In [91]:
# Preview the joined frame, showing every column except 'data_lines'.
tutorial_pubmed_lda.inspect(columns=[i for i in tutorial_pubmed_lda.column_names if i != 'data_lines'])

[#]  PMID      MeSH                                     
[0]  18101813  AORTA/stricture|DUCTUS ARTERIOSUS
[1]  18208013  STREPTOMYCIN/therapy|TUBERCULOSIS/therapy
[2]  18122306  AGAR|BACTERIA/culture mediums--apparatus
[3]  18112343  MILK
[4]  18122757  KIDNEYS/infarction
[5]  18118635  MEDICINE/history
[6]  18110679  GYNECOLOGY
[7]  18891943  BLOOD/dyscrasia|DENTISTRY/blood in
[8]  18109112  ACROMEGALY|GIGANTISM|GROWTH
[9]  18122801  LEAD/determination

[#]  TITLE                                                                      
[0]  Aortic coarctation with patent ductus arteriosus.
[1]  Not Available
[2]  An automatic agar dispenser.
[3]  Not Available
[4]  Traumatic ischemic infarction of the kidney.
[5]  Not Available
[6]  Not Available
[7]  The blood dyscrasias and their effect on the practice of dentistry.
[8]  Interrelationship between pituitary growth factor and growth-promoting androgens in acromegaly and gigantism; quantitative evaluation of bone and soft tissue growth in

In [101]:
# Hold out a random 10% of the distinct PMIDs as the test set.
pmids = tutorial_pubmed_lda.download(n=tutorial_pubmed_lda.row_count, columns='PMID')
# Sort first so the seeded shuffle always starts from the same order.
unique_pmids = sorted(set(pmids['PMID']))
# A dedicated, seeded generator makes the split reproducible without touching
# the global random state.
random.Random(20151116).shuffle(unique_pmids)
n_test = int(len(unique_pmids) * 0.10)
test_pmids = unique_pmids[:n_test]
# Previously never defined here, so the function definition below failed on a
# fresh kernel.
train_pmids = unique_pmids[n_test:]


def label_train_test(row, test_list=frozenset(test_pmids), train_list=None):
    """
    Assign a row to the "TEST" or "TRAIN" data set by its PMID.

    Args:
        row: Mapping-like row with a 'PMID' entry.
        test_list: Collection of PMIDs that belong to the test set. Defaults
            to a frozenset of ``test_pmids`` so each lookup is a hash lookup
            rather than a list scan.
        train_list: Unused; kept so existing calls that pass it still work.
            Every PMID not in ``test_list`` is labelled "TRAIN".

    Returns:
        "TEST" when the row's PMID is in ``test_list``, otherwise "TRAIN".
    """
    if row['PMID'] in test_list:
        return "TEST"
    return "TRAIN"


tutorial_pubmed_lda.add_columns(label_train_test, ("DATASET", str))



In [103]:
# Split the labelled frame into a training and a test frame. Each filter is
# applied to a copy, so tutorial_pubmed_lda is presumably left with all rows.
trainframe = tutorial_pubmed_lda.copy()
trainframe.filter(lambda row: row['DATASET'] == "TRAIN")
testframe = tutorial_pubmed_lda.copy()
testframe.filter(lambda row: row['DATASET'] == "TEST")



In [104]:
# Create a random forest classifier model named 'rf_model_intel_tutorial1'.
rf_model_tutorial = ia.RandomForestClassifierModel('rf_model_intel_tutorial1')



In [105]:
# Train the random forest on the training frame, with GS as the binary label.
# NOTE(review): only topic_1 and topic_2 of the nTopics topic columns are used
# as features -- confirm this is intentional.
# The seed was written as the bare literal 01001000, which Python 2 reads as
# octal (262656) and Python 3 rejects. The explicit 0o form keeps the same
# value and parses on both.
rf_model_tutorial_results = rf_model_tutorial.train(frame=trainframe,
                                                    label_column="GS",
                                                    observation_columns=["topic_1", "topic_2"],
                                                    num_classes=2,
                                                    impurity='gini',
                                                    max_depth=4,
                                                    seed=0o1001000
                                                   )



In [106]:
# Evaluate the trained model on the held-out test frame, with GS as the label
# column.
x = rf_model_tutorial.test(testframe, "GS")



In [107]:
# Display the classification metrics returned by test().
x

Precision: 0.0
Recall: 0.0
Accuracy: 0.990798129206
FMeasure: 0.0
Confusion Matrix: 
            Predicted_Pos  Predicted_Neg
Actual_Pos              0            242
Actual_Neg              0          26057

In [None]:
#x.publish()