In [None]:
import os.path
import random
import re
import sys

import trustedanalytics as ia

# Called once, up front, before any other ia.* call in this notebook
# (presumably opens the client session with the server -- confirm against the trustedanalytics docs).
ia.connect()

In [None]:
# CONSTANTS...
# HDFS_DATADIR_PATH and MEDLINEDIR are joined with os.path.join to form the path
# that is handed to parse_xml_to_frame when the PubMed XML is loaded.
HDFS_DATADIR_PATH = "data/TAPfest"
MEDLINEDIR = "PubMed"
# /PubmedArticle
# NOTE(review): the line above looks like a reminder of the XML tag ("PubmedArticle")
# that is passed as `tag` when loading -- confirm.

In [None]:
def parse_xml_to_frame(path, tag, name):
    """
    Helper function to convert an xml file on the hdfs into a data frame...

    If a frame called `name` already exists it is dropped first (with a notice
    on stderr), so the function can be re-run with the same name.

    :param path: location of the XML data, passed straight to ia.XmlFile
    :param tag: XML tag, passed straight to ia.XmlFile
    :param name: name given to the new frame
    :return: the ia.Frame built from the XML source
    """
    xml = ia.XmlFile(path, tag)

    # Check that the frame doesn't already exist. Drop it, if it does...
    # (sys comes from the notebook's import cell; it was previously missing, so this branch crashed)
    if name in ia.get_frame_names():
        sys.stderr.write("Dropping existing frame named {NAME}...\n".format(NAME=name))
        ia.drop_frames(name)
    return ia.Frame(xml, name=name)

In [None]:
# Drop any frame left over from an earlier run, then rebuild it from the PubMed XML
# directory using the PubmedArticle tag...
ia.drop_frames("tutorial_pubmed_frame")
tutorial_pubmed_frame = parse_xml_to_frame(
    path=os.path.join(HDFS_DATADIR_PATH, MEDLINEDIR),
    tag="PubmedArticle",
    name="tutorial_pubmed_frame",
)

In [None]:
# Show the column names of the freshly loaded frame (the bare expression is the cell's output)...
tutorial_pubmed_frame.column_names

In [None]:
def get_pmid(row):
    """
    Extract the PubMed id from a record's raw XML.

    :param row: frame row; row[0] is read as the XML text of one record
    :return: the digits of the first <PMID Version="1"> element as a string,
             or the integer 0 when there is no such element or row[0] is not text
             (the 0 fallback is kept as-is for existing callers)
    """
    xml = row[0]
    try:
        match = re.search(r'\<PMID Version\=\"1\"\>(\d+)\</PMID\>', xml)
    except TypeError:
        # row[0] was not a string (e.g. None); treat it like a record without a PMID.
        return 0
    if match is None:
        return 0
    return match.group(1)

def get_mesh_terms(row):
    """
    Extract the major-topic NLM keywords of a record as one "|"-separated string.

    Only the first <KeywordList Owner="NLM"> element is considered, and within it
    only <Keyword MajorTopicYN="Y"> entries.

    :param row: frame row; row[0] is read as the XML text of one record
    :return: the keywords joined with "|"; '' when there is no keyword list,
             when the list has no major-topic keyword, or when row[0] is not text
    """
    xml = row[0]
    try:
        keyword_list = re.search(r'\<KeywordList Owner\=\"NLM\"\>(.*?)\</KeywordList\>', xml, re.DOTALL)
    except TypeError:
        # row[0] was not a string (e.g. None); treat it like a record without keywords.
        return ''
    if keyword_list is None:
        return ''
    kws = re.findall(r'<Keyword MajorTopicYN\=\"Y\">(.*?)\</Keyword\>', keyword_list.group(1))
    # join() handles zero, one and many keywords alike. The zero case used to fall
    # off the end of the function and return None instead of ''.
    return "|".join(kws)

def get_article_title(row):
    """
    Extract the article title from a record's raw XML.

    A title is only returned for records that contain an
    <Article PubModel="Print"> element. Square brackets wrapped around the
    title, including a trailing "]." or "]", are stripped.

    :param row: frame row; row[0] is read as the XML text of one record
    :return: the title text, or the placeholder string 'None' when the record is
             not a print article, has no single-line <ArticleTitle>, has an
             empty title, or row[0] is not text
    """
    xml = row[0]
    try:
        # The Article element is used only as a gate; the title is searched in the whole record.
        if re.search(r'\<Article PubModel\=\"Print\"\>(.*?)\</Article\>', xml, re.DOTALL) is None:
            return 'None'
        match = re.search(r'\<ArticleTitle\>\[?(.*?)\]?\</ArticleTitle\>', xml)
    except TypeError:
        # row[0] was not a string (e.g. None).
        return 'None'
    if match is None:
        # Previously this path assigned to a misspelled variable and then crashed on `return title`.
        return 'None'
    title = match.group(1)
    if title.endswith("]."):
        title = title[:-2]
    elif title.endswith("]"):
        title = title[:-1]
    # An empty title gets the same placeholder, so the 'None' title filter removes it too.
    return title or 'None'
    
# NOTE(review): the commented-out drop below is presumably for re-running this cell
# against a frame that already has these columns -- confirm whether add_columns can overwrite.
# tutorial_pubmed_frame.drop_columns(["PMID", "MeSH", "TITLE"])
# Add three string columns, each computed per row by one of the extractor functions above...
tutorial_pubmed_frame.add_columns(get_pmid, ("PMID", str))
tutorial_pubmed_frame.add_columns(get_mesh_terms, ("MeSH", str))
tutorial_pubmed_frame.add_columns(get_article_title, ("TITLE", str))

In [None]:
# Show the column names again, now that PMID, MeSH and TITLE have been added...
tutorial_pubmed_frame.column_names

In [None]:
# Optional look at the first 100 rows, leaving out the 'data_lines' column (kept disabled)...
# tutorial_pubmed_frame.inspect(n=100, columns=[i for i in tutorial_pubmed_frame.column_names if i != 'data_lines'])
# Row count before any filtering...
tutorial_pubmed_frame.row_count

In [None]:
def filter_mesh(row):
    """
    Row predicate: True when the row has at least one MeSH keyword.

    Both '' and a missing value (None) count as "no keywords", so rows without
    a value cannot slip through to code that calls string methods on MeSH.

    :param row: frame row with a 'MeSH' column
    :return: bool
    """
    return bool(row['MeSH'])

def filter_title(row):
    """
    Row predicate: False only when TITLE is the placeholder string 'None'.

    :param row: frame row with a 'TITLE' column
    :return: bool
    """
    is_placeholder = row['TITLE'] == 'None'
    return not is_placeholder

# Filter out any rows without MeSH keywords or without a title...
# (the results are not assigned, so filter() presumably modifies the frame in place -- confirm)
tutorial_pubmed_frame.filter(filter_mesh)
tutorial_pubmed_frame.filter(filter_title)

In [None]:
# Row count after the MeSH and TITLE filters...
tutorial_pubmed_frame.row_count

In [None]:
# Look at 15 rows, showing every column except 'data_lines'...
tutorial_pubmed_frame.inspect(n=15, columns=[i for i in tutorial_pubmed_frame.column_names if i != 'data_lines'])

In [None]:
# Show the column names of the filtered frame...
tutorial_pubmed_frame.column_names

In [None]:
def add_gs(row):
    """
    Label a row 1 when its MeSH string contains "skin" (case-insensitive), else 0.

    The check is a plain substring test on the lower-cased MeSH value.

    :param row: frame row with a string 'MeSH' column
    :return: int, 1 or 0
    """
    return int('skin' in row['MeSH'].lower())

# Drop a GS column left over from a previous run, but only if it is there, so this
# cell also works the first time through on a freshly loaded frame...
if "GS" in tutorial_pubmed_frame.column_names:
    tutorial_pubmed_frame.drop_columns("GS")
# Add the 0/1 label computed by add_gs as an int32 column...
tutorial_pubmed_frame.add_columns(add_gs, ("GS", ia.int32))

In [None]:
# Show the column names again, now that GS has been added...
tutorial_pubmed_frame.column_names

In [None]:
# Summarise the GS label column (presumably the 0/1 level frequencies -- confirm the output format)...
tutorial_pubmed_frame.categorical_summary("GS")

In [None]:
# Create a working copy, so the flatten below does not touch tutorial_pubmed_frame...
lts = tutorial_pubmed_frame.copy()

# Flatten the MeSH column on the "|" separating character...
lts.flatten_column("MeSH", "|")

# We need to create a table of ID x MeSH term x Count: group on (PMID, MeSH) and count...
lts_lda_input = lts.group_by(["PMID", "MeSH"], ia.agg.count)
# Drop a named copy left over from an earlier run before re-creating it under the same name...
if "lts_lda_input_renamed" in ia.get_frame_names():
    ia.drop_frames("lts_lda_input_renamed")
lts_lda_input_renamed = lts_lda_input.copy(name = "lts_lda_input_renamed")

In [None]:
# Set the parameters for the LDA model...
nTopics = 20
max_iterations = 2

# Clean up any old attempts...
ia.drop_models(['lts_lda_model'])

# Create LDA model..
lda_model =  ia.LdaModel(name='lts_lda_model')
# Train on the (PMID, MeSH, count) table. Note that `lts_lda_model` holds the value
# returned by train(), which is indexed by key below, not the model object (that is `lda_model`)...
lts_lda_model = lda_model.train(lts_lda_input_renamed, "PMID", "MeSH", 'count', num_topics=nTopics, max_iterations=max_iterations)

# We can extract the per-id topic distributions...
doc_mixes = lts_lda_model['topics_given_doc']

In [None]:
# Remove the feature frame from any earlier run...
ia.drop_frames(['lts_lda_feature_frame'])

# Work on a named copy of the document x topic frame...
lts_lda_feature_frame = doc_mixes.copy(name='lts_lda_feature_frame')

# Schema: one float64 column per topic, named topic_1 ... topic_<nTopics>...
schema = [("topic_%d" % i, ia.float64) for i in range(1, nTopics + 1)]

# Spread the topic_probabilities value over those columns, then drop the original column...
lts_lda_feature_frame.add_columns(lambda row: row['topic_probabilities'], schema)
lts_lda_feature_frame.drop_columns(['topic_probabilities'])

In [None]:
# Look at the per-document topic feature columns...
lts_lda_feature_frame.inspect()

In [None]:
# Join the article frame with the topic features on PMID
# (assumes the feature frame carries a 'PMID' column from the LDA output -- confirm)...
tutorial_pubmed_lda = tutorial_pubmed_frame.join(lts_lda_feature_frame, 'PMID')

In [None]:
# Look at the joined frame, showing every column except 'data_lines'...
tutorial_pubmed_lda.inspect(columns=[i for i in tutorial_pubmed_lda.column_names if i != 'data_lines'])

In [None]:
# Seed for the train/test shuffle, so the split is the same on every run...
SPLIT_SEED = 42

# Pull every PMID back to the client and de-duplicate. Sorting first makes the
# shuffle depend only on the seed, not on set iteration order...
pmids = tutorial_pubmed_lda.download(n=tutorial_pubmed_lda.row_count, columns='PMID')
unique_pmids = sorted(set(pmids['PMID']))

# A private, seeded RNG keeps the split reproducible without touching the global random state...
random.Random(SPLIT_SEED).shuffle(unique_pmids)

# First 10% of the shuffled ids are the test set, the rest are the training set...
n_test_pmids = int(len(unique_pmids) * 0.10)
test_pmids = unique_pmids[:n_test_pmids]
train_pmids = unique_pmids[n_test_pmids:]


def label_train_test(row, test_list=frozenset(test_pmids), train_list=train_pmids):
    """
    Label a row "TEST" when its PMID is in `test_list`, otherwise "TRAIN".

    :param row: frame row with a 'PMID' column
    :param test_list: container of test PMIDs; defaults to a frozenset of
                      test_pmids so each membership check is a hash lookup
    :param train_list: kept for interface compatibility; not consulted, since
                       every PMID that is not a test id is labelled "TRAIN"
    :return: "TEST" or "TRAIN"
    """
    if row['PMID'] in test_list:
        return "TEST"
    return "TRAIN"


tutorial_pubmed_lda.add_columns(label_train_test, ("DATASET", str))

In [None]:
# Split into two frames: each starts as a full copy of the labelled frame and is
# then filtered down to its own DATASET label...
trainframe = tutorial_pubmed_lda.copy()
trainframe.filter(lambda row: row['DATASET'] == "TRAIN")
testframe = tutorial_pubmed_lda.copy()
testframe.filter(lambda row: row['DATASET'] == "TEST")

In [None]:
# Clean up any old attempts, as the LDA cell does for its model, so the model name
# can be reused when the notebook is re-run (presumably creation fails if the name is taken -- confirm)...
ia.drop_models(['rf_model_intel_tutorial1'])
rf_model_tutorial = ia.RandomForestClassifierModel('rf_model_intel_tutorial1')

In [None]:
# Train the random forest on the TRAIN rows, predicting the 0/1 GS label.
# NOTE(review): only topic_1 and topic_2 are used as features, although the feature
# frame has one column per topic -- confirm this is intentional.
# The seed was written as 01001000, which Python 2 reads as octal (= 262656) and
# Python 3 rejects as a syntax error. The same value is now spelled in decimal.
rf_model_tutorial_results = rf_model_tutorial.train(frame=trainframe,
                                                    label_column="GS",
                                                    observation_columns=["topic_1", "topic_2"],
                                                    num_classes=2,
                                                    impurity='gini',
                                                    max_depth=4,
                                                    seed=262656
                                                   )

In [None]:
# Test the model on the held-out TEST rows, with GS as the label column; the result is kept in `x`...
x = rf_model_tutorial.test(testframe, "GS")

In [None]:
# Display the result returned by rf_model_tutorial.test(...)...
x

In [None]:
#x.publish()