In [1]:
from model.data_utils import Dataset
from model.models import HANNModel
from model.config import Config
import argparse
import os
import sys
import nltk.data


In [2]:
# Sentence splitter shared by the helper functions below. The resource path
# points at NLTK's English Punkt model.
# NOTE(review): this resource is a pickle; newer NLTK releases appear to ship
# Punkt as `punkt_tab` instead -- confirm the installed NLTK still provides it.
PUNKT_ENGLISH_RESOURCE = 'tokenizers/punkt/english.pickle'
tokenizer = nltk.data.load(PUNKT_ENGLISH_RESOURCE)

# Provide the dataset name and the path to the trained parameters

In [3]:
# One entry per classifier to load: "dataset" is the dataset name handed to
# Config, "weights" is the path of the trained parameters to restore.
# The loading cell below adds "config" and "model" keys to each entry.
models = [
    dict(dataset="nicta", weights="results/nicta_1_epoch/model.weights"),
    dict(dataset="pubmed-20k", weights="results/pubmed-20k_1_epoch/model.weights"),
]



# Load the config and build each model, restoring its trained parameters

In [4]:
# Build one model per entry of `models` and store it back on that entry.
for d in models:
    # A new, empty ArgumentParser is created for every entry and handed to Config
    # (presumably Config registers its own options on it -- confirm in model.config).
    parser = argparse.ArgumentParser()
    # The entry's dataset name is passed to Config; `args=dict()` supplies an empty
    # mapping (presumably "no command-line overrides" -- TODO confirm).
    config = Config(parser, log_config=False, dataset = d["dataset"], args=dict())
    model = HANNModel(config)
    # build() is called before restore_session(), which receives the entry's
    # weights path.
    model.build()
    model.restore_session(d["weights"])
    # Keep both objects on the entry so later cells can reach them through
    # m["config"] and m["model"].
    d["config"] = config
    d["model"] = model
    

INFO:tensorflow:Scale of 0 disables regularizer.


Scale of 0 disables regularizer.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Initializing tf session
Reloading the latest trained model...


INFO:tensorflow:Restoring parameters from results/nicta_1_epoch/model.weights


Restoring parameters from results/nicta_1_epoch/model.weights


INFO:tensorflow:Scale of 0 disables regularizer.


Scale of 0 disables regularizer.
Initializing tf session
Reloading the latest trained model...


INFO:tensorflow:Restoring parameters from results/pubmed-20k_1_epoch/model.weights


Restoring parameters from results/pubmed-20k_1_epoch/model.weights


In [5]:
def split_to_sentences(text):
    """Break `text` into sentences with the module-level `tokenizer`.

    Returns a list holding one string per sentence reported by
    `tokenizer.tokenize`, each with leading and trailing whitespace removed.
    """
    raw_sentences = tokenizer.tokenize(text)
    return [sentence.strip() for sentence in raw_sentences]



In [6]:
def split_sentences_to_words(sentences, config):
    """Tokenize each sentence on whitespace and convert every token.

    Each token is passed through `config.processing_word` (per the notebook's
    own description, this maps a word to its index for the word embeddings).

    Returns a list parallel to `sentences`; element i is the list of converted
    tokens of sentence i (an empty list for a blank sentence).
    """
    return [
        [config.processing_word(token) for token in sentence.split()]
        for sentence in sentences
    ]

In [7]:
def classify(sentences):
    """Predict a label for every sentence with each entry of the global `models`.

    Returns a list with one element per model, in `models` order. Each element
    is the flat list of upper-cased label strings returned for the sentences.
    """
    predictions_per_model = []

    for m in models:
        word_ids = split_sentences_to_words(sentences, m["config"])
        # The sentences go in wrapped in a one-element list (a batch of one);
        # the second value returned by predict_batch is not used.
        batch_labels, _ = m["model"].predict_batch([word_ids])
        # vocab_tags maps label string -> id; invert it to decode predictions.
        id_to_tag = {tag_id: tag for tag, tag_id in m["config"].vocab_tags.items()}
        # Flatten the per-batch label lists and decode each id to its name.
        predictions_per_model.append(
            [id_to_tag[label_id].upper() for labels in batch_labels for label_id in labels]
        )

    return predictions_per_model

In [8]:
def classify_and_print_result(text):
    """Split `text` into sentences, classify them, and print the outcome.

    The first printed line joins the dataset names of the global `models` with
    "/". Each following line has the form "<label>/<label>: <sentence>", with
    one label per model in the same order as the header. Returns None.
    """
    sentences = split_to_sentences(text)
    labels_by_model = classify(sentences)

    # Header row: dataset names, in the same order as the label columns.
    print("/".join([entry["dataset"] for entry in models]))

    for position, sentence in enumerate(sentences):
        # Collect the label each model assigned to the sentence at this position.
        row_labels = [labels[position] for labels in labels_by_model]
        print("/".join(row_labels) + ": " + sentence)

        

# Possible sentence labels of each model

In [9]:
# List, for every loaded model, the sentence labels it can predict.
for m in models:
    # Header line: the dataset name followed by a colon.
    print(m["dataset"] + ":")
    # One upper-cased label per line, in the key order of vocab_tags.
    for tag in m["config"].vocab_tags.keys():
        print(tag.upper())
    # Blank line between the models' label lists.
    print()
    


nicta:
OUTCOME
BACKGROUND
INTERVENTION
STUDY
OTHER
POPULATION

pubmed-20k:
OBJECTIVE
BACKGROUND
CONCLUSIONS
METHODS
RESULTS



# Now play around: classify sentences in abstracts

In [10]:
# Sample 1: an abstract on adolescent pregnancy, written one sentence per line
# with punctuation already separated from the words by spaces.
# NOTE(review): two lines of the text end mid-word ("... , lim" and
# "... developing count"); the sample looks truncated -- confirm against the
# original abstract.
abstract = """
Although adolescents in Reunion have a pregnancy rate of 77/1000 , twice that of France , there are no published studies on the themes of adolescent pregnancy and parenthood .
105 women under 18 including 20 aged 14 who gave birth in 1 maternity center were analyzed for sociofamilial characteristics and perinatal complications .
The educational level of the adolescent mothers was low and 47 % had stopped school before the pregnancy .
1/3 expected to find employment or return to work after delivery .
19 % were already in consensual unions at the time of pregnancy and 34 % had entered such unions after delivery .
It was the 2nd pregnancy for 11.5 % .
Only 8.5 % used contraception .
The age difference with the partner was over 5 years for more than half .
68 % had known the father for more than 1 year .
At the time of pregnancy , 35 % lived in families with 2 parents and 36 % in households headed by their mothers .
Heads of 50 % of all their households and 80 % of those headed by the mothers were unemployed .
44 % had more than 6 siblings .
20 % had 1 or more sisters who were adolescent mothers .
Comparison with a group of 62 adolescents seeking abortions at the same center indicated several factors that appeared to encourage continuation of the pregnancy ; including being below age level in school , absence of plans for education or employment , lim
Risk factors at the level of the family included absence of the father , maternal tolerance , poverty , and sisters who were single mothers .
27 % states retrospectively that they would have preferred to terminate the pregnancy , but all stated they were satisfied after the delivery .
41 % had problems in pregnancy or delivery .
10.5 % had toxemia and 23 % had low birth weight babies .
5.7 % had caesareans , mainly for eclampsia and acute fetal distress .
There was 1 stillbirth and 2 cases of cogenital malformations .
14 newborns were transferred were transferred to the neonatology unit for various problems , including 6 weighing under 2000 g.
Compared to the general population , the rate of fetal growth retardation was 3 times higher and that of neonatal pathology requiring transfer to the neonatology unit was 2 times higher among adolescent mothers .
Prenatal care was inadequate in many cases .
35 % of 14-year-old mothers had 3 or fewer prenatal visits.adolescent pregnancy -- complicationsadolescentsadolescents femaleafricaafrica south of the saharaage factorscontraceptioncontraceptive usagedelivery of health caredemographic factorsdeveloping count
"""
# Prints a header with the dataset names, then one
# "<label>/<label>: <sentence>" line per detected sentence.
classify_and_print_result(abstract)

nicta/pubmed-20k
BACKGROUND/BACKGROUND: Although adolescents in Reunion have a pregnancy rate of 77/1000 , twice that of France , there are no published studies on the themes of adolescent pregnancy and parenthood .
OTHER/METHODS: 105 women under 18 including 20 aged 14 who gave birth in 1 maternity center were analyzed for sociofamilial characteristics and perinatal complications .
OUTCOME/RESULTS: The educational level of the adolescent mothers was low and 47 % had stopped school before the pregnancy .
OUTCOME/RESULTS: 1/3 expected to find employment or return to work after delivery .
OUTCOME/RESULTS: 19 % were already in consensual unions at the time of pregnancy and 34 % had entered such unions after delivery .
OUTCOME/RESULTS: It was the 2nd pregnancy for 11.5 % .
OUTCOME/RESULTS: Only 8.5 % used contraception .
OUTCOME/RESULTS: The age difference with the partner was over 5 years for more than half .
OUTCOME/RESULTS: 68 % had known the father for more than 1 year .
OUTCOME/RESULT

In [11]:
# Sample 2: an abstract about scholarly knowledge graphs, i.e. text from
# outside the medical domain suggested by the dataset names.
# NOTE(review): here punctuation is attached to the words ("decades,",
# "document-based."). split_sentences_to_words splits on whitespace only, so
# such tokens reach config.processing_word with the punctuation still attached
# -- confirm this matches the tokenization the models were trained with.
abstract = """
Despite improved digital access to scientific publications in the last decades, the fundamental principles of scholarly communication remain unchanged and continue to be largely document-based. 
The document-oriented workflows in science publication have reached the limits of adequacy as highlighted by recent discussions on the increasing proliferation of scientific literature, the deficiency of peer-review and the reproducibility crisis. 
In this article, we present first steps towards representing scholarly knowledge semantically with knowledge graphs.
We expand the currently popular RDF graph-based knowledge representation formalism to capture annotations, such as provenance information and describe how to manage such knowledge in a graph data base. 
We report on the results of a first experimental evaluation of the concept and its implementations with the participants of an international conference.
"""
# Prints a header with the dataset names, then one
# "<label>/<label>: <sentence>" line per detected sentence.
classify_and_print_result(abstract)

nicta/pubmed-20k
BACKGROUND/BACKGROUND: Despite improved digital access to scientific publications in the last decades, the fundamental principles of scholarly communication remain unchanged and continue to be largely document-based.
BACKGROUND/BACKGROUND: The document-oriented workflows in science publication have reached the limits of adequacy as highlighted by recent discussions on the increasing proliferation of scientific literature, the deficiency of peer-review and the reproducibility crisis.
OUTCOME/BACKGROUND: In this article, we present first steps towards representing scholarly knowledge semantically with knowledge graphs.
BACKGROUND/BACKGROUND: We expand the currently popular RDF graph-based knowledge representation formalism to capture annotations, such as provenance information and describe how to manage such knowledge in a graph data base.
OUTCOME/BACKGROUND: We report on the results of a first experimental evaluation of the concept and its implementations with the parti

In [12]:
# Sample 3: an abstract about interpreting observational data, given as one
# single line, so sentence boundaries come entirely from the sentence splitter.
# NOTE(review): the text contains typographic dashes and curly quotes, and its
# punctuation is attached to the words; tokens are split on whitespace only --
# confirm the models' vocabularies handle such tokens.
abstract = """
Interpreting observational data is a fundamental task in the sciences, specifically in earth and environmental science where observational data are increasingly acquired, curated, and published systematically by environmental research infrastructures. Typically subject to substantial processing, observational data are used by research communities, their research groups and individual scientists, who interpret such primary data for their meaning in the context of research investigations. The result of interpretation is information—meaningful secondary or derived data—about the observed environment. Research infrastructures and research communities are thus essential to evolving uninterpreted observational data to information. In digital form, the classical bearer of information are the commonly known “(elaborated) data products,” for instance maps. In such form, meaning is generally implicit e.g., in map colour coding, and thus largely inaccessible to machines. The systematic acquisition, curation, possible publishing and further processing of information gained in observational data interpretation—as machine readable data and their machine readable meaning—is not common practice among environmental research infrastructures. For a use case in aerosol science, we elucidate these problems and present a Jupyter based prototype infrastructure that exploits a machine learning approach to interpretation and could support a research community in interpreting observational data and, more importantly, in curating and further using resulting information about a studied natural phenomenon.
"""
# Prints a header with the dataset names, then one
# "<label>/<label>: <sentence>" line per detected sentence.
classify_and_print_result(abstract)

nicta/pubmed-20k
BACKGROUND/BACKGROUND: Interpreting observational data is a fundamental task in the sciences, specifically in earth and environmental science where observational data are increasingly acquired, curated, and published systematically by environmental research infrastructures.
OTHER/BACKGROUND: Typically subject to substantial processing, observational data are used by research communities, their research groups and individual scientists, who interpret such primary data for their meaning in the context of research investigations.
OUTCOME/RESULTS: The result of interpretation is information—meaningful secondary or derived data—about the observed environment.
BACKGROUND/RESULTS: Research infrastructures and research communities are thus essential to evolving uninterpreted observational data to information.
OUTCOME/RESULTS: In digital form, the classical bearer of information are the commonly known “(elaborated) data products,” for instance maps.
BACKGROUND/RESULTS: In such 