In [1]:
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm') 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# nlp = spacy.load(r'C:\\Users\\Marta\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages\\en_core_web_sm\\en_core_web_sm-2.2.0')

### Create dataframe from query results. (QUERY API: https://krr.triply.cc/annadg/-/queries/Abstract-Data-Query/1)

In [None]:
data = pd.read_csv('entityQueryResults.csv')

#### Apply a pattern rule-based function that extract the sentence with a lemma of hypothesis.

In [4]:
def pattern_match(text):
    """function to find sentences that contain the lemma of hypothesis"""
    matcher = Matcher(nlp.vocab)
    # Add match ID "HypothesisIs" with no callback and one pattern
    pattern = [{'LEMMA':{"IN":["hypothesis","hypothesize","hypothesise", "hypothesized", "hypothesised"]}}]
    
    matcher.add("HypothesisIs", None, pattern)

    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        sent = span.sent
        return sent.text

In [5]:
data['pattern_match'] = data['value'].apply(pattern_match)

In [6]:
# need to get rid of this instance, otherwise the merge noun chunks does not work. Drop the index number returned here in the following cell. 
data[data['pattern_match']=="While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"].index[0]

399

In [7]:
data.drop([data.index[399]], inplace=True)

In [8]:
# creating bool series True for NaN values - as the subsequent formula will break if there are 
bool_series = pd.isnull(data["pattern_match"])  
    
# filtering data  
data[bool_series]

Unnamed: 0,paper,article_types,new_id,pm_central_id,abstract,value,abstract_entities,pattern_match
8,http://ns.inria.fr/covid19/ceda3494dad0563197a...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/ceda349...,,http://ns.inria.fr/covid19/ceda3494dad0563197a...,Summary Severe Acute Respiratory Syndrome (SAR...,https://krr.triply.cc/.well-known/genid/0ff833...,
30,http://ns.inria.fr/covid19/bdc91980735543fbf8c...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/bdc9198...,,http://ns.inria.fr/covid19/bdc91980735543fbf8c...,"Abstract For epidemic control, rapid identific...",https://krr.triply.cc/.well-known/genid/0c5288...,
32,http://ns.inria.fr/covid19/5d53384b0c11ff23069...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC7087...,PMC7087228,http://ns.inria.fr/covid19/5d53384b0c11ff23069...,Trionyx sinensis hemorrhagic syndrome virus (T...,https://krr.triply.cc/.well-known/genid/0cf5ee...,
50,http://ns.inria.fr/covid19/40a07bdce5fdd3382c1...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/40a07bd...,,http://ns.inria.fr/covid19/40a07bdce5fdd3382c1...,Abstract This study adopted a two (author: alg...,https://krr.triply.cc/.well-known/genid/0bcce4...,
53,http://ns.inria.fr/covid19/f8f464606fc9d44a2cd...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/f8f4646...,,http://ns.inria.fr/covid19/f8f464606fc9d44a2cd...,Abstract Objective To evaluate the health effe...,https://krr.triply.cc/.well-known/genid/78f8b4...,
...,...,...,...,...,...,...,...,...
881,http://ns.inria.fr/covid19/f4e399086421d7a4ee5...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/f4e3990...,,http://ns.inria.fr/covid19/f4e399086421d7a4ee5...,Abstract Suckling C57BU6 mice infected with mo...,https://krr.triply.cc/.well-known/genid/370225...,
893,http://ns.inria.fr/covid19/704854881dfad026bde...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/7048548...,,http://ns.inria.fr/covid19/704854881dfad026bde...,Background The risk of developing nosocomial i...,https://krr.triply.cc/.well-known/genid/1a1b7e...,
909,http://ns.inria.fr/covid19/0c1b5e06659621ae245...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC7108...,PMC7108410,http://ns.inria.fr/covid19/0c1b5e06659621ae245...,Abstract A new personal bioaerosol sampler has...,https://krr.triply.cc/.well-known/genid/1a842c...,
913,http://ns.inria.fr/covid19/01e3d931c0bc070ec62...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5583...,PMC5583561,http://ns.inria.fr/covid19/01e3d931c0bc070ec62...,"Medical countermeasures, including new drugs a...",https://krr.triply.cc/.well-known/genid/16156c...,


In [9]:
# Investigate abstracts as not matching lemma pattern to verify integrity of pattern

for row in data.value[881:882]:  # iterating through the rows of the object column
    print(row, '\n')

As the predominant aetiological agent of the common cold, human rhinovirus (HRV) is the leading cause of human infectious disease. Early studies showed that a monovalent formalin-inactivated HRV vaccine can be protective, and virus-neutralizing antibodies (nAb) correlated with protection. However, co-circulation of many HRV types discouraged further vaccine efforts. Here, we test the hypothesis that increasing virus input titres in polyvalent inactivated HRV vaccine may result in broad nAb responses. We show that serum nAb against many rhinovirus types can be induced by polyvalent, inactivated HRVs plus alhydrogel (alum) adjuvant. Using formulations up to 25-valent in mice and 50-valent in rhesus macaques, HRV vaccine immunogenicity was related to sufficient quantity of input antigens, and valency was not a major factor for potency or breadth of the response. Thus, we have generated a vaccine capable of inducing nAb responses to numerous and diverse HRV types. 



In [None]:
# Drop rows where 'pattern match' does not return a match
data.dropna(subset=['pattern_match'], inplace = True)

### Extract hypothesis entities

In [None]:
merge_nps = nlp.create_pipe("merge_noun_chunks")
nlp.add_pipe(merge_nps)

def merge_noun_chunks(text):
    """function to merge noun chunks in texts"""
    noun_chunks = []
    for t in nlp(text):
        noun_chunks.append(t.text)
        
    return noun_chunks

In [15]:
data['merged_noun_chunks'] = data['pattern_match'].apply(merge_noun_chunks)

In [18]:
data['merged_noun_chunks']

0      [Based, on, this hypothesis, ,, a T-cell vacci...
1      [However, ,, enhancements, made, to, EDNA, inc...
2      [In, a follow up study, ,, we, hypothesized, t...
3      [We, hypothesized, that, a combinatorial prote...
4      [We, hypothesize, that, SES, is, caused, by, a...
                             ...                        
926    [Molecular tools, enable, one, to, trace, the ...
927    [We, previously, hypothesized, that, HIV-1, co...
928    [We, hypothesised, that, healthy children, wit...
929    [We, hypothesized, that, this change, would, a...
930    [While, there, are, several hypotheses, that, ...
Name: merged_noun_chunks, Length: 838, dtype: object

In [16]:
def combine_chunks(list_of_chunks):
    """
    Input: list of strings 
    Output: new list of strings containing only the strings that were >1 word; the returned strings are connected by '_'  
    """
    for index, word in enumerate(list_of_chunks):
        if len(word.split(' ')) > 1:
            new_word = word.replace(' ', '_')
            list_of_chunks[index] = new_word
    sentence = ' '.join(list_of_chunks)
    
    return sentence

data['merged_sent'] = data['merged_noun_chunks'].apply(combine_chunks)

In [20]:
# solution from https://stackoverflow.com/questions/59993683/how-can-i-get-spacy-to-stop-splitting-both-hyphenated-numbers-and-words-into-sep 
def custom_tokenizer(nlp):
    """
    Function that keeps intra-hyphenated words as single tokens.
    """
    inf = list(nlp.Defaults.infixes)               # Default infixes
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")    # Remove the generic op between numbers or between a number and a -
    inf = tuple(inf)                               # Convert inf to tuple
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])  # Add the removed rule after subtracting (?<=[0-9])-(?=[0-9]) pattern
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x] # Remove - between letters rule
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)

nlp.tokenizer = custom_tokenizer(nlp)

In [21]:
def get_keywords(hypothesis_sentence):
    """
    Takes a sentence as input and tokenizes it. Outputs the words that have the certain POS label. 
    """    
    hypothesis_keywords = []
    doc = nlp(hypothesis_sentence)
    for tok in doc:
        if tok.pos_ == "PROPN" or tok.pos_ == "NOUN":
            hypothesis_keywords.append(tok.text)
   
    return hypothesis_keywords

In [22]:
data['hypothesis_entities'] = data['merged_sent'].apply(get_keywords)

#### Convert data into dataframe with desired columns for future steps.

In [23]:
data.drop(columns = ['abstract_entities', 'merged_noun_chunks', 'merged_sent'] , inplace=True)
data.rename(columns={"pattern_match":"hypothesis_sentence"}, inplace=True)

In [24]:
def clean_hypothesis_entities(text):
    """
    Cleans the '_' from the sentences created in previous functions.
    """
    cleaned_hypotheses = []
    for word in (text):
        word.split(' ')
        new_word = word.replace('_', ' ')
        cleaned_hypotheses.append(new_word)
    
    return cleaned_hypotheses

In [25]:
data["clean_hypothesis_entities"] = data["hypothesis_entities"].apply(clean_hypothesis_entities)

In [27]:
f = lambda x: 'entity_{}'.format(x + 1)
entity_df = pd.DataFrame(data.clean_hypothesis_entities.values.tolist(),data.index, dtype=object).fillna('').rename(columns=f)

In [28]:
data = data.reset_index(drop=True)
entity_df = entity_df.reset_index(drop=True)

In [29]:
merged_data = (pd.concat([data,entity_df],axis=1))

In [31]:
merged_data.to_csv('paper_hyp_entity_data.csv', index=False)