In [2]:
import pandas as pd
import spacy
# Load the 'en_core_web_sm' spaCy model once; the functions defined further down
# (pattern_match, merge_noun_chunks, get_keywords) all use this shared `nlp` object.
nlp = spacy.load('en_core_web_sm') 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# nlp = spacy.load(r'C:\\Users\\Marta\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages\\en_core_web_sm\\en_core_web_sm-2.2.0')

### Create dataframe from query results. (QUERY API: https://krr.triply.cc/annadg/-/queries/Abstract-Data-Query/1)

In [3]:
# Earlier input file, kept for reference:
# data = pd.read_csv('entityQueryResults.csv')
# Read the CSV file (path is relative to this notebook) into a DataFrame.
data = pd.read_csv('../sparql-queries/abstract-info-instances.csv')

#### Apply a rule-based pattern function that extracts the first sentence containing a lemma of "hypothesis".

In [4]:
def pattern_match(text):
    """Return the first sentence of ``text`` that contains a lemma of "hypothesis".

    Input: the abstract text (string). Non-string values (e.g. NaN for a missing
           abstract) and empty / whitespace-only strings are accepted and treated
           as "no match".
    Output: the text of the sentence containing the first match found by the
            matcher, or None when nothing matches.
    """
    # Guard first: nlp() cannot process non-string cells, and a blank string can
    # never contain a match, so there is no need to run the pipeline on either.
    if not isinstance(text, str) or not text.strip():
        return None

    matcher = Matcher(nlp.vocab)
    # Add match ID "HypothesisIs" with no callback and one pattern
    pattern = [{'LEMMA': {"IN": ["hypothesis", "hypothesize", "hypothesise", "hypothesized", "hypothesised"]}}]
    matcher.add("HypothesisIs", None, pattern)

    doc = nlp(text)
    matches = matcher(doc)
    if not matches:
        return None

    # Only the first match is used; later matches in the same text are ignored.
    _match_id, start, end = matches[0]
    return doc[start:end].sent.text

In [5]:
# Run pattern_match on every entry of the 'value' column and store the returned
# sentence (or None when there is no match) in a new 'pattern_match' column.
data['pattern_match'] = data['value'].apply(pattern_match)

In [6]:
# The noun-chunk merging step further down does not work on this sentence, so its row
# has to be removed. This cell looks up the row's index value; the following cell drops it.
problem_sentence = "While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"
data.index[data['pattern_match'] == problem_sentence][0]

1173

In [7]:
# Remove the row(s) whose matched sentence the noun-chunk merging step cannot handle.
# Rows are selected by the sentence text rather than by the hard-coded position 1173:
# this stays correct if the input CSV changes, and re-running the cell is harmless
# (a second run finds nothing to drop instead of deleting whatever row is now at 1173).
problem_sentence = "While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"
data.drop(data.index[data['pattern_match'] == problem_sentence], inplace=True)

In [8]:
# Boolean mask: True where 'pattern_match' is NaN/None, i.e. no hypothesis sentence was
# found. These rows need to be identified because the subsequent steps break on missing values.
bool_series = data["pattern_match"].isna()

# Display the rows without a match
data[bool_series]

Unnamed: 0,paper,article_types,new_id,pm_central_id,abstract,value,abstract_entities,pattern_match
7,http://ns.inria.fr/covid19/1eb3f3f0aafd8b2741a...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/1eb3f3f...,,http://ns.inria.fr/covid19/1eb3f3f0aafd8b2741a...,Summary Objectives To measure the spatial cont...,https://krr.triply.cc/.well-known/genid/1683bd...,
19,http://ns.inria.fr/covid19/PMC4747015,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4747...,PMC4747015,http://ns.inria.fr/covid19/PMC4747015#abstract,Information regarding effective anesthetic reg...,,
45,http://ns.inria.fr/covid19/4b278bf04e245866207...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/4b278bf...,,http://ns.inria.fr/covid19/4b278bf04e245866207...,Abstract Global re-emergence of Chikungunya vi...,https://krr.triply.cc/.well-known/genid/0e1b11...,
52,http://ns.inria.fr/covid19/84317229254324b8c2d...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC7106...,PMC7106413,http://ns.inria.fr/covid19/84317229254324b8c2d...,Summary The Coalition for Epidemic Preparednes...,https://krr.triply.cc/.well-known/genid/0000a5...,
54,http://ns.inria.fr/covid19/700cf45c09aadb04452...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/700cf45...,,http://ns.inria.fr/covid19/700cf45c09aadb04452...,"As of March 23, 2020 there have been over 354,...",,
...,...,...,...,...,...,...,...,...
1181,http://ns.inria.fr/covid19/7461fe0adbb9a865f8a...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5661...,PMC5661933,http://ns.inria.fr/covid19/7461fe0adbb9a865f8a...,BACKGROUND: The detection of wild poliovirus i...,https://krr.triply.cc/.well-known/genid/027f07...,
1188,http://ns.inria.fr/covid19/93df1925c1aa0cf7e72...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3282...,PMC3282974,http://ns.inria.fr/covid19/93df1925c1aa0cf7e72...,Kawasaki disease (KD) is a self-limited system...,,
1212,http://ns.inria.fr/covid19/PMC4187631,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4187...,PMC4187631,http://ns.inria.fr/covid19/PMC4187631#abstract,SUMMARY: The pathogenicity and clinical pertin...,,
1213,http://ns.inria.fr/covid19/8d07b32de2cd9460999...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/8d07b32...,,http://ns.inria.fr/covid19/8d07b32de2cd9460999...,A SEIR simulation model for the COVID-19 pande...,https://krr.triply.cc/.well-known/genid/04dd52...,


In [9]:
# Print the raw abstract text of selected rows to check the lemma pattern against it

for abstract in data['value'].iloc[881:882]:  # positional slice covering a single row
    print(abstract, '\n')

OBJECTIVES—To explore the hypothesis that hydrocarbon species and other air pollutants which accumulate at low and high concentrations of ozone are more directly associated with childhood wheezy episodes than ozone. METHODS—Prospective observational study over 1 year set in the Lewisham district of south east London. The daily attendance rate of children with acute wheeze at the accident and emergency department of Lewisham Hospital was related to local measurements of ozone, hydrocarbon species, nitrogen dioxide (NO(2)), sulphur dioxide (SO(2)) and small particulate matter with diameter <10µm (PM(10)). RESULTS—An inverse relation was found between the air pollutants and ozone. After seasonal and meteorological adjustment a non-linear U shaped trend was found between incidence of wheeze and ozone. The trend was significant in children <2 years of age but not in older children. In the younger age group, after adjustment for season, temperature, wind speed, and respiratory infection, the

In [10]:
# Drop rows where 'pattern_match' is missing, i.e. pattern_match returned no sentence.
# The frame is modified in place, so `data` itself shrinks.
data.dropna(subset=['pattern_match'], inplace = True)

### Extract hypothesis entities

In [11]:
# Register spaCy's built-in "merge_noun_chunks" component in the shared pipeline.
# The membership check makes this cell safe to re-run: adding a component whose name
# is already present in the pipeline raises an error.
if "merge_noun_chunks" in nlp.pipe_names:
    merge_nps = nlp.get_pipe("merge_noun_chunks")
else:
    merge_nps = nlp.create_pipe("merge_noun_chunks")
    nlp.add_pipe(merge_nps)

def merge_noun_chunks(text):
    """
    Run ``text`` through the shared ``nlp`` pipeline and return the token texts.

    Input: sentence string
    Output: list with the text of each token, in order. With the "merge_noun_chunks"
            component in the pipeline, a multi-word noun chunk comes back as one entry.
    """
    return [token.text for token in nlp(text)]

In [12]:
# Tokenize every matched sentence and keep the resulting list of token texts
# in a new 'merged_noun_chunks' column.
data['merged_noun_chunks'] = data['pattern_match'].apply(merge_noun_chunks)

In [13]:
def combine_chunks(list_of_chunks):
    """
    Join chunks into one sentence string, gluing multi-word chunks together with '_'.

    Input: list of strings (one entry per token / merged noun chunk)
    Output: a single string in which the entries are separated by one space and every
            space inside an entry is replaced by '_', e.g.
            ['the hypothesis', 'is', 'true'] -> 'the_hypothesis is true'

    The input list is not modified.
    """
    # str.replace is a no-op for entries without a space, so single-word entries
    # pass through unchanged and no explicit word-count check is needed.
    return ' '.join(chunk.replace(' ', '_') for chunk in list_of_chunks)

# Turn each list of chunks back into a single string (multi-word chunks joined by '_')
# and store it in a new 'merged_sent' column.
data['merged_sent'] = data['merged_noun_chunks'].apply(combine_chunks)

In [14]:
# solution from https://stackoverflow.com/questions/59993683/how-can-i-get-spacy-to-stop-splitting-both-hyphenated-numbers-and-words-into-sep
def custom_tokenizer(nlp):
    """
    Build a Tokenizer that keeps intra-hyphenated words and numbers as single tokens.

    Starts from the infix patterns in ``nlp.Defaults.infixes``, replaces the generic
    numeric-operator rule with variants that leave out the digit-hyphen-digit case, and
    discards the rule that splits on dashes between letters. The prefix, suffix and
    token_match settings are taken over from the current ``nlp.tokenizer``.

    Raises ValueError (from ``list.remove``) if the numeric-operator rule is not
    among the default infixes.
    """
    default_infixes = list(nlp.Defaults.infixes)
    # Take out the generic operator rule between numbers (or between a number and a '-') ...
    default_infixes.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")
    # ... and put it back without the (?<=[0-9])-(?=[0-9]) case.
    replacement_rules = [r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"]

    kept_infixes = []
    for rule in default_infixes + replacement_rules:
        # Skip the rule that splits on '-' (and similar dashes) between letters
        if '-|–|—|--|---|——|~' in rule:
            continue
        kept_infixes.append(rule)

    infix_re = compile_infix_regex(kept_infixes)

    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# Replace the shared pipeline's tokenizer with the hyphen-preserving one built above;
# every later nlp(...) call uses it.
nlp.tokenizer = custom_tokenizer(nlp)

In [15]:
def get_keywords(hypothesis_sentence):
    """
    Collect the words of a sentence that are tagged as nouns or proper nouns.

    Input: sentence string; it is processed with the shared ``nlp`` pipeline
    Output: list with the text of every token whose POS label is "NOUN" or "PROPN",
            in sentence order
    """
    wanted_pos = ("PROPN", "NOUN")
    return [token.text for token in nlp(hypothesis_sentence) if token.pos_ in wanted_pos]

In [16]:
# Extract the noun / proper-noun tokens from each underscore-joined sentence
# and store the resulting list in a new 'hypothesis_entities' column.
data['hypothesis_entities'] = data['merged_sent'].apply(get_keywords)

#### Convert data into dataframe with desired columns for future steps.

In [17]:
# Remove the original entity column and the two intermediate helper columns (in place).
data.drop(columns = ['abstract_entities', 'merged_noun_chunks', 'merged_sent'] , inplace=True)
# Rename the matched-sentence column to 'hypothesis_sentence' (in place).
data.rename(columns={"pattern_match":"hypothesis_sentence"}, inplace=True)

In [18]:
def clean_hypothesis_entities(text):
    """
    Turn the '_' separators added by the earlier chunk-joining step back into spaces.

    Input: iterable of strings (despite the parameter name, a list of entity strings,
           not a single text)
    Output: new list with every '_' in each string replaced by a space

    Note: every underscore is replaced, including any that were already present in the
    original text.
    """
    return [word.replace('_', ' ') for word in text]

In [19]:
# Store the entity lists with underscores turned back into spaces
# in a new 'clean_hypothesis_entities' column.
data["clean_hypothesis_entities"] = data["hypothesis_entities"].apply(clean_hypothesis_entities)

In [20]:
def f(x):
    """Map a 0-based column position to its 1-based 'entity_N' column name."""
    return 'entity_{}'.format(x + 1)


# Spread each row's entity list over separate columns (entity_1, entity_2, ...);
# rows with fewer entities than the longest list get '' in the remaining columns.
entity_lists = data.clean_hypothesis_entities.values.tolist()
entity_df = (
    pd.DataFrame(entity_lists, data.index, dtype=object)
    .fillna('')
    .rename(columns=f)
)

In [21]:
# Discard the existing index on both frames and renumber the rows from 0.
data = data.reset_index(drop=True)
entity_df = entity_df.reset_index(drop=True)

In [22]:
# Place the entity columns to the right of the paper/hypothesis columns.
frames_side_by_side = [data, entity_df]
merged_data = pd.concat(frames_side_by_side, axis=1)

In [23]:
# Write the combined table to a CSV file in the working directory, without the index column.
merged_data.to_csv('paper_hyp_entity_data.csv', index=False)