In [8]:
import pandas as pd
#from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 

In [None]:
# Uncomment and run this cell to install wordcloud
# %pip install wordcloud

In [9]:
# Load the abstracts table and keep a handle on its raw keywords column.
data = pd.read_csv('abstract_data.csv')
keywords = data['keywords']

In [10]:
# Remove the square brackets from every raw keyword string.
clean_items = [raw.replace('[', '').replace(']', '') for raw in keywords]

# Store the bracket-free strings back in the keywords column.
data.keywords = clean_items

In [11]:
def clean_text(series):
    """Split a comma-separated keyword string into normalised keywords.

    Keyword arguments:
    series -- a single comma-separated string

    Returns a list with one entry per comma-separated piece. Each piece has
    its single quotes removed, is lower-cased, and has leading/trailing
    spaces stripped. Underscores are left untouched.
    """
    return [
        piece.replace("'", '').lower().strip(' ')
        for piece in series.split(',')
    ]

In [12]:
# Turn each comma-separated keyword string into a list of normalised keywords.
data.keywords = data['keywords'].apply(clean_text)

In [13]:
# Keywords = the noun entities taken from the merged noun_chunks
# Here we clean them and then tokenize them

def clean_keyword(text):
    """Replace underscores with spaces in every keyword.

    Keyword arguments:
    text -- an iterable of keyword strings

    Returns a new list, in the same order as the input, where each keyword
    has every '_' replaced by a single space. The input is not modified.
    """
    return [word.replace('_', ' ') for word in text]

def tokenize(text):
    """Break every phrase into whitespace-separated tokens.

    Keyword arguments:
    text -- an iterable of strings

    Returns one flat list containing the tokens of all phrases, in order.
    Phrases that are empty or whitespace-only contribute no tokens.
    """
    tokens = []
    for phrase in text:
        tokens.extend(phrase.split())
    return tokens


In [14]:
# Replace underscores with spaces in each row's keyword list.
data['keywords_clean'] = data['keywords'].apply(clean_keyword)
# Split every cleaned keyword phrase into whitespace-separated tokens.
data['tokens'] = data['keywords_clean'].apply(tokenize)
# Collapse each row's tokens to a set (the column is named 'tuple_tokens' but holds sets).
data['tuple_tokens'] = data['tokens'].apply(set)

In [None]:
# We start with keywords - which are nouns that were extracted from noun-chunks

# Print the first five keyword rows, each followed by a blank line.
for row in data['keywords'][:5]:
    print(row, '\n')

In [16]:
# We then tokenize those keywords

# Print the first five token rows, each followed by a blank line.
for row in data['tokens'][:5]:
    print(row, '\n')

['a', 'liposome-based', 'anti-cancer', 'vaccine'] 

['a', 'scoping', 'review', 'the', 'literature', 'nurses’', 'dual', 'practice', 'the', 'objective', 'generating', 'hypotheses', 'its', 'nature', 'consequences', 'a', 'research', 'agenda', 'the', 'phenomenon'] 

['the', 'expression', 'sialic', 'acid', 'o-acetylesterases', 'a', 'broad', 'specificity', 'competition', 'the', 'gut', 'microbiota', 'nutrients', 'ehec', 'colonization', 'the', 'human', 'large', 'intestine'] 

['ra', 'dcs', 'an', 'immature', 'mucosal', 'phenotype', 'the', 'small', 'intestinal', 'submucosa'] 

['fiv', 'infection', 'dysregulation', 'trophoblast', 'immunomodulator', 'expression', 'aberrant', 'expression', 'these', 'molecules', 'inflammation', 'compromise', 'pregnancy'] 



In [18]:
# These are all the functions that are run on the tokens sets to clean up the tokens column

def drop_double_char(ents):
    """Remove every entity shorter than three characters.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set; the input set is left unchanged.
    """
    too_short = set()
    for ent in ents:
        if len(ent) < 3:
            too_short.add(ent)
    return ents - too_short

def keep_alpha(ents):
    """Keep only entities made up of ASCII letters, hyphens, and spaces.

    Keyword arguments:
    ents -- a set of entities

    An entity containing any other character (digits, punctuation, accented
    or other non-ASCII characters) is removed. Returns a new set; the input
    set is left unchanged.
    """
    allowed = set('-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
    rejected = set()
    for ent in ents:
        # One disallowed character is enough to reject the whole entity.
        if any(ch not in allowed for ch in ent):
            rejected.add(ent)
    return ents - rejected

def drop_single_char_nps(ents):
    """Strip one-character words out of each entity.

    Keyword arguments:
    ents -- a set of entities

    Each entity is split on single spaces, pieces of exactly one character
    are discarded, and the remaining pieces are re-joined with single spaces.
    An entity made up only of one-character pieces becomes the empty string.
    Returns a new set.
    """
    cleaned = set()
    for ent in ents:
        kept_parts = [part for part in ent.split(' ') if len(part) != 1]
        cleaned.add(' '.join(kept_parts))
    return cleaned

# Frequent-word sets already read from disk, keyed by CSV path.
_FREQ_WORDS_CACHE = {}


def _load_freq_words(path):
    """Return the frequent words stored in the CSV at `path`, reading the file once per path."""
    if path not in _FREQ_WORDS_CACHE:
        # NOTE(review): .iloc[1:] skips the first data row of the 'Word'
        # column -- confirm that this first row really should be excluded.
        words = pd.read_csv(path)['Word'].iloc[1:]
        _FREQ_WORDS_CACHE[path] = frozenset(words)
    return _FREQ_WORDS_CACHE[path]


def remove_freq_words(entities, freq_words_path='datasets_freq_words.csv'):
    """Drop any entities that appear in the frequent-word list.

    The list is the 'Word' column of a CSV file (presumably the 5000 most
    common words in the English language -- verify against the data file).
    The file is read once per path and then served from an in-memory cache,
    so calling this function for every row of a data frame stays cheap.

    Keyword arguments:
    entities -- a set (or any iterable) of entities
    freq_words_path -- path of the CSV holding the frequent words

    Returns a new set; the input is not modified.
    """
    return set(entities) - _load_freq_words(freq_words_path)

def add_clean_ents(df, funcs=()):
    """Create new column in data frame with cleaned entities.

    Keyword arguments:
    df -- a dataframe object with a 'tuple_tokens' column; modified in place
    funcs -- an iterable of heuristic functions, applied in the given order
             to every row's entities

    The result is stored in df['tuple_tokens_clean']; nothing is returned.
    With no functions the new column simply mirrors 'tuple_tokens'.
    """
    col = 'tuple_tokens_clean'
    # The new column starts out referencing the same per-row objects as
    # 'tuple_tokens', so a heuristic that mutates its argument in place would
    # also alter the source column.
    df[col] = df['tuple_tokens']
    for f in funcs:
        df[col] = df[col].apply(f)

In [19]:
# We run all the functions through 'add clean ents function'

# Heuristics are applied to each row's token set in the order listed here.
functions = [
    drop_double_char,
    keep_alpha,
    drop_single_char_nps,
    remove_freq_words,
]
add_clean_ents(data, functions)

In [20]:
# Taking a look at the cleaned tokens

# Print the first five cleaned token sets, each followed by a blank line.
for row in data['tuple_tokens_clean'][:5]:
    print(row, '\n')

{'liposome-based', 'anti-cancer', 'vaccine'} 

{'scoping', 'consequences', 'generating', 'dual'} 

{'ehec', 'sialic', 'o-acetylesterases', 'specificity', 'microbiota', 'nutrients', 'intestine', 'colonization'} 

{'dcs', 'mucosal', 'phenotype', 'submucosa', 'immature', 'intestinal'} 

{'fiv', 'aberrant', 'trophoblast', 'dysregulation', 'immunomodulator', 'inflammation', 'molecules'} 



In [21]:
# Put the set of sets into a list, expand the list and create one final clean set

def large_list(text):
    """Return the distinct items of an iterable as a list.

    Keyword arguments:
    text -- an iterable of hashable items (e.g. a set of token strings)

    Items keep the order in which they are first encountered; later
    duplicates are dropped.
    """
    # dict keys are unique and preserve insertion order, so this removes
    # duplicates in one linear pass.
    return list(dict.fromkeys(text))

# One list of distinct tokens per row.
data["list_clean"] = data["tuple_tokens_clean"].apply(large_list)
# Concatenate the per-row lists in row order. A comprehension runs in linear
# time and yields an empty list when the column has no rows.
aggregated_list = [word for row in data["list_clean"] for word in row]

# Collapse to the distinct tokens across all rows.
unique_tokens = set(aggregated_list)

In [22]:
# This is the flattened set that we use to create the SPARQL query.
# Leaving the bare name as the cell's last expression makes the notebook display the whole set.

unique_tokens

{'hitherto',
 'tiv',
 'neurodegeneration',
 'vaccinated',
 'tests',
 'catechin',
 'psycho-neuroendocrinological',
 'explanatory',
 'sct',
 'acquired',
 'lrti',
 'european',
 'controlled',
 'adjacent',
 'rejection',
 'cia',
 'fragmentation',
 'fluorescence',
 'prrsv',
 'ichnovirus',
 'huanan',
 'vascular',
 'adipogenesis',
 'diarrhea',
 'cytokine',
 'sars',
 'validation',
 'spatial',
 'attenuated',
 'bstvs',
 'ndv',
 'viable',
 'implications',
 'camels',
 'pcr-reactions',
 'heparan',
 'node',
 'ifitms',
 'isothermal',
 'rbct',
 'lining',
 'sporadic',
 'risks',
 'rabv',
 'permeability',
 'insults',
 'bocv',
 'methods',
 'aetiology',
 'singapore',
 'vuvuzelas',
 'ones',
 'residues',
 'parts',
 'placental',
 'subjective',
 'voles',
 'insights',
 'african',
 'gof',
 'spinal',
 'antigens',
 'causal',
 'astv',
 'large-del',
 'bovine',
 'evt',
 'spherules',
 'covmpro',
 'thymocyte',
 'lcmv',
 'dehydration',
 'antibody',
 'chb',
 'hundreds',
 'orthomyxovirus',
 'horses',
 'drug',
 'proteases',


In [25]:
def sparql_query(text):
    """Print a SPARQL WHERE clause matching labels that start with any keyword.

    Keyword arguments:
    text -- an iterable of keyword strings

    One case-insensitive REGEX test anchored at the start of ?paLabel is
    emitted per keyword. The tests are joined with '||' inside a single
    FILTER(...); when there are no keywords the FILTER is omitted. The
    clause is written to standard output and nothing is returned.

    Keywords are interpolated verbatim into the pattern, so escape them
    beforehand if they may contain quotes or regex metacharacters.
    """
    lines = [
        "WHERE {",
        "?sub meshv:preferredConcept ?pa .",
        "?pa rdfs:label ?paLabel .",
    ]
    clauses = [f"REGEX(?paLabel, '^{keyword}', 'i')" for keyword in text]
    if clauses:
        lines.append("FILTER(")
        # '||' goes between alternatives only, never after the last one.
        lines.append(" ||\n".join(clauses))
        lines.append(")")
    lines.append("}\n")
    print("\n".join(lines))

In [26]:
# Print the SPARQL WHERE clause built from every unique token.
sparql_query(unique_tokens)

WHERE {
?sub meshv:preferredConcept ?pa .
?pa rdfs:label ?paLabel .
FILTER(
REGEX(?paLabel, '^hitherto', 'i') ||
REGEX(?paLabel, '^tiv', 'i') ||
REGEX(?paLabel, '^neurodegeneration', 'i') ||
REGEX(?paLabel, '^vaccinated', 'i') ||
REGEX(?paLabel, '^tests', 'i') ||
REGEX(?paLabel, '^catechin', 'i') ||
REGEX(?paLabel, '^psycho-neuroendocrinological', 'i') ||
REGEX(?paLabel, '^explanatory', 'i') ||
REGEX(?paLabel, '^sct', 'i') ||
REGEX(?paLabel, '^acquired', 'i') ||
REGEX(?paLabel, '^lrti', 'i') ||
REGEX(?paLabel, '^european', 'i') ||
REGEX(?paLabel, '^controlled', 'i') ||
REGEX(?paLabel, '^adjacent', 'i') ||
REGEX(?paLabel, '^rejection', 'i') ||
REGEX(?paLabel, '^cia', 'i') ||
REGEX(?paLabel, '^fragmentation', 'i') ||
REGEX(?paLabel, '^fluorescence', 'i') ||
REGEX(?paLabel, '^prrsv', 'i') ||
REGEX(?paLabel, '^ichnovirus', 'i') ||
REGEX(?paLabel, '^huanan', 'i') ||
REGEX(?paLabel, '^vascular', 'i') ||
REGEX(?paLabel, '^adipogenesis', 'i') ||
REGEX(?paLabel, '^diarrhea', 'i') ||
REGEX(?paLa