<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd 
import numpy as np 
from wasabi import msg
from os import listdir
from tqdm import tqdm 
from os.path import isfile, join


# Allow up to 5000 rows when a DataFrame is displayed.
# The full option key is spelled out: an abbreviated key such as 'max_row' is
# matched as a pattern against every registered option and raises OptionError
# on pandas versions where more than one option matches it
# (e.g. 'display.max_rows' and 'styler.render.max_rows').
pd.set_option('display.max_rows', 5000)

In [2]:
# ---- input parameters ------------------------------------------------------

# Directory that holds the original annotation CSV files.
INPUT_DIR = './Datasets/orig/'

# Rows whose label is one of these values are discarded.
REMOVE_LABEL = [
    'university',
    'person',
    'researcher',
    'country',
    'location',
    'scientist',
]

# True: drop single tokens and keep only multi-word expressions.
ONLY_MWE = False

# Plain-text term lists (one term per line) concatenated to the CSV terms.
LIST_TO_CONCATE = [
    './wikidump_terms.txt',
    './CATEGORY_ESSENTIAL_CORE_FILLERS_V1.txt',
    './INVENTION_PATTERNS_V8.txt',
]

# JSON file with the Wikipedia (redirect) page title and summary per term.
WIKI_TITLE_DICT = './title_summary.json'

# True: query Wikipedia for the terms that have no title yet.
WIKI_TITLE_SEARCH = True

In [3]:
# Read every file found in INPUT_DIR, discard the rows that are not usable
# terms, and gather the kept rows in `dfs` and the discarded rows in `remove_dfs`.
files = [join(INPUT_DIR, f) for f in listdir(INPUT_DIR) if isfile(join(INPUT_DIR, f))]

_OUTPUT_COLUMNS = ['term', 'annotation', 'df']
_BANNED_TOKENS = frozenset(['this', 'these', 'the', 'those', 'that'])
_MAX_TOKENS = 6  # terms with more space-separated tokens than this are discarded


def _is_not_a_term(term, label):
    """Return True when a row has to be discarded.

    A row is discarded when its label is listed in REMOVE_LABEL, when the term
    has more than _MAX_TOKENS space-separated tokens, or when one of its tokens
    is in _BANNED_TOKENS (the comparison is case-sensitive).
    """
    tokens = term.split(' ')
    return (label in REMOVE_LABEL
            or len(tokens) > _MAX_TOKENS
            or not _BANNED_TOKENS.isdisjoint(tokens))


def _split_on_terms(frame, terms):
    """Return (kept, rejected); rejected holds every row of frame whose `term` is in terms."""
    rejected = frame[frame.term.isin(terms)]
    return frame.drop(rejected.index, axis=0), rejected


def _combine(frames):
    """Concatenate the collected frames once, with the expected columns first.

    DataFrame.append was removed in pandas 2.0, and growing a frame inside a
    loop copies it on every iteration, so the frames are collected in a list
    and concatenated here. Original row indices are kept (no re-indexing).
    """
    if not frames:
        return pd.DataFrame([], columns=_OUTPUT_COLUMNS)
    combined = pd.concat(frames, sort=False)
    extra_columns = [col for col in combined.columns if col not in _OUTPUT_COLUMNS]
    return combined.reindex(columns=_OUTPUT_COLUMNS + extra_columns)


if not ONLY_MWE:
    # Loaded once instead of once per file.
    # Requires the NLTK stop-word corpus: nltk.download('stopwords')
    from nltk.corpus import stopwords
    sw = stopwords.words("english")
    _stop_words = set(sw)  # set membership instead of scanning the list per term

kept_frames = []
removed_frames = []

for file in files:
    msg.text(f"Preprocessing {file}...")
    df = pd.read_csv(file, na_filter=False)

    if df.shape[1] != 3:
        raise ValueError(f"{file} is not in a good format. ['term', 'label', 'dataset']")

    # check annotation type and remove those that are not a term
    # (first column = term, second column = label)
    to_remove = [term for term, label in zip(df.iloc[:, 0], df.iloc[:, 1])
                 if _is_not_a_term(term, label)]

    if not to_remove:
        msg.good("There is no terms to remove.")
    else:
        msg.info(f"Removing terms from {file}:")

        df, remove_df = _split_on_terms(df, to_remove)
        removed_frames.append(remove_df)

        print('\n'.join(remove_df.term.values))

    # whether to keep only Multi-word expressions
    if ONLY_MWE:
        msg.info(f"Removing single tokens from {file}:")
        short_terms = [term for term in df.term.values if ' ' not in term]
    else:
        msg.info("Removing single words that are less than 3 in length and that are stop words:")
        short_terms = [term for term in df.term.values
                       if (' ' not in term and len(term) < 3) or term in _stop_words]

    df, remove_df = _split_on_terms(df, short_terms)
    if not remove_df.empty:
        removed_frames.append(remove_df)

    print('\n'.join(short_terms))

    if not df.empty:
        kept_frames.append(df)
    print('\n')

dfs = _combine(kept_frames)
remove_dfs = _combine(removed_frames)

# NOTE(review): keep=False discards *every* row of a term that occurs more than
# once (e.g. a term present in two datasets disappears entirely) — confirm this
# is intended rather than keep='first'.
dfs = dfs.drop_duplicates(subset="term", keep=False)

Preprocessing ./Datasets/orig/annotation_df_scienceie.csv...
[38;5;4mℹ Removing terms from ./Datasets/orig/annotation_df_scienceie.csv:[0m
measure this cumulative thermal effect
general equations of state of the form
partition functions of the form
investigate the influence of the particle shape on interacting particles flowing in a horizontal turbulent channel flow
determining the behaviour of the particles in this horizontal gas–solid channel flow
permit direct observation of the dynamics of the binding interactions
predict the relative stability of the defect energies
morl and the arima potential models
morl, along with the grimes shell potential model
reproduce the activation energy of oxygen migration
the dynamics of various physical phenomena
movement of pendulums, planets, or water waves
study the symmetry properties of a system
express all the dynamics of a system
improving the pore structure and specific surface area of nanocarbons
introducing charge variations in the materi

[38;5;4mℹ Removing single words that are less than 3 in length and that are
stop words:[0m
d2
h2
n2
co
ls
ωf
ω
cl
ci
ti
y
o
α
b
rm
ψ
δψ
xc
ci
fm
fe
dg
tm
pb
bb
he
sm
sm
mo
fe
ca
sr
ba
ga
gp
h
pπ
λ
σ
π
ρ
ω
η
k
k∗
ra
u
al
km
a5
φ
b
δn
lr
mn
d2
xe
vu
w
d
c
nb
oh
p
pg
ph
lm
aa
na
px
cg
fr
vo
oi
md
ni
sv
mf
sa
π±
k±
p̄
η′
ϕ
b0
sn
zn
ag
ct
ga
o2
mc
sa
ℓ
θ+
s̄
bn
ca
pe
gt
pl
pl
us
x
ac
th
lr
md
r
t
mg
mx
nl
ns
rh
ad
dp
κ
ψ′
pc
sb
ρλ
ρ
q
h0
er
e
ε∞
se
pt


Preprocessing ./Datasets/orig/annotation_df_ncbi.csv...
[38;5;4mℹ Removing terms from ./Datasets/orig/annotation_df_ncbi.csv:[0m
demyelination of the central nervous system
prostatic , pancreas , skin , and lung cancer
developmental abnormalities of the eye
von hippel - lindau ( vhl ) disease
parenchymal tumours of the kidney
glucose - 6 - phosphate dehydrogenase ( g6pd ) deficiency
glucose - 6 - phosphate dehydrogenase deficiency
combined subtotal deficiencies of c6 and c7
combined subtotal deficiency of c6 and c7
complete hypoxanthine 

[38;5;4mℹ Removing terms from ./Datasets/orig/annotation_df_scientific.csv:[0m
Brion James
Leon Kowalski
Joanna Cassidy
Zhora
Segmenting the text into topics
Indiana University
Stanford University
Ray Kurzweil
Hans Moravec
Kevin Kelly
Ralph Merkle
Bill Joy
Frank Drake
John Henry Holland
John Koza
Lee Sedol
Scheinman
Banerjee
Lavie
Ian Niles
Adam Pease
Kiichiro Toyoda
Sakichi Toyoda
Webber
Association for the Advancement of Artificial Intelligence
India
Pakistan
Rajabazar Science College
University of Calcutta
Indian Statistical Institute
Diploma of the Imperial College
Imperial College
University of London
Expo II
Mamie Van Doren
Pinky Lee
Ulf Grenander
reading machines for the blind
Marvin Minsky
Seymour Papert
Sweden
Switzerland
Germany
Italy
Harrison Ford
Rutger Hauer
Sean Young
Philip K. Dick
Do Androids Dream of Electric Sheep ?
universities of Newcastle
Surrey
Tel Aviv University
Simon Fraser University
University of Tromsø
European Chapter of the Association for Computational 

[38;5;4mℹ Removing terms from ./Datasets/orig/annotation_df_bc5cdr_.csv:[0m
reduced the supine systolic and diastolic blood pressures
calcification of the artery
disorders of the central nervous system
structural lesions of the brain
pain and visual disturbance in the ipsilateral eye
increases in dural and cortical blood flow
ventricular septal ( vsd ) and midline ( md ) defects
injury to the brain
decreased thymus ( p < 0 . 001 ) and bodyweights
decrease of mean arterial blood pressure ( mbp ) and heart rate ( hr )
injury to different regions of the kidney
cirrhosis of the liver
myoclonic , atonic , and absence seizures
myoclonic , atypical absence and / or atonic ( minor motor ) seizures
veno - occlusive disease of the liver
non - insulin - dependent diabetes mellitus
left ventricular end - diastolic volume falls
pathology at both the neuromuscular junction
cerebral venous sinus and internal carotid artery thrombosis
occlusion of the left internal carotid artery
cerebral artery and

In [4]:
# Add the terms of every plain-text list in LIST_TO_CONCATE that are not yet in
# `dfs`. Terms of the first list are labelled TECH/wikidump, those of the other
# lists JURY/patterns.
if LIST_TO_CONCATE:
    for position, list_file in enumerate(LIST_TO_CONCATE):
        with open(list_file) as f:
            stripped_lines = (line.strip() for line in f)
            # dict.fromkeys removes duplicates while keeping the file order, so
            # the resulting row order is the same on every run (set ordering of
            # strings is not); blank lines are skipped so that no empty term is
            # added to the matching list
            candidates = list(dict.fromkeys(term for term in stripped_lines if term))

        # remove duplicate
        known_terms = set(dfs.term.values)
        terms = [term for term in candidates if term not in known_terms]

        df = pd.DataFrame(terms, columns=['term'])
        if position == 0:
            defaults = {'annotation': 'TECH', 'df': 'wikidump'}
        else:
            defaults = {'annotation': 'JURY', 'df': 'patterns'}

        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        # The new rows only carry `term`, so fillna supplies their label/source.
        dfs = pd.concat([dfs, df], sort=False).fillna(value=defaults)

        dfs = dfs.drop_duplicates(subset="term", keep=False)

In [5]:
import json

# Cache mapping term -> {'title': ..., 'summary': ...}.
# `json`, `wiki_title_dict` and `map_wiki` are defined unconditionally because
# the following cells (search, save dictionary, save data) use them whatever
# the value of WIKI_TITLE_DICT is.
wiki_title_dict = {}
if WIKI_TITLE_DICT:
    if isfile(WIKI_TITLE_DICT):
        with open(WIKI_TITLE_DICT, 'r') as json_file:
            wiki_title_dict = json.load(json_file)
    else:
        # First run (or wrong path): start from an empty cache instead of failing
        msg.warn(f"{WIKI_TITLE_DICT} not found, starting with an empty dictionary.")


def map_wiki(var, term):
    """Return the cached field `var` ('title' or 'summary') of `term`.

    Returns None when the term is not in `wiki_title_dict` or has no such field.
    """
    try:
        return wiki_title_dict[term][var]
    except KeyError:
        return None


dfs['wiki_title'] = dfs['term'].map(lambda term: map_wiki(var='title', term=term))
dfs['wiki_summary'] = dfs['term'].map(lambda term: map_wiki(var='summary', term=term))

In [6]:
if WIKI_TITLE_SEARCH:
    # search for wiki page title that are not in the original dict (756419)
    import urllib
    import time
    import requests
    import wikipedia
    import warnings
    from bs4 import BeautifulSoup

    # Silences every warning for the rest of the session
    # (presumably the HTML-parser warnings emitted during lookups — TODO confirm)
    warnings.filterwarnings('ignore')

    def find_wiki_title(term):
        """Return the first Wikipedia search hit for `term`, or None when there is no hit."""
        hits = wikipedia.search(term)
        return hits[0] if hits else None

    def find_wiki_summary(term):
        """Return the Wikipedia summary of page `term`, or None when the lookup fails."""
        try:
            return wikipedia.summary(term)
        # The exception types must be given as a tuple: `A or B` evaluates to A
        # alone, so the previous clause never caught KeyError.
        except (wikipedia.exceptions.WikipediaException, KeyError):
            return None

    # add wikititle and summary for each term that has no cached title yet;
    # results are stored in wiki_title_dict only (the columns of dfs are
    # refreshed from it when the data is saved)
    for term in tqdm(dfs[dfs.wiki_title.isna()].term.values):
        wiki_title = find_wiki_title(term)
        if wiki_title:
            wiki_summary = find_wiki_summary(wiki_title)
            wiki_title_dict[term] = {'title': wiki_title, 'summary': wiki_summary}

  1%|          | 315/45256 [01:59<4:43:22,  2.64it/s]


KeyboardInterrupt: 

In [7]:
# Write the term -> Wikipedia title/summary dictionary back to its JSON file,
# overwriting the previous content.
with open(WIKI_TITLE_DICT, mode="w") as cache_file:
    json.dump(obj=wiki_title_dict, fp=cache_file, indent=4)

In [8]:
# Refresh the Wikipedia columns from the current dictionary, then write the
# kept terms and the removed terms as tab-separated files.
_WIKI_COLUMNS = (('wiki_title', 'title'), ('wiki_summary', 'summary'))
for column, field in _WIKI_COLUMNS:
    # `field=field` binds the current value for this column's lambda
    dfs[column] = dfs['term'].map(lambda term, field=field: map_wiki(var=field, term=term))

dfs.to_csv(r'./matching_list.csv', sep='\t', header=True, index=False)

remove_dfs.to_csv(r'./removed_terms.csv', sep='\t', header=True, index=False)

In [None]:
# 1. read all original csv 
# 2. check their types 
# 3. return type list 
# 4. whether to keep MWEs and remove single tokens 
# 5. if other list txt file, we can concatenate it 
# 6. find the Wikipedia page title with redirects = True

# 7. if not found for the term 
# ==> pos NOUN
# ==> from left to right

# 45258