<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd 
import numpy as np 
from wasabi import msg
from os import listdir
from tqdm import tqdm 
from os.path import isfile, join


# Show up to 5000 rows when displaying a DataFrame. The full option key is
# used because abbreviated keys are resolved by pattern matching and raise
# OptionError when more than one option matches.
pd.set_option('display.max_rows', 5000)

In [2]:
# input parameters 
# Everything a reader might want to tune lives in this cell; the cells below
# only read these names.

# Directory whose files are each read as one annotation CSV.
INPUT_DIR = './Datasets/orig/'
# Rows whose annotation is one of these labels are moved to the removed-terms table.
REMOVE_LABEL = ['university','person','researcher','country','location','scientist']
# When True, terms that contain no space (single tokens) are removed as well.
ONLY_MWE = True
# Optional text file with one term per line that is appended to the term list
# (labelled 'TECH' / 'wikidump'); a falsy value skips that step.
LIST_TO_CONCATE = './wikidump_terms.txt'

# Tab-separated term -> Wikipedia page title mapping; it is read before the
# lookup and overwritten when the dictionary is saved.
WIKI_TITLE_DICT = './page_titles.txt' # wikipedia redirect page title
# When True, terms without a known title are looked up on en.wikipedia.org.
WIKI_TITLE_SEARCH = True

In [3]:
# read files
# Sorted so that the processing order (and therefore the row order of the
# saved tables) does not depend on the arbitrary order listdir returns.
files = sorted(join(INPUT_DIR, f) for f in listdir(INPUT_DIR) if isfile(join(INPUT_DIR, f)))

# Per-file pieces are collected in lists and concatenated once after the loop:
# DataFrame.append no longer exists in pandas 2.x and growing a frame inside a
# loop copies all previous rows on every iteration.
kept_frames = []      # rows that survive the filters
removed_frames = []   # rows dropped by label or by the single-token filter

for file in files:
    msg.text(f"Preprocessing {file}...")
    # na_filter=False keeps empty cells as '' instead of converting them to NaN
    df = pd.read_csv(file, na_filter=False)

    if df.shape[1] != 3:
        raise ValueError(f"{file} is not in a good format. ['term', 'label', 'dataset']")

    # check annotation type and remove those that are not a term
    annotations = df.iloc[:, 1].unique()
    to_remove = [ann for ann in annotations if ann in REMOVE_LABEL]

    if not to_remove:
        msg.good("There is no label to remove.")
    else:
        for label in to_remove:
            msg.info(f"Removing terms with label {label} from {file}:")

            remove_df = df[df.annotation == label]
            df = df.drop(remove_df.index, axis=0)
            if not remove_df.empty:
                removed_frames.append(remove_df)

            print('\n'.join(remove_df.term.values))

    # whether to keep only Multi-word expressions
    if ONLY_MWE:
        msg.info(f"Removing single tokens from {file}:")

        # a term without any space is treated as a single token
        single_tokens = [term for term in df.term.values if ' ' not in term]
        remove_df = df[df.term.isin(single_tokens)]
        df = df.drop(remove_df.index, axis=0)
        if not remove_df.empty:
            removed_frames.append(remove_df)

        print('\n'.join(single_tokens))

    if not df.empty:
        kept_frames.append(df)
    print('\n')

# Original row indices are kept (no ignore_index), as before. When nothing was
# collected, fall back to an empty frame with the expected columns.
empty_columns = ['term', 'annotation', 'df']
dfs = pd.concat(kept_frames, sort=False) if kept_frames else pd.DataFrame([], columns=empty_columns)
remove_dfs = pd.concat(removed_frames, sort=False) if removed_frames else pd.DataFrame([], columns=empty_columns)

Preprocessing ./Datasets/orig/annotation_df_scienceie.csv...
[38;5;2m✔ There is no label to remove.[0m
[38;5;4mℹ Removing single tokens from
./Datasets/orig/annotation_df_scienceie.csv:[0m
optical-chopper
water
alcohols
molecules
absorption
spin
aluminium
copper
masonry
concrete
ceo2
calumite
particle
les
pda
fluid
dem
particles
gas
d2
mofs
ins
nott-300
h2
al-o⋯h2
mof
liquid
grain
oxygen
uo2
cation
approach
lagrangian
pendulums
planets
framework
system
physisorption
n2
co
co2
ch4
nh3
so2
h2s
adsorbent
molecule
adsorbents
h2o
doping
functionalization
nanocarbons
graphene
coolant
neutron
air
graphite
methane
nitrogen
reactor
ls
pipes
rcs
pipe
alloys
r-adaptivity
adaptivity
meshes
mesh
monge–ampère
sphere
forcing
damping
ωf
ω
wind
waves
|ω−ωf|
cl
distribution
c3h3+
channels
chirp
fragmentation
framework
cfd
dns
sc∼o(102)
gases
hmf
dmss
zns
cds
gan
zno
znse
znte
tio2
sno2
electrons
neutrons
uam
tsunami
cpv
hcpv
mpp
cell
stereoisomer
enantiomers
benchmarks
mp/soft
2l-ccs
squid
hamiltoni

In [4]:
if LIST_TO_CONCATE:
    # NOTE(review): the term list is assumed to be UTF-8 encoded - confirm.
    with open(LIST_TO_CONCATE, encoding='utf-8') as f:
        mwes = [line.strip() for line in f]

    # remove duplicate
    # dict.fromkeys de-duplicates while keeping the file order, so the result
    # is the same on every run (a set difference yields an arbitrary order).
    # Blank lines and terms already present in dfs are skipped.
    known_terms = set(dfs.term.values)
    mwes = [term for term in dict.fromkeys(mwes) if term and term not in known_terms]

    # Label only the newly added rows instead of filling NaN across all of dfs.
    df = pd.DataFrame(mwes, columns=['term']).assign(annotation='TECH', df='wikidump')
    if mwes:
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
        dfs = pd.concat([dfs, df], sort=False)

In [5]:
# Always define the mapping and the wiki_title column: the lookup and save
# cells below use both, even when no dictionary file is available yet.
wiki_title_dict = {}
if WIKI_TITLE_DICT and isfile(WIKI_TITLE_DICT):
    try:
        # dtype=str / na_filter=False keep terms and titles such as "null" or
        # "NaN" as text instead of turning them into missing values.
        title_table = pd.read_csv(WIKI_TITLE_DICT, delimiter='\t', header=None,
                                  index_col=0, dtype=str, na_filter=False)
    except pd.errors.EmptyDataError:
        # an empty dictionary file simply means nothing has been cached yet
        title_table = None

    if title_table is not None and title_table.shape[1] >= 1:
        # first non-index column holds the page title; empty titles are dropped
        # so those terms still count as "missing" for the lookup step
        wiki_title_dict = {term: title
                           for term, title in title_table.iloc[:, 0].items()
                           if title}

dfs['wiki_title'] = dfs['term'].map(wiki_title_dict)

In [None]:
if WIKI_TITLE_SEARCH:
    # search for wiki page title that are not in the original dict (756419)
    import urllib.parse  # plain "import urllib" does not guarantee the parse submodule is loaded
    import time
    import requests
    from bs4 import BeautifulSoup

    # Seconds to wait for a response before giving up on a single request.
    WIKI_REQUEST_TIMEOUT = 30
    # One session for all lookups so HTTP connections are reused.
    wiki_session = requests.Session()

    def _wiki_get(url):
        """Fetch `url` with a timeout; raise requests.HTTPError on a 4xx/5xx status."""
        response = wiki_session.get(url, timeout=WIKI_REQUEST_TIMEOUT)
        response.raise_for_status()
        return response

    def search_wikipedia_1st_title(term):
        """Return the title of the first Wikipedia full-text search hit for `term`.

        Returns None when the result page contains no search result or the
        first result carries no title attribute.
        Raises requests.RequestException when the request fails.
        """
        encoded_term = urllib.parse.quote(term)

        url = f"https://en.wikipedia.org/w/index.php?search={encoded_term}&title=Special%3ASearch&profile=advanced&fulltext=1&ns0=1"
        soup = BeautifulSoup(_wiki_get(url).text, 'html.parser')

        first_hit = soup.find('li', class_="mw-search-result")
        if first_hit is None:
            return None

        links = first_hit.select('.mw-search-result-heading a')
        if not links:
            return None
        return links[0].get('title')

    def find_wiki_title(term):
        """Resolve `term` to a Wikipedia page title.

        The query API is asked first with redirects followed. If it reports no
        existing page (non-positive page id), the first full-text search hit is
        used instead. Returns None when no title can be determined.
        Raises requests.RequestException (or ValueError for a non-JSON body)
        when a request fails.
        """
        encoded_term = urllib.parse.quote(term)

        url = f'https://en.wikipedia.org/w/api.php?action=query&redirects=true&titles={encoded_term}&format=json'
        json_response = _wiki_get(url).json()

        pages = (json_response.get('query') or {}).get('pages')
        if not pages:
            return None

        pageid, info = next(iter(pages.items()))
        if int(pageid) > 0:
            return info.get('title') if info else None

        # find the first title when searching in wikipedia
        return search_wikipedia_1st_title(term)

    # add wikititle for each term
    # Each distinct term is looked up once, even if it occurs in several rows.
    missing_terms = dfs.loc[dfs.wiki_title.isna(), 'term'].unique()
    for term in tqdm(missing_terms):
        try:
            wiki_title = find_wiki_title(term)
        except (requests.RequestException, ValueError) as err:
            # A failed request must not abort a run that takes many hours;
            # the term stays unresolved and is retried on the next run.
            msg.warn(f"Wikipedia lookup failed for {term!r}: {err}")
            continue
        if wiki_title:
            wiki_title_dict[term] = wiki_title

  1%|          | 3186/368395 [52:14<105:44:41,  1.04s/it]

In [None]:
# save new dictionary
# One (term, title) pair per row, tab-separated, without header or index column.
title_pairs = list(wiki_title_dict.items())
pd.DataFrame(title_pairs).to_csv(WIKI_TITLE_DICT, sep='\t', index=False, header=None)

In [None]:
# save data
# Refresh the title column from the (possibly extended) dictionary before exporting.
dfs['wiki_title'] = dfs['term'].map(wiki_title_dict)

# Both tables are written tab-separated, with a header row and without the index.
export_options = dict(sep='\t', header=True, index=False)
dfs.to_csv(r'./matching_list.csv', **export_options)
remove_dfs.to_csv(r'./removed_terms.csv', **export_options)

In [None]:
# 1. read all original csv 
# 2. check their types 
# 3. return type list 
# 4. whether to keep MWEs and remove single tokens 
# 5. if other list txt file, we can concatenate it 
# 6. find the Wikipedia page title with redirects = True

# 7. if not found for the term 
# ==> pos NOUN
# ==> from left to right

# 368595