<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd 
import numpy as np 
from wasabi import msg
from os import listdir
from tqdm import tqdm 
from os.path import isfile, join


pd.set_option('max_row',5000)

In [2]:
# input parameters 

INPUT_DIR = './Datasets/orig/'
REMOVE_LABEL = ['university','person','researcher','country','location','scientist']
ONLY_MWE = False
LIST_TO_CONCATE = './wikidump_terms.txt'

WIKI_TITLE_DICT = './page_titles.txt' # wikipedia redirect page title
WIKI_TITLE_SEARCH = True

In [4]:
# read files 
files = [join(INPUT_DIR, f) for f in listdir(INPUT_DIR) if isfile(join(INPUT_DIR, f))]

remove_dfs = pd.DataFrame([], columns=['term', 'annotation', 'df'])
dfs = pd.DataFrame([], columns=['term', 'annotation', 'df'])

for file in files:
    msg.text(f"Preprocessing {file}...")
    df = pd.read_csv(file)
    
    
    if df.shape[1] != 3:
         raise ValueError(f"{file} is not in a good format. ['term', 'label', 'dataset']")
    
    # check annotation type and remove those that are not a term
    annotations = df.iloc[:,1].unique()
    to_remove = [ann for ann in annotations if ann in REMOVE_LABEL]
    
    if to_remove == []: 
        msg.good("There is no label to remove.")     
    else:
        for label in to_remove:
            msg.info(f"Removing terms with label {label} from {file}:")
            
            remove_df = df[df.annotation==label]
            df = df.drop(remove_df.index, axis=0)
            remove_dfs = remove_dfs.append(remove_df,sort=False)
             
            print('\n'.join(remove_df.term.values))
    
    # whether to keep only Multi-word expressions
    if ONLY_MWE:
        msg.info(f"Removing from single tokens from {file}:")
        
        single_tokens = [term for term in df.term.values if ' ' not in term]
        remove_df = df[df.term.isin(single_tokens)]
        df = df.drop(remove_df.index, axis=0)
        remove_dfs = remove_dfs.append(remove_df,sort=False)
        
        print('\n'.join(single_tokens))   
        
    dfs = dfs.append(df,sort=False)  
    print('\n')

Preprocessing ./Datasets/orig/annotation_df_scienceie.csv...
[38;5;2m✔ There is no label to remove.[0m


Preprocessing ./Datasets/orig/.DS_Store...


ValueError: ./Datasets/orig/.DS_Store is not in a good format. ['term', 'label', 'dataset']

In [None]:
if LIST_TO_CONCATE:
    with open(LIST_TO_CONCATE) as f:
        mwes = [x.strip() for x in f.readlines()]
    
    # remove duplicate
    mwes = list(set(mwes) - set(dfs.term.values))        
        
    df = pd.DataFrame(mwes, columns=['term'])
    dfs = dfs.append(df,sort=False).fillna(value={'annotation': 'TECH', 'df': 'wikidump'})

In [None]:
if WIKI_TITLE_DICT:
    wiki_title_dict = pd.read_csv(WIKI_TITLE_DICT, delimiter='\t', header=None, index_col=0).to_dict()[1]
    dfs['wiki_title'] = dfs['term'].map(wiki_title_dict)

In [None]:
if WIKI_TITLE_SEARCH:
    # search for wiki page title that are not in the original dict (723836)
    import urllib 
    import requests 
    from bs4 import BeautifulSoup
    
    def find_wiki_title(term):
        encoded_term = urllib.parse.quote(term)       
        
        url = f'https://en.wikipedia.org/w/api.php?action=query&redirects=true&titles={encoded_term}&format=json'
        json_response = requests.get(url).json()
        
        query = json_response.get('query')
        if query: 
            pages = query.get('pages')
            if pages:
                pageid = list(pages.keys())[0]
                if pageid =='-1':
                    return None 
                else:
                    info = pages.get(pageid)
                    if info:
                        title = info.get('title')
                        return title 
        

            
    # add wikititle for each term
    for term in tqdm(dfs[dfs.wiki_title.isna()].term.values):
        wiki_title = find_wiki_title(term)
        if wiki_title:
            wiki_title_dict.update({term:wiki_title})

In [None]:
# save new dictionary (original file with 1718334 rows)
pd.DataFrame([(k,v) for k,v in wiki_title_dict.items()]).to_csv(WIKI_TITLE_DICT, header=None, index=False, sep='\t')

In [None]:
# save data
dfs['wiki_title'] = dfs['term'].map(wiki_title_dict)
dfs.to_csv (r'./matching_list.csv', index = False, header=True)

remove_dfs.to_csv (r'./removed_terms.csv', index = False, header=True)

In [None]:
# 1. read all original csv 
# 2. check their types 
# 3. return type list 
# 4. whether to keep MWEs and remove single tokens 
# 5. if other list txt file, we can concatenate it 