In [1]:
import copy
import re
import opencc
import spacy
from utils import *
import wikipediaapi
# OpenCC converter configured with the 's2t' profile; clean_doc() calls
# converter.convert() on titles and summaries when lang == 'zh'.
converter = opencc.OpenCC('s2t') # simplified to traditional Chinese characters
# spaCy pipeline loaded from the 'en_core_web_sm' package; check_pos() runs it
# on page titles and reads each token's .pos_ tag.
en_spacy = spacy.load('en_core_web_sm')
# User-Agent string handed to the Wikipedia client below as a request header.
user_agent = "sense_dict/1.0 (https://example.com/mywikiapp; amirali.marashifar@sjsu.edu)"
# Wikipedia client created for language code 'en'; main() uses wiki_wiki.page()
# to look up "<word> (disambiguation)" pages and walk their links.
wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': user_agent})

In [2]:
def check_pos(title, word):
    """Return the part-of-speech tags of the tokens in ``title`` that contain ``word``.

    Matching is a case-insensitive substring test: both ``word`` and the text
    being searched are lower-cased before comparison, so ``"Bank"`` and
    ``"bank"`` behave the same.

    Args:
        title: Page title to analyse. It is passed to the module-level
            ``en_spacy`` pipeline with its original casing (the original
            author's note says this is so proper nouns can be caught).
        word: Target word to look for inside the title.

    Returns:
        A list with one ``token.pos_`` string per token whose text contains
        ``word``, in token order. The list is empty when the title does not
        contain ``word`` at all, or when no single token contains it.
    """
    needle = word.lower()

    # Skip the spaCy call entirely when the title cannot match.
    if needle not in title.lower():
        return []

    return [token.pos_ for token in en_spacy(title) if needle in token.text.lower()]

In [3]:
def clean_doc(sense_dict, lang='en'):
    """Return a cleaned copy of ``sense_dict`` with whitespace-normalised summaries.

    Every run of whitespace in a summary is collapsed to a single space.
    When ``lang`` is ``'zh'`` the title and the summary are additionally passed
    through the module-level ``converter`` (``converter.convert``) before the
    whitespace clean-up. Titles are otherwise left untouched.

    Args:
        sense_dict: Mapping ``word -> {key -> (title, summary)}``.
        lang: ``'zh'`` enables the converter step; any other value takes the
            plain path.

    Returns:
        A new dict of new inner dicts with the same keys and
        ``(title, summary)`` tuples as values. ``sense_dict`` itself is not
        modified.
    """
    def _squash(text):
        # Raw string: '\s' inside a normal literal is an invalid escape sequence.
        return re.sub(r'\s+', ' ', text)

    if lang == 'zh':
        def _clean(title, summary):
            return converter.convert(title), _squash(converter.convert(summary))
    else:
        def _clean(title, summary):
            return title, _squash(summary)

    # Every entry is rebuilt, so a fresh nested dict replaces the old
    # deepcopy-then-overwrite approach.
    return {
        word: {key: _clean(title, summary) for key, (title, summary) in senses.items()}
        for word, senses in sense_dict.items()
    }


In [4]:
def clean_internal(sense_dict, sense_dict_2):
    """Remove Wikipedia-internal pages from two parallel sense dictionaries.

    An entry counts as internal when its title contains one of the prefixes
    ``Wikipedia:``, ``Category:``, ``Help:`` or ``WikiProject:``. If the title
    in *either* dictionary matches, the entry with the same ``word``/``key`` is
    removed from *both* dictionaries (when present). Each match is reported on
    stdout.

    Args:
        sense_dict: Mapping ``word -> {key -> (title, summary)}``.
        sense_dict_2: Second mapping with the same layout.

    Returns:
        The tuple ``(sense_dict, sense_dict_2)``: the very same objects that
        were passed in, modified in place. Words whose senses were all removed
        stay in the dict with an empty inner dict.
    """
    internal_title = re.compile('(Wikipedia|Category|Help|WikiProject):')

    # Snapshot every offending entry before deleting anything, so neither dict
    # is mutated while it is being iterated. This replaces deep-copying both
    # corpora just to iterate over them.
    flagged = [
        (word, key, title)
        for source in (sense_dict, sense_dict_2)
        for word, senses in source.items()
        for key, (title, _summary) in senses.items()
        if internal_title.search(title)
    ]

    for word, key, title in flagged:
        print('WORD: {}, TITLE: {}'.format(word, title))
        for target in (sense_dict, sense_dict_2):
            senses = target.get(word)
            if senses is not None:
                # The entry may be missing from one side or already removed;
                # pop() with a default handles that without a bare `except`.
                senses.pop(key, None)

    return sense_dict, sense_dict_2


In [5]:
def _keep_multi_sense(sense_dict):
    """Return a new dict holding only the words that have at least two senses."""
    return {word: senses for word, senses in sense_dict.items() if len(senses) >= 2}


def _collect_sense_pages(word, tlang):
    """Collect the usable senses of ``word`` from its English disambiguation page.

    A link on the "<word> (disambiguation)" page is kept when all of these hold:
    the link name contains the word (case-insensitive), the link name does not
    contain 'disambiguation', the page title equals the link name
    (case-insensitive), ``check_pos`` reports a 'NOUN' for the word in the
    title, the page has a ``tlang`` language link, and both the English and the
    ``tlang`` summaries are non-empty.

    Returns:
        ``(english_pages, yy_pages)``: two dicts keyed by link name whose values
        are ``(title, summary)`` tuples for the English and ``tlang`` page.
    """
    target = word.lower()
    english_pages, yy_pages = {}, {}
    links = wiki_wiki.page('%s (disambiguation)' % target).links

    for name, page in links.items():
        # Cheap string tests first. The checks further down run spaCy or read
        # page attributes that may trigger a Wikipedia API request, so links
        # that cannot qualify are rejected before any of that happens.
        if target not in name.lower() or 'disambiguation' in name:
            continue
        if page.title.lower() != name.lower():  # title must match the link name
            continue
        if 'NOUN' not in check_pos(page.title, target):  # target must be a noun in the title
            continue

        langlinks = page.langlinks
        if tlang not in langlinks:  # no page in the target language
            continue

        yy_page = langlinks[tlang]
        if len(yy_page.summary) > 0 and len(page.summary) > 0:
            english_pages[name] = (page.title, page.summary)
            yy_pages[name] = (yy_page.title, yy_page.summary)

    return english_pages, yy_pages


def main(tlang):
    """Build and save parallel English / ``tlang`` sense dictionaries.

    Steps:
      1. Load the noun list from ``<tlang>/dis_nouns.pickle``.
      2. For every noun, collect its senses from Wikipedia and checkpoint the
         two raw dictionaries each time the number of collected nouns reaches
         a new multiple of 10.
      3. Save the raw dictionaries, drop single-sense words, run ``clean_doc``
         and save again under the same file names.
      4. Run ``clean_internal``, drop single-sense words again and save the
         result as ``clean_eng_sense_dict.pickle`` /
         ``clean_<tlang>_sense_dict.pickle``.

    Args:
        tlang: Target language code. It is used as the output directory name,
            as the language-link key, and is passed to ``clean_doc``.

    Note:
        ``unpickle`` and ``save_pickle`` are not defined in this notebook;
        presumably they come from ``from utils import *`` -- confirm.
    """
    eng_path = '{}/eng_sense_dict.pickle'.format(tlang)
    yy_path = '{}/{}_sense_dict.pickle'.format(tlang, tlang)

    dis_nouns = unpickle('{}/dis_nouns.pickle'.format(tlang))
    eng_sense_dict, yy_sense_dict = {}, {}
    # To continue from an earlier run, load the checkpoints instead:
    # eng_sense_dict = unpickle('{}/eng_sense_dict.pickle'.format(tlang))
    # yy_sense_dict = unpickle('{}/{}_sense_dict.pickle'.format(tlang,tlang))

    last_checkpoint = 0
    for word in dis_nouns:
        english_pages, yy_pages = _collect_sense_pages(word, tlang)

        if len(english_pages): eng_sense_dict[word] = english_pages
        if len(yy_pages): yy_sense_dict[word] = yy_pages

        collected = len(eng_sense_dict)
        print('\r>finish getting %d/%d nouns' % (collected, len(dis_nouns)), flush=True, end='')

        # Checkpoint once per new multiple of 10. Without the second condition
        # the same pickles are rewritten on every iteration while the count
        # stays at 0, 10, 20, ...
        if collected % 10 == 0 and collected != last_checkpoint:
            save_pickle(eng_path, eng_sense_dict)
            save_pickle(yy_path, yy_sense_dict)
            last_checkpoint = collected
    print('\n===========finished===========')

    print('last save')
    save_pickle(eng_path, eng_sense_dict)
    save_pickle(yy_path, yy_sense_dict)

    print('clean single sense')
    eng_sense_dict = _keep_multi_sense(eng_sense_dict)
    yy_sense_dict = _keep_multi_sense(yy_sense_dict)

    print('clean docs')
    eng_sense_dict = clean_doc(eng_sense_dict, 'en')
    yy_sense_dict = clean_doc(yy_sense_dict, tlang)

    print('update pickle files')
    save_pickle(eng_path, eng_sense_dict)
    save_pickle(yy_path, yy_sense_dict)

    print('clean internal pages')
    eng_sense_dict, yy_sense_dict = clean_internal(eng_sense_dict, yy_sense_dict)

    # Removing internal pages can leave a word with fewer than two senses.
    eng_sense_dict = _keep_multi_sense(eng_sense_dict)
    yy_sense_dict = _keep_multi_sense(yy_sense_dict)

    print('save cleaned pickle')
    save_pickle('{}/clean_eng_sense_dict.pickle'.format(tlang), eng_sense_dict)
    save_pickle('{}/clean_{}_sense_dict.pickle'.format(tlang, tlang), yy_sense_dict)

In [6]:
!ls

[34mMuSeCLIR dataset[m[m README.md        main.py          [34mzh[m[m
MuSeCLIR.ipynb   [34mdata[m[m             [34msrc[m[m


In [7]:
import os

# Switch the kernel's working directory to the project's data/ folder so the
# following shell cells (chmod / ./get_evaluation.sh) run there.
# NOTE(review): hard-coded absolute path to one user's machine -- this cell
# fails anywhere else; consider deriving it from a configurable project root.
os.chdir("/Users/amirali/Desktop/Notebooks/MuSeCLIR/data/")

In [8]:
!chmod 755 get_evaluation.sh

In [9]:
!./get_evaluation.sh


./get_evaluation.sh: line 13: declare: -A: invalid option
declare: usage: declare [-afFirtx] [-p] [name[=value] ...]
mkdir: monolingual: File exists
mkdir: crosslingual: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  218k  100  218k    0     0  33144      0  0:00:06  0:00:06 --:--:-- 50302
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  578k  100  578k    0     0  1079k      0 --:--:-- --:--:-- --:--:-- 1084k  501k      0  0:00:01 --:--:--  0:00:01  504k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  577k  100  577k    0     0  1271k      0 --:--:-- --:--:-- --:--:-- 1281k
  % Total    % Received % Xferd  Average Speed   Time    Time

In [10]:
# Move to the project's src/ folder before inspecting translate_query.py.
# NOTE(review): hard-coded absolute path; relies on `os` imported in an earlier cell.
os.chdir("/Users/amirali/Desktop/Notebooks/MuSeCLIR/src/")
!head translate_query.py

import re

def traslate(sentences, tlang):
    f = open('/Users/amirali/Desktop/Notebooks/MuSeCLIR/data/crosslingual/dictionaries/en-{}.txt'.format(tlang))
    en, yy = [],[]

    for line in f.readlines():
        en.append(line.split()[0])
        yy.append(line.split()[1])



In [11]:
# Return to the repository root so that `python main.py` in the next cell runs from there.
# NOTE(review): hard-coded absolute path; relies on `os` imported in an earlier cell.
os.chdir("/Users/amirali/Desktop/Notebooks/MuSeCLIR/")

In [12]:
!python main.py --target_lang zh

save to  zh/multi_trans_nouns.pickle
SAVING PICKLE,  zh/multi_trans_nouns.pickle
>finished 2086 nouns
SAVING PICKLE,  zh/dis_nouns.pickle
>finish getting 10/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_sense_dict.pickle
>finish getting 20/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_sense_dict.pickle
>finish getting 30/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_sense_dict.pickle
>finish getting 40/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_sense_dict.pickle
>finish getting 50/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_sense_dict.pickle
>finish getting 50/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_sense_dict.pickle
>finish getting 50/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_sense_dict.pickle
>finish getting 60/2086 nounsSAVING PICKLE,  zh/eng_sense_dict.pickle
SAVING PICKLE,  zh/zh_