Skip to content

Commit

Permalink
Merge pull request suyashb95#86 from jsch8q/patch_pronunciation
Browse files Browse the repository at this point in the history
pronunciation parsing for heteronyms
  • Loading branch information
ZOUHEIRBN committed Oct 6, 2023
1 parent 933919b commit bf854b4
Show file tree
Hide file tree
Showing 3 changed files with 8,650 additions and 6 deletions.
59 changes: 59 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from parameterized import parameterized
import unittest
import json
from wiktionaryparser import WiktionaryParser
from deepdiff import DeepDiff
from typing import Dict, List
import mock
from urllib import parse
import os

# Module-level parser instance. NOTE: `parser` is rebound to a fresh
# WiktionaryParser() again right before the fetch loop further down.
parser = WiktionaryParser()


# Test data directories, resolved relative to this file.
# os.path.join is used for every component so the paths are valid on any
# platform; a literal "\\tests" suffix is only a path separator on Windows.
tests_dir = os.path.join(os.path.dirname(__file__), "tests")
html_test_files_dir = os.path.join(tests_dir, 'html_test_files')
markup_test_files_dir = os.path.join(tests_dir, 'markup_test_files')

# Words to fetch. Each entry is a (word, old_id, languages) tuple:
#   word      - the page title to look up
#   old_id    - value forwarded to parser.fetch(old_id=...); None when unset
#   languages - list of language names; one fetch is made per language
# Commented-out entries are currently disabled.
# NOTE: this name is rebound further down to the flattened
# (language, word, old_id) rows returned by get_test_words_table().
test_words = [
# ('ἀγγελία', 47719496, ['Ancient Greek']),
('اللغة_العربية', None, ['Arabic', 'EngliSH']),
# ('grapple', 50080840, ['EnGlish']),
# ('test', 50342756, ['English']),
# ('patronise', 49023308, ['English']),
# ('abiologically', 43781266, ['English']),
# ('alexin', 50152026, ['English']),
# ('song', 60388804, ['English']),
# ('house', 50356446, ['English']),
# ('correspondent', 61052028, ['English']),
# ('video', 50291344, ['Latin']),
# ('seg', 50359832, ['Norwegian Bokmål']),
# ('aldersblandet', 38616917, ['Norwegian Bokmål']),
# ('by', 50399022, ['Norwegian Bokmål']),
# ('for', 50363295, ['Norwegian Bokmål']),
# ('admiral', 50357597, ['Norwegian Bokmål']),
# ('heis', 49469949, ['Norwegian Bokmål']),
# ('konkurs', 48269433, ['Norwegian Bokmål']),
# ('pantergaupe', 46717478, ['Norwegian Bokmål']),
# ('maldivisk', 49859434, ['Norwegian Bokmål']),
# ('house', 50356446, ['Swedish'])
]


def get_test_words_table(*allowed_words):
    """Flatten the module-level ``test_words`` list into three-element rows.

    Every ``(word, old_id, languages)`` entry of ``test_words`` yields one
    ``(language, word, old_id)`` row per language, in source order.

    Args:
        *allowed_words: Optional words to keep. When none are given every
            entry is kept; otherwise only entries whose word is listed.

    Returns:
        A list of ``(language, word, old_id)`` tuples.
    """
    return [
        (language, word, old_id)
        for word, old_id, languages in test_words
        # The filter depends only on the word, so it is checked once per
        # entry rather than once per language.
        if not allowed_words or word in allowed_words
        for language in languages
    ]

# Replace the raw table with its flattened (language, word, old_id) rows.
test_words = get_test_words_table()

# Fetch each word/language pair and print the result as indented JSON,
# keeping non-ASCII characters readable instead of escaping them.
parser = WiktionaryParser()
for lang, word, old_id in test_words:
    result = parser.fetch(
        word=word,
        language=lang,
        old_id=old_id,
    )
    print(
        json.dumps(
            result,
            indent=4,
            ensure_ascii=False,
        )
    )
29 changes: 23 additions & 6 deletions wiktionaryparser/core.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from urllib import parse
import re, requests
from wiktionaryparser.utils import WordData, Definition, RelatedWord
from bs4 import BeautifulSoup
from itertools import zip_longest
from copy import copy
from string import digits
import json
import os

PARTS_OF_SPEECH = [
"noun", "verb", "adjective", "adverb", "determiner",
Expand All @@ -20,6 +23,12 @@
"coordinate terms",
]

# Language-name -> Wiktionary code table, looked up by lower-cased language
# name when building the request URL.
# Language codes originally obtained from the droher/etymology-db repository:
# https://raw.githubusercontent.com/droher/etymology-db/master/wiktionary_codes.csv
#
# The data file is resolved relative to this module rather than the current
# working directory, so importing the package works from any directory.
_WIKI_CODES_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'wiki_codes.json'
)
with open(_WIKI_CODES_PATH, 'r', encoding="utf8") as f:
    LANGUAGE_CODES = json.load(f)

def is_subheading(child, parent):
child_headings = child.split(".")
parent_headings = parent.split(".")
Expand All @@ -32,7 +41,7 @@ def is_subheading(child, parent):

class WiktionaryParser(object):
def __init__(self):
self.url = "https://en.wiktionary.org/wiki/{}?printable=yes"
self.url = "https://{}.wiktionary.org/w/api.php"
self.soup = None
self.session = requests.Session()
self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2))
Expand Down Expand Up @@ -143,9 +152,9 @@ def parse_pronunciations(self, word_contents):
pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation')
pronunciation_list = []
audio_links = []
pronunciation_text = []
pronunciation_div_classes = ['mw-collapsible', 'vsSwitcher']
for pronunciation_index, pronunciation_id, _ in pronunciation_id_list:
pronunciation_text = []
span_tag = self.soup.find_all('span', {'id': pronunciation_id})[0]
list_tag = span_tag.parent
while list_tag.name != 'ul':
Expand Down Expand Up @@ -276,10 +285,18 @@ def map_to_object(self, word_data):
json_obj_list.append(data_obj.to_json())
return json_obj_list

# NOTE(review): this span comes from a rendered diff, so the pre-change and
# post-change lines of `fetch` appear interleaved without +/- markers.
# Presumably the first `def` line, the `self.url.format(word)` request and the
# BeautifulSoup line belong to the removed version; confirm against the
# repository before editing.
def fetch(self, word, language=None, old_id=None):
def fetch(self, word, language=None, old_id=None, **params):
# Build the request parameters: the fixed search keys below override any
# same-named caller-supplied **params; 'action' defaults to "query".
params.update({'oldid': old_id, "list": "search", 'srsearch': word, "format": "json"})
params['action'] = params.get("action", "query")
# params = parse.urlencode(params)
# NOTE(review): looks like leftover debug output; prints the params on every call.
print(params)
# Use self.language when no (or a falsy) language is passed.
language = self.language if not language else language
response = self.session.get(self.url.format(word), params={'oldid': old_id})
self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
# Languages missing from LANGUAGE_CODES fall back to the 'en' code.
lang_code = LANGUAGE_CODES.get(language.lower(), 'en')
response = self.session.get(self.url.format(lang_code), params=params)
# NOTE(review): soup now holds the decoded JSON body rather than a
# BeautifulSoup tree; whether clean_html() handles that is not visible here.
self.soup = response.json()
self.current_word = word
self.clean_html()
# NOTE(review): this module imports `from copy import copy`, so `copy` is the
# function and `copy.deepcopy` raises AttributeError; `deepcopy` needs importing.
# The returned dict stores a copy of params under "response" and then the
# params themselves as top-level keys.
res = {"response": copy.deepcopy(params)}
res.update(params)
return res

# NOTE(review): presumably unreachable, since it follows `return res`
# (indentation is lost in this view; confirm).
return self.get_word_data(language.lower())
Loading

0 comments on commit bf854b4

Please sign in to comment.