Merge pull request suyashb95#86 from jsch8q/patch_pronunciation

pronunciation parsing for heteronyms
ZOUHEIRBN · Oct 6, 2023 · bf854b4 · bf854b4
1 parent 933919b
commit bf854b4
Show file tree

Hide file tree

Showing 3 changed files with 8,650 additions and 6 deletions.
diff --git a/test.py b/test.py
@@ -0,0 +1,59 @@
+from parameterized import parameterized
+import unittest
+import json
+from wiktionaryparser import WiktionaryParser
+from deepdiff import DeepDiff
+from typing import Dict, List
+import mock
+from urllib import parse
+import os
+
+parser = WiktionaryParser()
+
+
+tests_dir = os.path.join(os.path.dirname(__file__), "tests")  # portable join; a hard-coded "\\" separator only works on Windows
+html_test_files_dir = os.path.join(tests_dir, 'html_test_files')
+markup_test_files_dir = os.path.join(tests_dir, 'markup_test_files')
+
+test_words = [
+    # ('ἀγγελία', 47719496, ['Ancient Greek']),
+    ('اللغة_العربية', None, ['Arabic', 'EngliSH']),
+    # ('grapple', 50080840, ['EnGlish']),
+    # ('test', 50342756, ['English']),
+    # ('patronise', 49023308, ['English']),
+    # ('abiologically', 43781266, ['English']),
+    # ('alexin', 50152026, ['English']),
+    # ('song', 60388804, ['English']),
+    # ('house', 50356446, ['English']),
+    # ('correspondent', 61052028, ['English']),
+    # ('video', 50291344, ['Latin']),
+    # ('seg', 50359832, ['Norwegian Bokmål']),
+    # ('aldersblandet', 38616917, ['Norwegian Bokmål']),
+    # ('by', 50399022, ['Norwegian Bokmål']),
+    # ('for', 50363295, ['Norwegian Bokmål']),
+    # ('admiral', 50357597, ['Norwegian Bokmål']),
+    # ('heis', 49469949, ['Norwegian Bokmål']),
+    # ('konkurs', 48269433, ['Norwegian Bokmål']),
+    # ('pantergaupe', 46717478, ['Norwegian Bokmål']),
+    # ('maldivisk', 49859434, ['Norwegian Bokmål']),
+    # ('house', 50356446, ['Swedish'])
+]
+
+
+def get_test_words_table(*allowed_words):
+    """Flatten test_words into (language, word, old_id) tuples, one per language, keeping only allowed_words when any are given."""
+    result = []
+
+    for word, old_id, languages in test_words:
+        for language in languages:  # one output row per (word, language) pair
+            if len(allowed_words) == 0 or (word in allowed_words):  # no filter passed, or word is in the filter
+                result.append((language, word, old_id))
+
+    return result
+
+test_words = get_test_words_table()  # rebinds test_words to the flattened (language, word, old_id) rows
+
+parser = WiktionaryParser()  # NOTE(review): a parser was already created near the top of this file — confirm this second instance is intended
+for lang, word, old_id in test_words:
+    result = parser.fetch(word=word, language=lang, old_id=old_id)
+    print(json.dumps(result, indent=4, ensure_ascii=False))  # ensure_ascii=False keeps non-ASCII words readable in the output
diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
@@ -1,9 +1,12 @@
+from urllib import parse
 import re, requests
 from wiktionaryparser.utils import WordData, Definition, RelatedWord
 from bs4 import BeautifulSoup
 from itertools import zip_longest
 from copy import copy
 from string import digits
+import json
+import os
 
 PARTS_OF_SPEECH = [
     "noun", "verb", "adjective", "adverb", "determiner",
@@ -20,6 +23,12 @@
     "coordinate terms",
 ]
 
+_CODES_PATH = os.path.join(os.path.dirname(__file__), 'wiki_codes.json')  # resolve next to this module, not the current working directory
+with open(_CODES_PATH, 'r', encoding="utf8") as f:
+    LANGUAGE_CODES = json.load(f)
+
+# LANGUAGE_CODES data was originally obtained from the droher/etymology-db repository:
+# https://raw.githubusercontent.com/droher/etymology-db/master/wiktionary_codes.csv
 def is_subheading(child, parent):
     child_headings = child.split(".")
     parent_headings = parent.split(".")
@@ -32,7 +41,7 @@ def is_subheading(child, parent):
 
 class WiktionaryParser(object):
     def __init__(self):
-        self.url = "https://en.wiktionary.org/wiki/{}?printable=yes"
+        self.url = "https://{}.wiktionary.org/w/api.php"
         self.soup = None
         self.session = requests.Session()
         self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2))
@@ -143,9 +152,9 @@ def parse_pronunciations(self, word_contents):
         pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation')
         pronunciation_list = []
         audio_links = []
-        pronunciation_text = []
         pronunciation_div_classes = ['mw-collapsible', 'vsSwitcher']
         for pronunciation_index, pronunciation_id, _ in pronunciation_id_list:
+            pronunciation_text = []
             span_tag = self.soup.find_all('span', {'id': pronunciation_id})[0]
             list_tag = span_tag.parent
             while list_tag.name != 'ul':
@@ -276,10 +285,18 @@ def map_to_object(self, word_data):
             json_obj_list.append(data_obj.to_json())
         return json_obj_list
 
-    def fetch(self, word, language=None, old_id=None):
+    def fetch(self, word, language=None, old_id=None, **params):
+        params.update({'oldid': old_id, "list": "search", 'srsearch': word, "format": "json"})  # these fixed keys override any same-named caller-supplied params
+        params['action'] = params.get("action", "query")  # default to the "query" action unless the caller passed one
+        # params = parse.urlencode(params)
+        print(params)  # NOTE(review): debug output on every fetch - confirm whether it should stay
         language = self.language if not language else language
-        response = self.session.get(self.url.format(word), params={'oldid': old_id})
-        self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
+        lang_code = LANGUAGE_CODES.get(language.lower(), 'en')  # language names missing from LANGUAGE_CODES fall back to 'en'
+        response = self.session.get(self.url.format(lang_code), params=params)
+        self.soup = response.json()  # now holds the decoded JSON body rather than a BeautifulSoup tree
         self.current_word = word
-        self.clean_html()
+        res = {"response": copy.deepcopy(params)}  # NOTE(review): this module imports `from copy import copy`, so `copy.deepcopy` looks like an AttributeError - confirm
+        res.update(params)
+        return res  # NOTE(review): returns the request params; the get_word_data call below is never reached
+
         return self.get_word_data(language.lower())