In [26]:
####################################################################################
# Copied from Karoly Varasdi https://github.com/karoly-varasdi/de-wiktionary-parser 
# heavily modified to extract phonetic information for all words
####################################################################################

import re
import pandas as pd

from reg_expressions import *

def clean_up_string(string:str):
    string = re.sub(wiki_link_string, r'\2', string)
    string = re.sub(specstring, ' ', string)
    string = re.sub(quote_html, '"', string)
    string = re.sub(amp_html, '&', string)
    for to_del_string in to_del_strings:
        string = re.sub(to_del_string, '', string)
    return string

class wiktionary_dataframe:
    
    def __init__(self,file_path):
        
        self.word = []
        self.word_sep = []
        self.rhyme = []
        self.ipa = []
        self.label = []
        self.generate_entries(file_path)
        
        self.df = pd.DataFrame(list(zip(self.word,self.ipa, self.label, self.rhyme,self.word_sep)), 
                  columns =['word','ipa', 'label', 'rhyme','sep'])
        
    def generate_entries(self, file_path):
            '''Populates dictionary with noun information from the German wiktionary xml at file_path.'''
            with open(file_path, 'r', encoding='utf-8') as wikif:
                page_list = []
                page_no = 0
                print('Generating dictionary with word information from wiktionary source: {0}\nThis may take several minutes . . .'.format(file_path))
                for line in wikif:
          
                    page_list.append(line)
                    if '</page>' in line:
                        page_no += 1
                        if page_no % 50000 == 0:
                            print(page_no, 'pages processed')

                        one_page_str = ''.join(page_list)
                        # ## Cleaning up page before parsing:
                        one_page_str = clean_up_string(one_page_str)

                        page_list = one_page_str.splitlines()

                        # German words
                        if re.search(de_word_regex, one_page_str) is None:
                            page_list = []
                            continue

                        word_match = re.search(de_headword_spaces_allowed_regex, one_page_str)
                        if word_match:
                            title_match = re.search(title_pattern, one_page_str)
                            if title_match:
                                adj_form = title_match.group('pagetitle')
                                ## Ignore pages under wiktionary-namespaces:
                                if re.match(namensraum_simple + colon, adj_form) or re.match(namensraum_simple + diskussion_colon, adj_form):
                                    page_list = []
                                    continue
                            else:
                                page_list = []
                                continue
                        else:
                            page_list = []
                            continue
                            
                        '''   for item in page_list:
                            print(item)'''
                        # call the next lower level parser function on a page with German noun info:
                        self.parse_word_page(page_list)
                        # delete entries with no usages (can happen if Abkürzung-only info in page)
                       
                        page_list = []
                   
                       
                try:       
                    print('Read {0} pages.'.format(page_no))
                    print('Generated {0} entries.'.format(len(self)))
                except: pass

    def parse_word_page(self, page_list: list):
            '''Parses a word page from the xml file, separates it into usages and calls the usage parser function on each usage.'''
           
            title = ''
            rhyme_list = []
            word_sep_list = []
            ipa_list = []
            label_itm = ''
            
            for index, line in enumerate(page_list):
                try: 
                    title = re.search(r'(?<=<title>)(.*)(?=<\/title>)', line,flags=0).group(0)
                  

                except: pass
                
                ipa_found = re.search(r'\{\{IPA\}\}', line)
                if ipa_found: 
                    try: 
                        ipa_text = re.finditer(r'\{\{Lautschrift\|.*?\}\}', line,flags=0)
                        for item in ipa_text: 
                            ipa_list.append(re.search(r'(?<=\{\{Lautschrift\|)(.*)(?=\}\})', item.group(0),flags=0).group(0))
            
                    except: pass

                rhyme_found = re.search(r'\{\{Reime\}\}', line)
                if rhyme_found: 
                    try: 
                        rhyme_text = re.finditer(r'\{\{Reim\|.*?\|Deutsch\}\}', line,flags=0)
                        for item in rhyme_text: 
                            rhyme_list.append(re.search(r'(?<=\{\{Reim\|)(.*)(?=\|Deutsch\}\})', item.group(0),flags=0).group(0))
            
                    except: pass
                    
                sep_found = re.search(r'\{\{Worttrennung\}\}', line)
                if sep_found:
                    try: 
                        sep_text = page_list[index+1]
                        sep_text = re.sub(r'\{\{[^)]*\}\}', '', sep_text)
                        sep_text = re.sub(r'\'\'[^)]*\'\'', '', sep_text)
                        sep_text = re.sub(r'[:,]', '', sep_text)
                        sep_text = sep_text.split()
                        word_sep_list.append(sep_text)
                        
                    except:pass  
                        
                rhyme_found = re.search(r'{\{Wortart\|', line)
                if rhyme_found: 
                    try: 
                        rhyme_text = re.finditer(r'\{\{Wortart\|.*?\|Deutsch\}\}', line,flags=0)
                        for item in rhyme_text: 
                            label_itm = (re.search(r'(?<=\{\{Wortart\|)(.*)(?=\|Deutsch\}\})', item.group(0),flags=0).group(0)).lower()
                    except: pass
                        
                    
                
                end_found = re.search(r'==== {{Übersetzungen}} ====',line)
                if end_found: 
                    break
                    
               
            self.word.append(title.lower())  
            self.ipa.append(ipa_list)
            self.word_sep.append(word_sep_list)
            self.rhyme.append(rhyme_list)
            self.label.append(label_itm)

In [27]:
fname = 'wiktionary_dump'

wiktionary = wiktionary_dataframe(fname + '.xml')

wiktionary.df.to_csv('wiktionary_data.csv')

Generating dictionary with word information from wiktionary source: wiktionary_dump.xml
This may take several minutes . . .
50000 pages processed
100000 pages processed
150000 pages processed
200000 pages processed
250000 pages processed
300000 pages processed
350000 pages processed
400000 pages processed
450000 pages processed
500000 pages processed
550000 pages processed
600000 pages processed
650000 pages processed
700000 pages processed
750000 pages processed
800000 pages processed
850000 pages processed
900000 pages processed
950000 pages processed
1000000 pages processed
1050000 pages processed
1100000 pages processed
1150000 pages processed
Read 1180818 pages.


In [28]:
df = pd.read_csv('wiktionary_data.csv')
print(df.head())

   Unnamed: 0        word                ipa       label      rhyme  \
0           0       hallo         ['haˈloː']  substantiv     ['oː']   
1           1  subfamilia               ['']  substantiv         []   
2           2   subregnum    ['zʊpˈʁeːɡnʊm']  substantiv         []   
3           3  subdivisio  ['zʊpdiˈviːzi̯o']  substantiv         []   
4           4      phylum        ['ˈfyːlʊm']  substantiv  ['yːlʊm']   

                                         sep  
0                    [['Hal·lo', 'Hal·los']]  
1     [['Sub·fa·mi·lia', 'Sub·fa·mi·li·ae']]  
2            [['Sub·reg·num', 'Sub·reg·na']]  
3  [['Sub·di·vi·sio', 'Sub·di·vi·si·o·nes']]  
4                    [['Phy·lum', 'Phy·la']]  
