# Setting

In [None]:
# -*- coding: utf-8 -*-
import pickle
import sys
import os
from google.colab import drive
# Mount Google Drive at /content/gdrive; force_remount=True asks for a fresh
# mount even if the drive is already mounted.
drive.mount('/content/gdrive',force_remount=True)

# Placeholder: replace with the working folder for this notebook.
# NOTE(review): a later cell opens path + '/moedict-data/dict-revised.json'
# after this chdir, so a relative value would be applied twice -- use an
# absolute path here.
path = 'yourfolder'
os.chdir(path)

In [None]:
!pip install opencc

# Get 萌典 moe-dict

- https://github.com/g0v/moedict-data.git
- https://github.com/g0v/moedict-epub.git

In [None]:
# Refresh the apt package index, then install system packages (compiler, make,
# Node.js/npm, curl, lxml bindings) -- presumably the moedict-epub build
# toolchain; confirm against that repository's instructions.
# NOTE(review): `python` / `python-lxml` look like Python 2 package names and
# may not exist on current images -- confirm.
!sudo apt update
!sudo apt install -y python g++ make nodejs python-lxml curl npm

In [None]:
!npm i
!pip install lxml
!sudo npm i -g gulp

In [None]:
# Shallow-clone (latest commit only) the dictionary data and the EPUB tooling
# into the current working directory.
!git clone --depth 1 https://github.com/g0v/moedict-data.git
!git clone --depth 1 https://github.com/g0v/moedict-epub.git
# Copy the revised-dictionary JSON next to the EPUB tooling.
!cp -v moedict-data/dict-revised.json moedict-epub/
# NOTE(review): each `!` line runs in its own subshell, so this `cd` does not
# change the notebook's working directory (the `%cd` magic would). The later
# cells read from moedict-data/ and do not appear to need it -- confirm intent.
!cd moedict-epub

# Build DataFrame

Each row is a single sense belonging to a lemma (i.e., two rows may represent different definitions of the same lemma).

In [4]:
import json
import pandas as pd

# Read the raw dictionary dump. The encoding is passed explicitly so that the
# Chinese text is decoded as UTF-8 regardless of the platform's default locale.
with open(path+'/moedict-data/dict-revised.json', "r", encoding="utf-8") as read_file:
    data = json.load(read_file)
# Sanity check: show the container type and how many entries were loaded.
print(type(data),len(data))

<class 'list'> 163087


In [11]:
import pandas as pd

# function to extract information from each dictionary and create a list of dictionaries
def _first_bracketed(text):
    """Return the text between the first '「' and the first '」', or ``text`` unchanged if either is missing."""
    start = text.find('「')
    end = text.find('」')
    if start == -1 or end == -1:
        # No quotation brackets to strip: keep the raw example instead of raising.
        return text
    return text[start + 1:end]


def extract_info(d):
    """Flatten one dictionary entry into a list of per-sense row dicts.

    Parameters
    ----------
    d : dict
        One entry of the parsed dictionary JSON. ``title`` is required;
        ``heteronyms``, ``radical``, ``stroke_count`` and
        ``non_radical_stroke_count`` are optional.

    Returns
    -------
    list of dict
        One dict per definition of every heteronym, with the keys
        ``lemma_text``, ``sense_idx`` (1-based within its heteronym),
        ``definition``, ``synonyms``, ``anotonyms``, ``quote``, ``examples``,
        ``POS``, ``bopomofo``, ``pinyin``, ``non_radical_stroke_count``,
        ``radical`` and ``stroke_count``. Missing scalar fields become ``''``;
        missing synonyms/antonyms become ``[]``. An entry without heteronyms
        yields an empty list.

    Raises
    ------
    KeyError
        If ``d`` has no ``title``.
    """
    # Data on the 1st level (shared by every sense of this lemma).
    title = d['title']
    non_radical_stroke_count = d.get('non_radical_stroke_count', '')
    radical = d.get('radical', '')
    stroke_count = d.get('stroke_count', '')

    rows = []
    for heteronym in d.get('heteronyms', []):
        # Data on the 2nd level (one pronunciation of the lemma).
        bopomofo = heteronym.get('bopomofo', '')
        pinyin = heteronym.get('pinyin', '')

        # Data on the 3rd level (iterating over each sense of this lemma).
        for sense_idx, definition in enumerate(heteronym.get('definitions', []), start=1):
            # Only the first quote is kept; an absent or empty list gives ''.
            quotes = definition.get('quote')
            quote = quotes[0] if quotes else ''

            # '' (not []) marks "no examples", matching the row format that
            # downstream cells already handle.
            if 'example' in definition:
                examples = [_first_bracketed(s) for s in definition['example']]
            else:
                examples = ''

            # The source data spells the key 'antonyms'; the misspelled
            # 'anotonyms' is still accepted as a fallback. The output column
            # keeps the historical name 'anotonyms' so later cells still work.
            antonyms_raw = definition.get('antonyms', definition.get('anotonyms'))
            anotonyms = antonyms_raw.split(',') if antonyms_raw is not None else []

            synonyms_raw = definition.get('synonyms')
            synonyms = synonyms_raw.split(',') if synonyms_raw is not None else []

            rows.append({'lemma_text': title,
                         'sense_idx': sense_idx,
                         'definition': definition.get('def', ''),
                         'synonyms': synonyms,
                         'anotonyms': anotonyms,
                         'quote': quote,
                         'examples': examples,
                         'POS': definition.get('type', ''),
                         'bopomofo': bopomofo,
                         'pinyin': pinyin,
                         'non_radical_stroke_count': non_radical_stroke_count,
                         'radical': radical,
                         'stroke_count': stroke_count})
    return rows

In [12]:
# Flatten every dictionary entry into per-sense rows, accumulated in one list.
rows = []
for d in data:
    rows.extend(extract_info(d))

In [13]:
# Assemble the DataFrame from the row dicts; the explicit column list fixes
# the column order.
df = pd.DataFrame(
    rows,
    columns=[
        'lemma_text', 'sense_idx', 'definition',
        'synonyms', 'anotonyms',
        'quote', 'examples',
        'POS', 'bopomofo', 'pinyin',
        'non_radical_stroke_count', 'radical', 'stroke_count',
    ],
)

print(len(df))

213440


In [14]:
# combine quotes & examples into []
## only examples -> [examples], only quotes -> [quote]

def merge_quotes_and_examples(row):
    """Combine a row's examples and quote into a single list.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'quote'`` and ``'examples'`` (for instance a
        DataFrame row passed by ``df.apply(..., axis=1)``). Each value may be
        a string, a list of strings, or empty (``''`` / ``[]`` / ``None``).

    Returns
    -------
    list
        The examples followed by the quote(s). Always a new list; empty when
        both fields are empty.
    """
    def _as_list(value):
        # Normalise '' / [] / None -> [], a bare string -> [string], and any
        # other sequence -> a fresh list, so every branch returns a list.
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    return _as_list(row['examples']) + _as_list(row['quote'])

213440


In [None]:
# Build the merged column by passing each row (axis=1) to the merge function.
df['examples_quote'] = df.apply(merge_quotes_and_examples, axis=1)

# The two source columns are now redundant; remove them from df in place.
df.drop(['quote', 'examples'], axis=1, inplace=True)

print(len(df))

In [21]:
df.head()

Unnamed: 0,lemma_text,sense_idx,definition,synonyms,anotonyms,POS,bopomofo,pinyin,non_radical_stroke_count,radical,stroke_count,examples_quote
0,{[8e40]},1,「籮」的異體字。,[],[],,,,8,竹,14,[]
1,{[8e41]},1,「鑼」的異體字。,[],[],,,,8,金,16,[]
2,{[8e43]},1,「盧」的異體字。,[],[],,,,11,皿,16,[]
3,{[8e44]},1,「廬」的異體字。,[],[],,,,4,广,7,[]
4,{[8e45]},1,「爐」的異體字。,[],[],,,,4,火,8,[]


Optional filtering

In [None]:
# Index labels of rows whose lemma_text contains a brace, a digit or an ASCII letter.
index = df.index[df['lemma_text'].str.contains(r'[\{\}\da-zA-Z]')]
# Not displayed, because it is not the last expression in the cell.
len(index)

# Remove those rows from df in place, then report how many rows remain.
df.drop(index=index, inplace=True)
print(len(df))

In [None]:
 # save to csv
# Write the flattened table to moe_flat.csv in the current working directory,
# without the index column.
# NOTE(review): the list-valued columns (synonyms, anotonyms, examples_quote)
# will presumably be stored as their string representation -- confirm that
# consumers parse them back into lists.
df.to_csv('moe_flat.csv', encoding='utf-8',index=False)