In [24]:
import json
import sqlite3
import pandas as pd
import numpy as np

Download the latest release from [jmdict-simplified](https://github.com/scriptin/jmdict-simplified/releases).

In [2]:
# File name of the jmdict-simplified JSON release that the next cell opens.
filename = "jmdict-eng-3.6.1.json"

In [3]:
# Parse the whole dictionary JSON file into memory as `dict_data`.
with open(filename, encoding="utf-8") as json_file:
    dict_data = json.load(json_file)

In [4]:
# Top-level keys of the loaded dictionary JSON.
dict_data.keys()

dict_keys(['version', 'languages', 'commonOnly', 'dictDate', 'dictRevisions', 'tags', 'words'])

In [5]:
# Number of entries under "words".
len(dict_data["words"])

211090

In [6]:
# Version and date recorded in the loaded file.
dict_data["version"], dict_data["dictDate"]

('3.6.1', '2025-03-03')

## jlpt

Download the JLPT data from [yomitan-jlpt-vocab](https://github.com/stephenmk/yomitan-jlpt-vocab).

In [46]:
# Load the JLPT vocabulary lists, one CSV per level (n5.csv ... n1.csv).
# Each file is parsed once; its row count is printed as a sanity check and its
# "kanji" / "kana" columns are kept as sets for fast membership tests later.
jlpt = {}
for n in range(5, 0, -1):
    level_frame = pd.read_csv(f"n{n}.csv")
    print(n, len(level_frame))
    jlpt[f"n{n}"] = {
        # dropna(): any blank cell is parsed as NaN, which can never equal a
        # headword string, so it is left out of the lookup sets.
        "kanji": set(level_frame["kanji"].dropna().to_list()),
        "kana": set(level_frame["kana"].dropna().to_list()),
    }

5 684
4 640
3 1730
2 1812
1 3427


## To DB

create index from token/word -> dictionary entry

In [63]:
# Map every kanji/kana spelling to the positions (in dict_data["words"]) of
# the entries that list it.
index = {}
for entry_pos, entry in enumerate(dict_data["words"]):
    spellings = [form["text"] for form in entry["kanji"] + entry["kana"]]
    for spelling in spellings:
        if spelling not in index:
            index[spelling] = []
        index[spelling].append(entry_pos)

In [64]:
# Number of distinct kanji/kana spellings used as index keys.
len(index)

444913

write jlpt, index, and dictionary data

In [65]:
# Open (or create) the SQLite database file and get a cursor for the
# statements in the following cells.
conn = sqlite3.connect('gari.db')
cursor = conn.cursor()

In [66]:
# Key/value table for dictionary metadata; `key` is the primary key.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS DictionaryMeta (
        key TEXT PRIMARY KEY,
        value TEXT
    );
''')
# One row per dictionary entry: `data` is a TEXT column, `jlpt` and
# `isCommon` are integers that default to 0.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS Dictionary (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        data TEXT,
        jlpt INTEGER DEFAULT 0,
        isCommon INTEGER DEFAULT 0
    );
''')
# Lookup table pairing a word with a Dictionary id; the (word, dicId) pair is
# the primary key.
# NOTE(review): SQLite enforces FOREIGN KEY / ON DELETE CASCADE only when
# `PRAGMA foreign_keys = ON` has been issued on the connection; no such pragma
# appears in the visible cells - confirm whether it is needed.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS DictionaryIndex (
        word TEXT,
        dicId INTEGER,
        
        PRIMARY KEY(word, dicId),
        FOREIGN KEY(dicId) REFERENCES Dictionary(id) ON DELETE CASCADE
    );
''')

<sqlite3.Cursor at 0x1b283f102c0>

insert meta data

In [67]:
# Upsert the dictionary version and date into the metadata table.
meta_rows = [
    ("version", dict_data["version"]),
    ("date", dict_data["dictDate"]),
]
cursor.executemany('''
    INSERT OR REPLACE INTO DictionaryMeta (key, value)
    VALUES (?, ?);
''', meta_rows)

<sqlite3.Cursor at 0x1b283f102c0>

Insert the words into the dictionary table, keeping each entry as a JSON object. Add the JLPT level and isCommon flag (for sorting).

In [68]:
def _dictionary_rows(words, jlpt_sets):
    """Yield one ``(id, data, jlpt, isCommon)`` row per dictionary entry.

    The headword is the first kanji form, or the first kana form when the
    entry has no kanji forms.

    id       -- position of the entry in ``words``
    data     -- the entry serialised as JSON (non-ASCII characters kept as-is)
    jlpt     -- largest n in 1..5 whose ``jlpt_sets[f"n{n}"]`` set (kanji or
                kana, matching the headword kind) contains the headword text;
                0 when no level lists it
    isCommon -- the headword's ``common`` flag
    """
    for entry_id, entry in enumerate(words):
        form_kind = "kanji" if entry["kanji"] else "kana"
        headword = entry[form_kind][0]
        level = max(
            (
                n
                for n in range(1, 6)
                if headword["text"] in jlpt_sets[f"n{n}"][form_kind]
            ),
            default=0,
        )
        yield entry_id, json.dumps(entry, ensure_ascii=False), level, headword["common"]


def _insert_dictionary(db_cursor, words, jlpt_sets):
    """Insert every entry of ``words`` into the Dictionary table in one batch."""
    # NOTE(review): INSERT OR IGNORE keeps any row that already has the same
    # id, whereas the metadata cell uses INSERT OR REPLACE - confirm this is
    # intended when rebuilding into an existing database file.
    db_cursor.executemany('''
        INSERT OR IGNORE INTO Dictionary (id, data, jlpt, isCommon)
        VALUES (?, ?, ?, ?);
    ''', _dictionary_rows(words, jlpt_sets))


_insert_dictionary(cursor, dict_data["words"], jlpt)
conn.commit()

insert dictionary index

In [69]:
# Flatten the spelling -> entry-id lists into (word, dicId) pairs and insert
# them one by one; pairs that already exist are ignored.
index_rows = (
    (spelling, entry_id)
    for spelling, entry_ids in index.items()
    for entry_id in entry_ids
)
for index_row in index_rows:
    cursor.execute(
        '''
            INSERT OR IGNORE INTO DictionaryIndex (word, dicId)
            VALUES (?, ?);
        ''',
        index_row,
    )
conn.commit()

In [70]:
# Close the connection; the Dictionary and DictionaryIndex inserts were
# committed in their own cells.
conn.close()

print("Database successfully created with dictionary entries.")

Database successfully created with dictionary entries.


In [33]:
# Entries that have more than one kanji form or more than one kana reading.
# Built in this cell because the filter below reads it: the notebook's only
# other definition sits in a later cell, so a top-to-bottom run would raise
# NameError here.
words_multi_reading = [
    word
    for word in dict_data["words"]
    if len(word["kanji"]) > 1 or len(word["kana"]) > 1
]

# Keep only the entries in which every reading names the specific kanji forms
# it applies to, i.e. no reading has an empty list or starts with the '*'
# wildcard.
words_applies_to_kanji = [
    word
    for word in words_multi_reading
    if all(
        len(kana["appliesToKanji"]) != 0 and kana["appliesToKanji"][0] != '*'
        for kana in word["kana"]
    )
]

In [34]:
# Size of the filtered list versus the multi-form list it was drawn from.
len(words_applies_to_kanji), len(words_multi_reading)

(1631, 63131)

In [35]:
# Preview the first few matches; displaying the whole list would dump all
# 1631 entries into the cell output.
words_applies_to_kanji[:5]

[{'id': '1000110',
  'kanji': [{'common': True, 'text': 'ＣＤプレーヤー', 'tags': []},
   {'common': False, 'text': 'ＣＤプレイヤー', 'tags': []}],
  'kana': [{'common': True,
    'text': 'シーディープレーヤー',
    'tags': [],
    'appliesToKanji': ['ＣＤプレーヤー']},
   {'common': False,
    'text': 'シーディープレイヤー',
    'tags': [],
    'appliesToKanji': ['ＣＤプレイヤー']}],
  'sense': [{'partOfSpeech': ['n'],
    'appliesToKanji': ['*'],
    'appliesToKana': ['*'],
    'related': [],
    'antonym': [],
    'field': [],
    'dialect': [],
    'misc': [],
    'info': [],
    'languageSource': [],
    'gloss': [{'lang': 'eng',
      'gender': None,
      'type': None,
      'text': 'CD player'}]}]},
 {'id': '1001510',
  'kanji': [{'common': False, 'text': '御襁褓気触れ', 'tags': []},
   {'common': False, 'text': 'お襁褓気触れ', 'tags': []},
   {'common': False, 'text': 'オムツ気触れ', 'tags': []}],
  'kana': [{'common': False,
    'text': 'おむつかぶれ',
    'tags': [],
    'appliesToKanji': ['御襁褓気触れ', 'お襁褓気触れ']},
   {'common': False,
    'text': '

In [30]:
# Preview the first 100 multi-form entries.
words_multi_reading[:100]

[{'id': '1000040',
  'kanji': [{'common': False, 'text': '〃', 'tags': []}],
  'kana': [{'common': False,
    'text': 'おなじ',
    'tags': [],
    'appliesToKanji': ['*']},
   {'common': False, 'text': 'おなじく', 'tags': [], 'appliesToKanji': ['*']}],
  'sense': [{'partOfSpeech': ['n'],
    'appliesToKanji': ['*'],
    'appliesToKana': ['*'],
    'related': [],
    'antonym': [],
    'field': [],
    'dialect': [],
    'misc': [],
    'info': [],
    'languageSource': [],
    'gloss': [{'lang': 'eng',
      'gender': None,
      'type': None,
      'text': 'ditto mark'}]}]},
 {'id': '1000060',
  'kanji': [{'common': False, 'text': '々', 'tags': []}],
  'kana': [{'common': False,
    'text': 'のま',
    'tags': [],
    'appliesToKanji': ['*']},
   {'common': False, 'text': 'ノマ', 'tags': [], 'appliesToKanji': []}],
  'sense': [{'partOfSpeech': ['unc'],
    'appliesToKanji': ['*'],
    'appliesToKana': ['*'],
    'related': [['同の字点']],
    'antonym': [],
    'field': [],
    'dialect': [],
    'mi

In [36]:
# Entries in which every kana and kanji form carries at least one tag.
words_tags = [
    entry
    for entry in dict_data["words"]
    if all(len(form["tags"]) != 0 for form in entry["kana"] + entry["kanji"])
]

In [45]:
# Look up the description of the 'sK' tag in the dictionary's tag table.
dict_data["tags"]['sK']

'search-only kanji form'

In [50]:
# Inspect one of the fully tagged entries.
words_tags[5]

{'id': '2009310',
 'kanji': [{'common': False, 'text': '泥濘む', 'tags': ['rK']}],
 'kana': [{'common': False,
   'text': 'ぬかるむ',
   'tags': ['gikun'],
   'appliesToKanji': ['*']}],
 'sense': [{'partOfSpeech': ['v5m', 'vi'],
   'appliesToKanji': ['*'],
   'appliesToKana': ['*'],
   'related': [],
   'antonym': [],
   'field': [],
   'dialect': [],
   'misc': ['uk'],
   'info': [],
   'languageSource': [],
   'gloss': [{'lang': 'eng',
     'gender': None,
     'type': None,
     'text': 'to be muddy'},
    {'lang': 'eng', 'gender': None, 'type': None, 'text': 'to be slushy'}]}]}

In [8]:
# Entries with more than one kanji form or more than one kana reading.
words_multi_reading = [
    entry
    for entry in dict_data["words"]
    if max(len(entry["kanji"]), len(entry["kana"])) > 1
]