# Learn Japanese with Sentence Segmentation

In [1]:
import numpy as np
import pandas as pd
import collections
import json

import neologdn  # japanese text normalization

In [2]:
# Location of the sentence corpus (a single parquet shard on the Hugging Face Hub).
# NOTE(review): the hf:// scheme presumably needs huggingface_hub/fsspec installed - confirm.
DATASET_URL = "hf://datasets/if001/elementray_m/data/train-00000-of-00001.parquet"

df = pd.read_parquet(DATASET_URL)

### Segmentation Function: Based on Hiragana boundaries

In [4]:
def is_hiragana(c) -> bool:
    """Return True when `c` sorts inside the range U+3040..U+309F (inclusive).

    The check is a plain string comparison, so it is meant to be called
    with a single character.
    """
    first, last = "\u3040", "\u309F"
    return not (c < first or c > last)

def is_katakana(c) -> bool:
    """Return True when `c` sorts inside the range U+30A0..U+30FF (inclusive).

    The check is a plain string comparison, so it is meant to be called
    with a single character.
    """
    first, last = "\u30A0", "\u30FF"
    return not (c < first or c > last)

def segmentation(sentence):
    """Split a sentence into (word, inflection) pairs at hiragana boundaries.

    The sentence is scanned from its end towards its start. Each step first
    collects a run of hiragana (the "inflection") and then the run of
    non-hiragana characters in front of it (the "word").

    The marks '、' and '。' are dropped from words. They do not end a word,
    so non-hiragana text on both sides of such a mark is joined into one word.

    Returns a list of (word, inflection) tuples in sentence order. Either
    element may be an empty string; in particular a sentence that ends with
    one of the punctuation marks produces a final ('', '') pair.
    """
    punctuation = ('、', '。')
    reversed_chars = sentence[::-1]
    total = len(reversed_chars)

    words = collections.deque()
    inflections = collections.deque()

    pos = 0
    while pos < total:
        # Hiragana run (collected right-to-left, restored to reading order below).
        kana_run = []
        while pos < total and is_hiragana(reversed_chars[pos]):
            kana_run.append(reversed_chars[pos])
            pos += 1
        inflections.appendleft("".join(reversed(kana_run)))

        # Non-hiragana run in front of it, with punctuation filtered out.
        stem_run = []
        while pos < total and not is_hiragana(reversed_chars[pos]):
            current = reversed_chars[pos]
            pos += 1
            if current not in punctuation:
                stem_run.append(current)
        words.appendleft("".join(reversed(stem_run)))

    return list(zip(words, inflections))

### JMdict Japanese to English Translations
<a href=https://github.com/scriptin/jmdict-simplified>JSON data source</a>

In [6]:
# JMdict (English glosses) exported as JSON by the jmdict-simplified project.
JMDICT_PATH = "jmdict-eng-3.6.1.json"

with open(JMDICT_PATH, encoding="utf-8") as jmdict_file:
    jmdict = json.load(jmdict_file)

# Quick sanity check that the dictionary loaded.
print("Total entries:", len(jmdict["words"]))

Total entries: 213165


In [7]:
def lookup(word, data, threshold=0.7):
    """Find every dictionary entry that lists `word` exactly as a kanji or kana form.

    Parameters:
        word: the text to search for (exact match only).
        data: dictionary mapping; its "words" list is scanned entry by entry.
        threshold: accepted but not used by this function.

    Returns:
        A list with one dict per matching entry, holding
        "kanji"   - all kanji spellings of the entry,
        "kana"    - all kana readings of the entry,
        "glosses" - the text of every gloss whose "lang" is "eng".
        The list is empty when nothing matches.
    """
    def form_texts(entry, field):
        # Text of each form under `field`, ignoring forms without a "text" key.
        return [form["text"] for form in entry.get(field, []) if "text" in form]

    matches = []
    for entry in data.get("words", []):
        kanji_forms = form_texts(entry, "kanji")
        kana_forms = form_texts(entry, "kana")

        if word not in kanji_forms and word not in kana_forms:
            continue

        english = [
            gloss.get("text")
            for sense in entry.get("sense", [])
            for gloss in sense.get("gloss", [])
            if gloss.get("lang") == "eng"
        ]
        matches.append({
            "kanji": kanji_forms,
            "kana": kana_forms,
            "glosses": english
        })

    return matches

## Run Model

### Take Sample and Normalize Using neologdn
<a href=https://github.com/ikegami-yukino/neologdn/blob/master/README.md>Neologdn Documentation</a>

In [21]:
# Draw one random sentence (first column of a randomly chosen row).
sample_raw = df.sample().values[0][0]

# Keep the normalized text: every later cell reads `sample`, so the result of
# normalize() has to be assigned, not just displayed.
sample = neologdn.normalize(sample_raw)
sample

'彼女は料理の準備をしている。'

### Segment

In [22]:
# segmentation() emits an empty ('', '') pair when the sentence ends with
# punctuation. Drop only that placeholder instead of blindly cutting the last
# element, so sentences without a final '。' keep their last segment.
segments = [pair for pair in segmentation(sample) if any(pair)]
print(segments)

[('彼女', 'は'), ('料理', 'の'), ('準備', 'をしている')]


### Run Kanji Word Translations

In [37]:
print("="*50)
print(f"Sentence: {sample}")
print("="*50)

for word, inflection in segments:
    # Nothing to translate for hiragana-only segments; katakana words are skipped.
    if not word or is_katakana(word[0]):
        continue

    entries = lookup(word, jmdict)

    # The hiragana run may contain more than just inflection (e.g. part of the
    # dictionary form), so extend the word one hiragana at a time until the
    # dictionary knows it.
    consumed = 0
    while not entries and consumed < len(inflection):
        consumed += 1
        entries = lookup(word + inflection[:consumed], jmdict)

    # Only move hiragana into the word when that actually produced a match;
    # otherwise keep the original split instead of gluing everything together.
    if entries:
        word, inflection = word + inflection[:consumed], inflection[consumed:]

    print()
    print("="*50)
    print(f"Segment: {word} | {inflection}")

    if not entries:
        print("No dictionary entry found.")

    for e in entries:
        # Entries can lack a kanji spelling (kana-only words); avoid IndexError.
        kanji = e["kanji"][0] if e["kanji"] else "-"
        kana = e["kana"][0] if e["kana"] else "-"
        print(f"Kanji: {kanji} ({kana})")
        print("Meanings:", end="")
        for gloss in e["glosses"]:
            print(gloss, end=" | ")
        print()
    print("="*50)

Sentence: 彼女は料理の準備をしている。

Segment: 彼女 | は
Kanji: 彼女 (かのじょ)
Meanings:she | her | girlfriend | 

Segment: 料理 | の
Kanji: 料理 (りょうり)
Meanings:cooking | cookery | cuisine | food | dish | dealing with (skillfully) | handling (well) | managing | 

Segment: 準備 | をしている
Kanji: 準備 (じゅんび)
Meanings:preparation | arrangements | getting ready | provision | setup | reserving | 
