In [1]:
import os
import re

import pandas as pd

from anki.collection import Collection
from jinja2 import Environment, FileSystemLoader
from IPython.display import display, HTML
from pprint import pprint

In [2]:
# Create the Anki collection object from the path stored in the
# PATH_TO_COLLECTION environment variable.
# Indexing os.environ (instead of .get) raises a KeyError that names the
# variable when it is unset, rather than passing None on to Collection.
path_to_collection = os.environ["PATH_TO_COLLECTION"]
col = Collection(path_to_collection)

In [3]:
# Jinja setup: load the furigana template from the 'templates' directory
loader = FileSystemLoader("templates")
env = Environment(loader=loader)
furigana_template = env.get_template("html-furigana.jinja")


In [4]:
def make_note_from_row(row, note_model, note_array):
    """Build a note from one row and flag whether the row is new.

    A note is created with ``col.new_note(note_model)`` (``col`` is the
    module-level collection). Every entry of ``row`` except ``'is_new'`` is
    copied into the note field of the same name as stripped text. Missing
    values (None / NaN / NA) are stored as an empty string instead of the
    literal text ``'nan'``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) keyed by note field name.
        note_model: Note type handed to ``col.new_note``.
        note_array: List collecting the notes flagged as new; mutated in place.

    Returns:
        The same ``row`` object with ``row['is_new']`` set to True when
        ``note.duplicate_or_empty()`` returned a falsy value, else False.
    """
    note = col.new_note(note_model)

    for field_name, value in row.items():
        if field_name == 'is_new':
            continue

        # str(nan) is 'nan'; writing that into a field would also hide an
        # empty note from the duplicate/empty check below.
        is_missing = pd.api.types.is_scalar(value) and pd.isna(value)
        note[field_name] = '' if is_missing else str(value).strip()

    row['is_new'] = not bool(note.duplicate_or_empty())
    if row['is_new']:
        note_array.append(note)

    return row

In [5]:
def plain_text_to_html_kanji(match, target_word_pattern, template):
    """Render one regex match through ``template``.

    Args:
        match: ``re.Match`` whose groups are passed to the template as ``group``.
        target_word_pattern: Regex applied with ``re.match`` to the full
            matched text. An empty pattern means "there is no target word".
        template: Object exposing ``render(group=..., is_target_word=...)``.

    Returns:
        Whatever ``template.render`` returns, with ``is_target_word`` set to
        True only when the matched text matches ``target_word_pattern``.
    """
    # re.match('', text) succeeds for every text, so an empty pattern must be
    # rejected explicitly or every match would be flagged as the target word.
    is_target_word = (
        bool(target_word_pattern)
        and re.match(target_word_pattern, match.group(0)) is not None
    )
    return template.render(group=match.groups(), is_target_word=is_target_word)

In [6]:
def model_furigana_series_from_template(
        row,
        template,
        kanji_text_pattern,
        sentence_col_name,
        base_col_name,
        output_col_name):
    """Generate the furigana markup for one row's sample sentence.

    Every match of ``kanji_text_pattern`` in the sentence is replaced by the
    result of ``plain_text_to_html_kanji``, which also receives a pattern
    built from the kanji of the target word. Newlines and tabs are removed
    from the result before it is stored.

    Args:
        row: Mapping-like row (e.g. a pandas Series).
        template: Template object forwarded to ``plain_text_to_html_kanji``.
        kanji_text_pattern: Regex selecting the spans of the sentence to replace.
        sentence_col_name: Key of the sample sentence in ``row``.
        base_col_name: Key of the target word in ``row``.
        output_col_name: Key under which the generated markup is stored.

    Returns:
        ``row``, unchanged when the sentence is missing, otherwise with
        ``row[output_col_name]`` set.
    """
    # If the row does not have a sample sentence yet, skip it.
    if pd.isna(row[sentence_col_name]):
        return row

    # Strip blank space from the sample sentence and the target word.
    # A missing target word is treated as "no target" instead of crashing
    # on float.strip().
    sentence_string = row[sentence_col_name].strip()
    raw_target_word = row[base_col_name]
    target_word = '' if pd.isna(raw_target_word) else str(raw_target_word).strip()

    # Collect the kanji of the target word. The iteration mark 々 is included
    # so that words such as 人々 can match.
    kanji_pattern = r'[一-龯々]+'
    target_kanjis = re.findall(kanji_pattern, target_word)

    # '(?!)' never matches. An empty pattern would match every string and
    # mark all kanji as belonging to the target word.
    target_word_pattern = r'(?!)'

    if target_kanjis:
        # Joined outside the f-string: reusing the same quote character inside
        # a replacement field is a syntax error before Python 3.12.
        kanji_chars = ''.join(target_kanjis)
        target_word_pattern = rf'([{kanji_chars}]+)[（\(]([ぁ-ん]+?)[）\)]([ぁ-ん]*)'

    # Replace every kanji span using the html template.
    sentence_string = re.sub(
        kanji_text_pattern,
        lambda match: plain_text_to_html_kanji(match, target_word_pattern, template),
        sentence_string
    )
    row[output_col_name] = re.sub(r'[\n\t]', '', sentence_string)
    return row

In [7]:
# Get reading deck references.
# os.environ[...] raises a KeyError naming the variable when it is unset.
reading_deck_id = int(os.environ["READING_DECK_ID"])
reading_model_id = os.environ["READING_MODEL_ID"]
reading_model = col.models.get(reading_model_id)
if reading_model is None:
    raise LookupError(f"No note type found for READING_MODEL_ID={reading_model_id!r}")
reading_cols = [field['name'] for field in reading_model['flds']]

# Every field except the last one is read from the clipboard; the last one
# ('generated-furigana') is produced further down.
clipboard_cols = reading_cols[:-1]

# Read clipboard data.
# NOTE(review): presumably the clipboard holds a '|'-delimited (Markdown-style)
# table, so column 0 is the empty text before each leading '|' and the first
# two lines are the header and its separator — confirm against the prompt output.
try:
    reading_data = pd.read_clipboard(
        sep='|',
        # One clipboard column per name, derived from the field list
        usecols=list(range(1, len(clipboard_cols) + 1)),
        skiprows=2,
        names=clipboard_cols)
except pd.errors.EmptyDataError:
    # Nothing to parse: continue with an empty table
    reading_data = pd.DataFrame(columns=clipboard_cols)

# Creating and formatting new anki notes
new_notes_in_clipboard_data = []
kanji_text_pattern = r'([一-龯々]+)[（\(]([ぁ-ん]+)[）\)]*([ptz-ん]*)'.replace('ptz', 'ぁ')

if reading_data.empty:
    # DataFrame.apply adds no columns to an empty frame, so create 'is_new'
    # explicitly to keep the reports below working.
    print('\nNo rows were read from the clipboard')
    reading_data = (
        reading_data
            .assign(is_new=pd.Series(dtype=bool))
            .astype({'is_new': bool})
        )
else:
    reading_data = (
        reading_data
            .apply(
                model_furigana_series_from_template,
                axis='columns',
                template=furigana_template,
                kanji_text_pattern=kanji_text_pattern,
                sentence_col_name='word-in-context',
                base_col_name='target-word',
                output_col_name='generated-furigana')
            .apply(
                make_note_from_row,
                axis='columns',
                note_model=reading_model,
                note_array=new_notes_in_clipboard_data)
        )

print('\nNew notes')
display(reading_data.loc[reading_data['is_new']])

print('\nDuplicate or empty notes')
display(reading_data.loc[~reading_data['is_new']])



New notes


Unnamed: 0,target-word,word-meaning,word-in-context,context-translation,generated-furigana,is_new
0,馬（うま）,Horse,彼(かれ)は競馬(けいば)で速(はや)い馬(うま)に賭(か)けて、勝(か)ちました。,He bet on a fast horse at the horse race and ...,"<ruby class=""kanji"">彼 <rp>(</rp...",True
1,馬力（ばりき）,Horsepower,この車(くるま)は500馬力(ばりき)のエンジンを搭載(とうさい)しています。,This car is equipped with a 500-horsepower en...,"この <ruby class=""kanji"">車 <rp>(</...",True
2,馬車（ばしゃ）,Carriage,観光客(かんこうきゃく)は街(まち)を馬車(ばしゃ)でゆっくりと回(まわ)りました。,The tourists slowly toured the city by carria...,"<ruby class=""kanji"">観光客 <rp>(</...",True
3,鳥（とり）,Bird,朝(あさ)になると、庭(にわ)で美(うつく)しい鳥(とり)の声(こえ)が聞(き)こえます。,"In the morning, I can hear the beautiful soun...","<ruby class=""kanji"">朝 <rp>(</rp...",True
4,焼き鳥（やきとり）,Grilled chicken skewers,仕事(しごと)帰(がえ)りに、焼(や)き鳥(とり)を食(た)べに行(い)きませんか？,Shall we go eat grilled chicken skewers after...,"<ruby class=""kanji"">仕事 <rp>(</r...",True
5,白鳥（はくちょう）,Swan,湖(みずうみ)には白鳥(はくちょう)が優雅(ゆうが)に泳(およ)いでいました。,Swans were swimming gracefully on the lake.,"<ruby class=""kanji"">湖 <rp>(</rp...",True
6,野鳥（やちょう）,Wild bird,彼(かれ)は趣味(しゅみ)として野鳥(やちょう)の観察(かんさつ)をしています。,He observes wild birds as a hobby.,"<ruby class=""kanji"">彼 <rp>(</rp...",True



Duplicate or empty notes


Unnamed: 0,target-word,word-meaning,word-in-context,context-translation,generated-furigana,is_new


In [8]:
# Prompt the user for confirmation; the typed answer (a string) is kept in input_ok.
input_ok = input("Are new inputs ok?")

In [9]:
# Add the collected notes to the reading deck only after an explicit "yes".
# A plain truthiness test would accept any non-empty answer, including "no".
# str() lets this also work if input_ok already holds a boolean.
if str(input_ok).strip().lower() in {"y", "yes", "ok", "true", "1"}:
    for note in new_notes_in_clipboard_data:
        col.add_note(note, reading_deck_id)
else:
    print("No notes were added: answer y / yes / ok to confirm.")

In [10]:
# Position of the row to inspect; the HTML preview further down reads i as well
i = 6
display(
    reading_data['word-in-context'].iloc[i],
    reading_data['context-translation'].iloc[i],
)

' 彼(かれ)は趣味(しゅみ)として野鳥(やちょう)の観察(かんさつ)をしています。 '

' He observes wild birds as a hobby. '

In [11]:
# Preview the generated furigana markup of row i inside an <h1>, preceded by an
# inline <style> block that colours the target_kanji / target_furigana /
# target_okurigana classes.
# NOTE(review): those classes are presumably emitted by the furigana template
# for the target word — confirm against the template file.
HTML("""
     <style>
     .target_kanji {
        color: orange;
     }

     .target_furigana {
        color: cyan;
     }

     .target_okurigana {
        color: violet;
     }
     
     </style>""" +'<h1>'+reading_data['generated-furigana'].iloc[i]+'</h1>')