In [1]:
%%capture --no-stderr
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): -U upgrades to the latest releases and no versions are pinned,
# so results may differ between runs — consider pinning.
%pip install --quiet -U langchain_core langgraph langchain_openai openpyxl pandas IPython

# First, we need to set the environment variables

This notebook requires `OPENAI_API_KEY` and `LANGSMITH_API_KEY`. If either one is not already set as an environment variable, you will be prompted to enter it.

Also, you need to choose the model to use for the AI.

In [2]:
import os, getpass
from langchain_openai import ChatOpenAI


def _set_env(var: str):
    """Make sure the environment variable `var` has a value.

    When `var` is already set to a non-empty value it is left untouched.
    Otherwise the value is requested through a getpass prompt labelled
    "<var>: " and stored in os.environ.
    """
    if os.environ.get(var):
        return
    os.environ[var] = getpass.getpass(f"{var}: ")

# Prompt for each API key that is not already present in the environment.
_set_env("OPENAI_API_KEY")
_set_env("LANGSMITH_API_KEY")
# LangSmith settings, supplied through environment variables (tracing flag and project name).
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "anki_deck_generator"

# Chat model client used by the generation node; change `model` here to use a different model.
gen_data_ai_chat = ChatOpenAI(model="gpt-4.1", temperature=0)
# Second client with the same settings.
# NOTE(review): not referenced by the visible code — presumably reserved for the content-check node; confirm.
check_data_ai_chat = ChatOpenAI(model="gpt-4.1", temperature=0)

# Second, we need to define the data structure

Define the fields of each Anki card in the `data_structure` variable, which maps each field name to a description of what that field should contain:

```python
data_structure = {
    "field_name_1": "field_1_description",
    "field_name_2": "field_2_description",
    "field_name_3": "field_3_description"
}
```

Example:

```python
data_structure = {
    "vocabulary": "The word to be learned",
    "meaning_en": "The meaning of the word in English",
    "meaning_vi": "The meaning of the word in Vietnamese",
    "hiragana": "The hiragana of the word",
    "kanji_vi": "Explain the kanji of the word in Vietnamese as they are related to Chinese characters",
    "sentence": "The sentence using the word",
    "sentence_translation_en": "The translation of the sentence in English",
    "sentence_translation_vi": "The translation of the sentence in Vietnamese",
    "note": "Any additional information about the word"
}
```

You should also define a `data_structure_example` variable to show how the data structure should be filled:

```python
data_structure_example = {
    "vocabulary": "正社員",
    "meaning_en": "Full-time employee",
    "meaning_vi": "Nhân viên chính thức",
    "hiragana": "せいしゃいん",
    "kanji_vi": "Chính Xã Viên\n正 (Chính): nghĩa là đúng đắn, chính thức, ngay thẳng.\n社 (Xã): nghĩa là công ty, tổ chức.\n員 (Viên): nghĩa là người, nhân viên.",
    "sentence": "私は正社員です。",
    "sentence_translation_en": "I am a full-time employee.",
    "sentence_translation_vi": "Tôi là nhân viên chính thức.",
    "note": "Common word for full-time employee, used in everyday conversation"
}
```

In [3]:
# Field name -> description of what the AI should put in that field.
# The keys are used as the required JSON keys in the generation prompt and as
# the CSV column order when the deck is exported.
data_structure = {
    "vocabulary": "The word to be learned",
    "meaning_en": "The meaning of the word in English",
    # Fixed: this field holds the Vietnamese meaning (it previously said "Japanese").
    "meaning_vi": "The meaning of the word in Vietnamese",
    "hiragana": "The hiragana of the word",
    "kanji_vi": "Explain the whole kanji of the word in Vietnamese, then each kanji individually in Vietnamese",
    "sentence": "The sentence using the word",
    "sentence_translation_en": "The translation of the sentence in English",
    "sentence_translation_vi": "The translation of the sentence in Vietnamese",
    "note": "Any additional information about the word"
}

# One fully filled-in card, shown to the AI as the expected output format.
# It must use exactly the same keys as `data_structure`.
data_structure_example = {
    "vocabulary": "正社員",
    "meaning_en": "Full-time employee",
    "meaning_vi": "Nhân viên chính thức",
    # Fixed: the reading of 正社員 is せいしゃいん (it was previously misspelled).
    "hiragana": "せいしゃいん",
    "kanji_vi": "Chính Xã Viên\n正 (Chính): nghĩa là đúng đắn, chính thức, ngay thẳng.\n社 (Xã): nghĩa là công ty, tổ chức.\n員 (Viên): nghĩa là người, nhân viên.",
    "sentence": "私は正社員です。",
    "sentence_translation_en": "I am a full-time employee.",
    "sentence_translation_vi": "Tôi là nhân viên chính thức.",
    "note": "Common word for full-time employee, used in everyday conversation"
}

# Third, we need to list all the words to be learned, with notes for the AI, if any

You can/should generate this list using AI.

```python
vocabularies = ["word_without_note_1", "word_without_note_2", "word_without_note_3"]
vocabularies_with_notes = {
    "word_with_note_1": "note_to_ai_for_word_with_note_1",
    "word_with_note_2": "note_to_ai_for_word_with_note_2",
    "word_with_note_3": "note_to_ai_for_word_with_note_3"
}
```

Example:

```python
vocabularies = ["猫", "電車", "公園"] 
vocabularies_with_notes = {
    "勉強": "I am struggling to remember it, so please explain it in more detail with more examples",
    "学校": "This word is quite easy, so no need to explain it in detail",
    "食べ物": "", # Empty note is also fine
}
```

In [4]:
from typing import List, Dict

# Words that need no individual note; each one receives `default_note` when merged.
vocabularies: List[str] = [
    "端末", "一覧", "機能", "設定", "検索",
    "表示", "登録", "削除", "保存", "更新",
    "入力", "出力", "処理", "管理", "動作",
    "通信", "接続", "画面", "情報", "資料",
    "内容", "状態", "作業", "手順", "条件",
    "権限", "設定値", "変更", "合計", "制御",
]

# Instruction given to the AI for every word listed in `vocabularies`.
default_note = "Example should use IT context that clearly show the meaning of the word"

# Words with their own instruction for the AI (word -> note); empty by default.
vocabularies_with_notes: Dict[str, str] = {}

# Finally, let's verify the data you added

In [5]:
def verify_vocabularies_data(vocabularies: List[str], vocabularies_with_notes: Dict[str, str]) -> bool:
    """
    Check that the two vocabulary inputs have the expected container and element types.

    The first problem found is printed and ends the check; a success message
    is printed when nothing is wrong.

    Args:
        vocabularies: Expected to be a list whose items are all strings.
        vocabularies_with_notes: Expected to be a dict whose keys and values are all strings.

    Returns:
        bool: True when every check passes, False on the first failure or if
        an unexpected exception occurs while checking.
    """

    def _first_problem():
        # Return the message for the first violated rule, or None when all rules hold.
        if not isinstance(vocabularies, list):
            return "Error: vocabularies must be a list"
        if not isinstance(vocabularies_with_notes, dict):
            return "Error: vocabularies_with_notes must be a dictionary"

        for position, entry in enumerate(vocabularies):
            if not isinstance(entry, str):
                return f"Error: vocabularies[{position}] must be a string, got {type(entry)}"

        for key, text in vocabularies_with_notes.items():
            if not isinstance(key, str):
                return f"Error: Key in vocabularies_with_notes must be a string, got {type(key)}"
            if not isinstance(text, str):
                return f"Error: Value in vocabularies_with_notes must be a string, got {type(text)}"

        return None

    try:
        problem = _first_problem()
        if problem is not None:
            print(problem)
            return False

        print("✓ Data verification passed!")
        return True
    except Exception as e:
        print(f"Error during verification: {e}")
        return False

# Stop here with an AssertionError if the vocabulary inputs are malformed.
assert verify_vocabularies_data(vocabularies, vocabularies_with_notes)

✓ Data verification passed!


# All is good — now we can start generating the Anki deck

You can find the csv under `./output/anki_deck_data.csv`

In [6]:
# Fold the plain word list into the notes mapping.
# Words that already have their own note keep it; every other word gets `default_note`.
for word in vocabularies:
    vocabularies_with_notes.setdefault(word, default_note)

# Report the merged mapping so it can be checked before any AI call is made.
print(f"Total vocabularies after merging: {len(vocabularies_with_notes)}")
print("All vocabularies with notes:")
for word, note in vocabularies_with_notes.items():
    print(f"  {word}: {note}" if note else f"  {word}: (no note)")

print("\n✓ Vocabularies merged successfully!")


Total vocabularies after merging: 30
All vocabularies with notes:
  端末: Example should use IT context that clearly show the meaning of the word
  一覧: Example should use IT context that clearly show the meaning of the word
  機能: Example should use IT context that clearly show the meaning of the word
  設定: Example should use IT context that clearly show the meaning of the word
  検索: Example should use IT context that clearly show the meaning of the word
  表示: Example should use IT context that clearly show the meaning of the word
  登録: Example should use IT context that clearly show the meaning of the word
  削除: Example should use IT context that clearly show the meaning of the word
  保存: Example should use IT context that clearly show the meaning of the word
  更新: Example should use IT context that clearly show the meaning of the word
  入力: Example should use IT context that clearly show the meaning of the word
  出力: Example should use IT context that clearly show the meaning of the wor

In [7]:
# Define the data structure for the AI LangGraph

from typing import TypedDict

class AnkiDeckData(TypedDict):
    """State schema of the deck-generation graph (passed to StateGraph)."""
    # Vocabulary word -> note/instruction for the AI; supplied when the graph is invoked.
    vocabularies_with_notes: Dict[str, str] # The vocabularies with notes
    # One dict per generated word; written by the generation node, so it is
    # not part of the initial input.
    ai_gen_data: List[Dict[str, str]] # The data generated by the AI

In [8]:
# Define the graph

from IPython.display import Image, display
from langgraph.graph import StateGraph, START, END
import json

def _strip_json_code_fence(text: str) -> str:
    """Return `text` without a surrounding Markdown code fence (``` or ```json), if one is present."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (which may carry a language tag such as "json").
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def node_generate_anki_deck_data(deck_data: AnkiDeckData) -> AnkiDeckData:
    """
    Generate Anki deck data for each vocabulary word using AI.

    For every word in deck_data["vocabularies_with_notes"], a prompt is built
    from `data_structure` (required keys), `data_structure_example` (format
    example) and the word's note, and sent to `gen_data_ai_chat`. The reply is
    parsed as JSON and must be an object containing every key of
    `data_structure`. Each word is attempted up to three times; a word that
    still fails is skipped and reported, not raised.

    Args:
        deck_data: Graph state; only "vocabularies_with_notes" is read.

    Returns:
        The same state object with "ai_gen_data" set to the list of generated
        dicts (one per successfully processed word, in input order).
    """
    vocabularies_with_notes = deck_data["vocabularies_with_notes"]
    ai_gen_data = []
    failed_words = []
    max_retries = 3
    required_fields = list(data_structure.keys())

    # Built once, outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12.
    field_lines = "\n".join(
        f"- {field}: {description}" for field, description in data_structure.items()
    )
    # Show the example as real JSON (double quotes, readable non-ASCII text)
    # rather than a Python dict repr, since the model is asked for valid JSON.
    example_json = json.dumps(data_structure_example, ensure_ascii=False, indent=2)

    print(f"Generating data for {len(vocabularies_with_notes)} vocabulary words...")

    for i, (vocab_word, note) in enumerate(vocabularies_with_notes.items(), 1):
        print(f"Processing {i}/{len(vocabularies_with_notes)}: {vocab_word}")

        instructions = note if note else "No special instructions"

        # Create the prompt for AI
        prompt = f"""
You are a Japanese language expert helping to create Anki flashcard data. 
Generate comprehensive information for the Japanese word: {vocab_word}

Please provide the following information in JSON format with these exact keys:
{field_lines}

Special instructions for this word: {instructions}

Please respond with valid JSON only, no additional text.
Example format:
{example_json}
        """
        # Retry logic
        for attempt in range(max_retries):
            try:
                # Get AI response
                response = gen_data_ai_chat.invoke(prompt)
                # Models sometimes wrap JSON in a Markdown code fence despite the
                # instruction; remove it so json.loads does not fail on every retry.
                response_text = _strip_json_code_fence(response.content)
                word_data = json.loads(response_text)

                if not isinstance(word_data, dict):
                    raise ValueError(
                        f"Expected a JSON object for {vocab_word}, got {type(word_data).__name__}"
                    )

                # Validate that all required fields are present
                missing_fields = [field for field in required_fields if field not in word_data]

                if missing_fields:
                    raise ValueError(f"Missing fields for {vocab_word}: {missing_fields}")

                # Ensure vocabulary field matches the input word
                word_data["vocabulary"] = vocab_word

                ai_gen_data.append(word_data)
                print(f"✓ Successfully generated data for {vocab_word}")
                break

            except Exception as e:
                # Deliberately broad: any API, parsing or validation failure
                # triggers a retry instead of aborting the whole batch.
                print(f"Attempt {attempt + 1}/{max_retries} failed for {vocab_word}: {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying {vocab_word}...")
                else:
                    print(f"❌ Failed to generate data for {vocab_word} after {max_retries} attempts")
                    failed_words.append(vocab_word)

    print(f"\n✓ Generated data for {len(ai_gen_data)} vocabulary words")
    if failed_words:
        # Make skipped words visible in one place; they will be absent from the deck.
        print(f"❌ No data generated for {len(failed_words)} word(s): {', '.join(failed_words)}")

    # Update the deck data
    deck_data["ai_gen_data"] = ai_gen_data
    return deck_data

def node_check_data_structure(deck_data: AnkiDeckData) -> AnkiDeckData:
    """
    Validate the shape of every generated record.

    Each record in deck_data["ai_gen_data"] must contain every key of
    `data_structure`, and every value in the record must be a non-blank
    string. The first violation is printed and raised as ValueError.

    Returns:
        The state, unchanged, when every record passes.
    """
    for record in deck_data["ai_gen_data"]:
        # Every key declared in data_structure has to be present.
        absent = [name for name in data_structure.keys() if name not in record]
        if absent:
            print(f"❌ Missing fields in data: {absent}")
            print(f"Data: {record}")
            raise ValueError(f"Missing required fields: {absent}")

        # Every value in the record (declared or not) must be a non-blank string.
        for name, content in record.items():
            if not isinstance(content, str):
                print(f"❌ Field '{name}' is not a string: {type(content)}")
                raise ValueError(f"Field '{name}' must be a string, got {type(content)}")

            if not content.strip():
                print(f"❌ Field '{name}' is empty or contains only whitespace")
                raise ValueError(f"Field '{name}' cannot be empty")

        print(f"✓ Data structure validated for word: {record.get('vocabulary', 'Unknown')}")

    return deck_data

def node_check_data_content(deck_data: AnkiDeckData) -> AnkiDeckData:
    """
    Placeholder for using AI to verify the content of the generated data.

    Not implemented yet: the state is returned unchanged, so no content
    verification currently takes place.
    """
    # TODO: Implement this
    return deck_data

# Build the graph: a linear three-step pipeline over the AnkiDeckData state.

builder = StateGraph(AnkiDeckData)

# Register each step under the name used when wiring the edges below.
builder.add_node("generate_anki_deck_data", node_generate_anki_deck_data)
builder.add_node("check_data_structure", node_check_data_structure)
builder.add_node("check_data_content", node_check_data_content)

# Wire the steps in order: START -> generate -> structure check -> content check -> END.
builder.add_edge(START, "generate_anki_deck_data")
builder.add_edge("generate_anki_deck_data", "check_data_structure")
builder.add_edge("check_data_structure", "check_data_content")
builder.add_edge("check_data_content", END)

# Compile the builder into the runnable `graph` object invoked later.
graph = builder.compile()

# Display the graph (optional; left disabled).
# NOTE(review): the retry options appear to belong to draw_mermaid_png(), not display() — confirm before enabling.
# display(Image(graph.get_graph().draw_mermaid_png(max_retries=5, retry_delay=2.0)))

In [9]:
# Trigger the graph to generate the anki deck data.
# The generation node calls the chat model once per word (more on retries),
# one word after another, so this cell can take a while and uses API quota.

result = graph.invoke({"vocabularies_with_notes": vocabularies_with_notes})

Generating data for 30 vocabulary words...
Processing 1/30: 端末
✓ Successfully generated data for 端末
Processing 2/30: 一覧
✓ Successfully generated data for 一覧
Processing 3/30: 機能
✓ Successfully generated data for 機能
Processing 4/30: 設定
✓ Successfully generated data for 設定
Processing 5/30: 検索
✓ Successfully generated data for 検索
Processing 6/30: 表示
✓ Successfully generated data for 表示
Processing 7/30: 登録
✓ Successfully generated data for 登録
Processing 8/30: 削除
✓ Successfully generated data for 削除
Processing 9/30: 保存
✓ Successfully generated data for 保存
Processing 10/30: 更新
✓ Successfully generated data for 更新
Processing 11/30: 入力
✓ Successfully generated data for 入力
Processing 12/30: 出力
✓ Successfully generated data for 出力
Processing 13/30: 処理
✓ Successfully generated data for 処理
Processing 14/30: 管理
✓ Successfully generated data for 管理
Processing 15/30: 動作
✓ Successfully generated data for 動作
Processing 16/30: 通信
✓ Successfully generated data for 通信
Processing 17/30: 接続
✓ Successfully ge

In [10]:
# Export the generated data to CSV for Anki import
import pandas as pd
import os
import csv

# Ensure output directory exists
os.makedirs("./output", exist_ok=True)

# Convert the generated data to a pandas DataFrame
if result["ai_gen_data"]:
    df = pd.DataFrame(result["ai_gen_data"])

    # Reorder columns according to data_structure order
    # (any extra keys returned by the AI are dropped here).
    ordered_columns = list(data_structure.keys())
    df = df.reindex(columns=ordered_columns)

    # Replace newlines with <br> tags for better display in Anki, and drop
    # carriage returns so each card stays on a single CSV line.
    for col in df.columns:
        df[col] = df[col].str.replace('\n', '<br>', regex=False)
        df[col] = df[col].str.replace('\r', '', regex=False)

    # Save to CSV with every field quoted. Embedded double quotes are written
    # doubled ("") — the standard CSV convention. No escapechar is set: with one,
    # the writer would also backslash-escape backslashes in the data, which a
    # standard CSV reader does not undo.
    csv_path = "./output/anki_deck_data.csv"
    df.to_csv(csv_path,
              index=False,
              encoding='utf-8',
              quoting=csv.QUOTE_ALL,  # Quote all fields
              quotechar='"')          # Use double quotes

    print(f"✓ Anki deck data exported to: {csv_path}")
    print(f"✓ Total words: {len(df)}")
    print(f"✓ Fields: {', '.join(df.columns)}")
    print("✓ Data cleaned: newlines converted to <br> tags, all fields properly quoted")

    # Words that failed generation are silently absent from the data; list them
    # so an incomplete deck does not go unnoticed.
    exported_words = set(df["vocabulary"])
    missing_words = [
        word for word in result.get("vocabularies_with_notes", {})
        if word not in exported_words
    ]
    if missing_words:
        print(f"⚠ {len(missing_words)} word(s) missing from the export: {', '.join(missing_words)}")
else:
    # Previously this case produced no output at all.
    print("⚠ No generated data to export; no CSV file was written.")

✓ Anki deck data exported to: ./output/anki_deck_data.csv
✓ Total words: 30
✓ Fields: vocabulary, meaning_en, meaning_vi, hiragana, kanji_vi, sentence, sentence_translation_en, sentence_translation_vi, note
✓ Data cleaned: newlines converted to <br> tags, all fields properly quoted
