In [21]:
%%capture --no-stderr
%pip install --quiet -U langchain_core langgraph langchain_openai openpyxl pandas IPython

# First, we need to set the environment variables

This notebook requires `OPENAI_API_KEY` and `LANGSMITH_API_KEY` to be set as environment variables. If either one is missing, the next cell prompts you to enter it.

Also, you need to choose the model to use for the AI.

In [22]:
import os, getpass
from langchain_openai import ChatOpenAI


def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

# Prompt for any API key that is not already present in the environment.
_set_env("OPENAI_API_KEY")
_set_env("LANGSMITH_API_KEY")

# LangSmith settings used by this notebook.
os.environ.update(
    {
        "LANGSMITH_TRACING": "true",
        "LANGSMITH_PROJECT": "anki_deck_generator",
    }
)

# Two chat models: one generates the card data, the other is used when checking it.
gen_data_ai_chat = ChatOpenAI(model="gpt-4.1", temperature=0)
check_data_ai_chat = ChatOpenAI(model="gpt-4.1", temperature=0)

# Second, we need to define the data structure

Define the Anki card fields with the `data_structure` variable, a dictionary that maps each field name to a description of what the field contains:

```python
data_structure = {
    "field_name_1": "field_1_description",
    "field_name_2": "field_2_description",
    "field_name_3": "field_3_description"
}
```

Example:

```python
data_structure = {
    "vocabulary": "The word to be learned",
    "meaning_en": "The meaning of the word in English",
    "meaning_vi": "The meaning of the word in Vietnamese",
    "hiragana": "The hiragana of the word",
    "kanji_vi": "Explain the kanji of the word in Vietnamese as they are related to Chinese characters",
    "sentence": "The sentence using the word",
    "sentence_translation_en": "The translation of the sentence in English",
    "sentence_translation_vi": "The translation of the sentence in Vietnamese",
    "note": "Any additional information about the word"
}
```



In [23]:
# Card fields, in CSV column order: field name -> description of its content.
# The descriptions are also inserted into the prompt that asks the AI to
# regenerate missing fields, so they must state the intended language.
data_structure = {
    "vocabulary": "The word to be learned",
    "meaning_en": "The meaning of the word in English",
    # Fixed: this previously said "in Japanese", contradicting the "_vi" field.
    "meaning_vi": "The meaning of the word in Vietnamese",
    "hiragana": "The hiragana of the word",
    "kanji_vi": "Explain the kanji of the word in Vietnamese as they are related to Chinese characters",
    "sentence": "The sentence using the word",
    "sentence_translation_en": "The translation of the sentence in English",
    "sentence_translation_vi": "The translation of the sentence in Vietnamese",
    "note": "Any additional information about the word"
}

# Third, we need to list all the words to be learned, with an optional note to the AI for each word

You can/should generate this list using AI.

```python
vocabularies = ["word_without_note_1", "word_without_note_2", "word_without_note_3"]
vocabularies_with_notes = {
    "word_with_note_1": "note_to_ai_for_word_with_note_1",
    "word_with_note_2": "note_to_ai_for_word_with_note_2",
    "word_with_note_3": "note_to_ai_for_word_with_note_3"
}
```

Example:

```python
vocabularies = ["猫", "電車", "公園"] 
vocabularies_with_notes = {
    "勉強": "I struggle to remember it, so please explain it in more detail with more examples",
    "学校": "This word is quite easy, so no need to explain it in detail",
    "食べ物": "", # Empty note is also fine
}
```

In [24]:
from typing import List, Dict

# Words that carry no note for the AI.
vocabularies: List[str] = ["猫", "電車", "公園"] 
# Words mapped to a note; the note is inserted into the generation prompt as
# the "special instructions" for that word.
vocabularies_with_notes: Dict[str, str] = {
    "勉強": "I am truggle to remember it, so please explain it more in detail with more examples",
    "学校": "This word is quite easy, so no need to explain it in detail",
    "食べ物": "", # Empty note is also fine
}

# Finally, let's verify the data you added

In [25]:
def verify_vocabularies_data(vocabularies: List[str], vocabularies_with_notes: Dict[str, str]) -> bool:
    """
    Check that the two vocabulary inputs have the expected container and item types.

    The first problem found is printed and ends the check; a success message
    is printed when no problem is found.

    Args:
        vocabularies: Expected to be a list of strings.
        vocabularies_with_notes: Expected to be a dict with string keys and string values.

    Returns:
        bool: True when every check passes, False on the first failed check
        or when an unexpected exception occurs while checking.
    """

    def _problems():
        # Yields error messages in the order the checks are performed.
        # Only the first message is ever consumed, so later checks do not
        # run once a problem has been found.
        if not isinstance(vocabularies, list):
            yield "Error: vocabularies must be a list"
        if not isinstance(vocabularies_with_notes, dict):
            yield "Error: vocabularies_with_notes must be a dictionary"
        for position, entry in enumerate(vocabularies):
            if not isinstance(entry, str):
                yield f"Error: vocabularies[{position}] must be a string, got {type(entry)}"
        for key, value in vocabularies_with_notes.items():
            if not isinstance(key, str):
                yield f"Error: Key in vocabularies_with_notes must be a string, got {type(key)}"
            if not isinstance(value, str):
                yield f"Error: Value in vocabularies_with_notes must be a string, got {type(value)}"

    try:
        first_problem = next(_problems(), None)
        if first_problem is not None:
            print(first_problem)
            return False

        print("✓ Data verification passed!")
        return True
    except Exception as e:
        print(f"Error during verification: {e}")
        return False

# Fail this cell with an AssertionError when the verification returns False.
assert verify_vocabularies_data(vocabularies, vocabularies_with_notes)

✓ Data verification passed!


# All is good, now we can start to generate the anki deck

The generated CSV is written to `./output/anki_deck_data.csv`.

In [26]:
# Fold the plain word list into the notes dictionary: words that have no
# entry yet get an empty note, existing notes are kept as they are.
for word in vocabularies:
    vocabularies_with_notes.setdefault(word, "")

# Report the merged result
print(f"Total vocabularies after merging: {len(vocabularies_with_notes)}")
print("All vocabularies with notes:")
for word, note in vocabularies_with_notes.items():
    print(f"  {word}: {note}" if note else f"  {word}: (no note)")

print("\n✓ Vocabularies merged successfully!")


Total vocabularies after merging: 6
All vocabularies with notes:
  勉強: I am truggle to remember it, so please explain it more in detail with more examples
  学校: This word is quite easy, so no need to explain it in detail
  食べ物: (no note)
  猫: (no note)
  電車: (no note)
  公園: (no note)

✓ Vocabularies merged successfully!


In [27]:
# Define the data structure for the AI LangGraph

from typing import Annotated, TypedDict

class AnkiDeckData(TypedDict):
    """State dictionary taken and returned by the node functions of this notebook."""

    # Vocabulary word -> note for the AI
    vocabularies_with_notes: Dict[str, str]
    # Records produced by the AI generation step
    ai_gen_data: List[Dict[str, str]]

In [28]:
# Define the graph

from IPython.display import Image, display
from langgraph.graph import StateGraph, START, END
import json

def _build_generation_prompt(vocab_word: str, note: str) -> str:
    """Return the prompt that asks the model for one word's card data as JSON."""
    return f"""
You are a Japanese language expert helping to create Anki flashcard data. 
Generate comprehensive information for the Japanese word: {vocab_word}

Please provide the following information in JSON format with these exact keys:
- vocabulary: The original Japanese word
- meaning_en: The meaning of the word in English
- meaning_vi: The meaning of the word in Vietnamese (not Japanese!)
- hiragana: The hiragana reading of the word (if applicable, empty string if not)
- kanji_vi: Explain the kanji components in Vietnamese, relating them to Chinese characters and their meanings
- sentence: A natural Japanese sentence using this word
- sentence_translation_en: English translation of the sentence
- sentence_translation_vi: Vietnamese translation of the sentence
- note: Additional helpful information about the word (usage notes, formality level, etc.)

Special instructions for this word: {note if note else "No special instructions"}

Please respond with valid JSON only, no additional text.
Example format:
{{
    "vocabulary": "猫",
    "meaning_en": "cat",
    "meaning_vi": "con mèo",
    "hiragana": "ねこ",
    "kanji_vi": "猫: Chữ Hán này có nghĩa là 'mèo', từ gốc Trung Quốc 貓",
    "sentence": "私は猫が好きです。",
    "sentence_translation_en": "I like cats.",
    "sentence_translation_vi": "Tôi thích mèo.",
    "note": "Common word for cat, used in everyday conversation"
}}
"""


def _extract_json_text(response_text: str) -> str:
    """Return the span from the first ``{`` to the last ``}`` of the response.

    Models sometimes wrap the JSON object in Markdown code fences or add a
    sentence around it despite the instructions; slicing to the outermost
    braces lets ``json.loads`` succeed in those cases. Text without such a
    span is returned unchanged so the caller still gets a JSONDecodeError.
    """
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start != -1 and end > start:
        return response_text[start:end + 1]
    return response_text


def _fallback_record(vocab_word: str, error: Exception) -> Dict[str, str]:
    """Return an empty record for ``vocab_word`` whose note describes ``error``."""
    record = {field: "" for field in data_structure.keys()}
    record["vocabulary"] = vocab_word
    record["note"] = f"Error generating data: {str(error)}"
    return record


def node_generate_anki_deck_data(deck_data: AnkiDeckData) -> AnkiDeckData:
    """
    Generate Anki deck data for each vocabulary word using AI.

    For every word in ``deck_data["vocabularies_with_notes"]`` the
    ``gen_data_ai_chat`` model is asked for a JSON object. Fields listed in
    ``data_structure`` that the model left out are filled with empty strings,
    ``None`` or non-string values of those fields are converted to strings,
    and the ``vocabulary`` field is always overwritten with the input word.
    When the model call fails, or the answer is not a JSON object, a fallback
    record with empty fields and an error note is stored instead, so the
    result always has one record per input word.

    Args:
        deck_data: State holding ``vocabularies_with_notes`` (word -> note).

    Returns:
        The same ``deck_data`` object, with ``ai_gen_data`` replaced by the
        list of generated records (the input is mutated in place).
    """
    vocabularies_with_notes = deck_data["vocabularies_with_notes"]
    required_fields = list(data_structure.keys())
    total = len(vocabularies_with_notes)
    ai_gen_data = []

    print(f"Generating data for {total} vocabulary words...")

    for i, (vocab_word, note) in enumerate(vocabularies_with_notes.items(), 1):
        print(f"Processing {i}/{total}: {vocab_word}")
        prompt = _build_generation_prompt(vocab_word, note)

        # Step 1: call the model. Any failure here yields a fallback record.
        try:
            response = gen_data_ai_chat.invoke(prompt)
            response_text = response.content.strip()
        except Exception as e:
            print(f"Error generating data for {vocab_word}: {e}")
            ai_gen_data.append(_fallback_record(vocab_word, e))
            continue

        # Step 2: parse the answer, tolerating code fences around the JSON.
        try:
            word_data = json.loads(_extract_json_text(response_text))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for {vocab_word}: {e}")
            print(f"Raw response: {response_text[:200]}...")
            ai_gen_data.append(_fallback_record(vocab_word, e))
            continue

        # Valid JSON that is not an object (list, string, number) cannot be a record.
        if not isinstance(word_data, dict):
            error = TypeError(f"expected a JSON object, got {type(word_data).__name__}")
            print(f"Error generating data for {vocab_word}: {error}")
            ai_gen_data.append(_fallback_record(vocab_word, error))
            continue

        # Step 3: make sure every required field is present.
        missing_fields = [field for field in required_fields if field not in word_data]
        if missing_fields:
            print(f"Warning: Missing fields for {vocab_word}: {missing_fields}")
            for field in missing_fields:
                word_data[field] = ""

        # Step 4: keep required fields as strings; later steps call str methods on them.
        for field in required_fields:
            value = word_data[field]
            if value is None:
                word_data[field] = ""
            elif not isinstance(value, str):
                word_data[field] = str(value)

        # Ensure vocabulary field matches the input word
        word_data["vocabulary"] = vocab_word

        ai_gen_data.append(word_data)
        print(f"✓ Successfully generated data for {vocab_word}")

    print(f"\n✓ Generated data for {len(ai_gen_data)} vocabulary words")

    # Update the deck data
    deck_data["ai_gen_data"] = ai_gen_data
    return deck_data

def _is_hiragana_char(c: str) -> bool:
    """True for characters in the Hiragana block (U+3040-U+309F)."""
    return '\u3040' <= c <= '\u309F'


def _is_japanese_char(c: str) -> bool:
    """True for hiragana, katakana (U+30A0-U+30FF) or CJK ideographs (U+4E00-U+9FFF)."""
    return _is_hiragana_char(c) or '\u30A0' <= c <= '\u30FF' or '\u4E00' <= c <= '\u9FFF'


def node_check_anki_deck_data(deck_data: AnkiDeckData) -> AnkiDeckData:
    """
    Check and validate the generated Anki deck data.

    For each record in ``deck_data["ai_gen_data"]``:

    * report critical fields (vocabulary, meaning_en, meaning_vi, hiragana)
      that are empty, whitespace-only, missing or not a string;
    * when more than one critical field is missing, ask ``check_data_ai_chat``
      for the missing values and copy non-empty string answers into the
      record (failures are reported and otherwise ignored);
    * warn when the vocabulary contains no Japanese character, or when the
      hiragana field contains anything other than hiragana, the long-vowel
      mark ``ー`` or whitespace.

    Args:
        deck_data: State holding the generated records in ``ai_gen_data``.

    Returns:
        The same ``deck_data`` object; records are updated in place.
    """

    def _as_text(value) -> str:
        # Non-string values (e.g. None from a JSON null) are treated as empty
        # instead of raising AttributeError on .strip().
        return value if isinstance(value, str) else ""

    ai_gen_data = deck_data["ai_gen_data"]
    critical_fields = ["vocabulary", "meaning_en", "meaning_vi", "hiragana"]
    total = len(ai_gen_data)

    print(f"Checking data quality for {total} vocabulary words...")

    for i, word_data in enumerate(ai_gen_data, 1):
        vocab_word = word_data.get("vocabulary", "Unknown")
        print(f"Checking {i}/{total}: {vocab_word}")

        # Check for missing or empty critical fields
        missing_critical = [
            field for field in critical_fields
            if not _as_text(word_data.get(field)).strip()
        ]

        if missing_critical:
            print(f"  ⚠️  Missing critical fields: {missing_critical}")

            # Try to regenerate missing data using AI
            if len(missing_critical) > 1:  # If multiple critical fields are missing
                # Fall back to the field name when data_structure has no
                # description for it, rather than raising KeyError.
                field_lines = "\n".join(
                    f"- {field}: {data_structure.get(field, field)}"
                    for field in missing_critical
                )
                prompt = f"""
Please provide missing information for the Japanese word: {vocab_word}

I need the following information in JSON format:
{field_lines}

Respond with JSON containing only the missing fields:
"""
                try:
                    response = check_data_ai_chat.invoke(prompt)
                    raw_answer = response.content.strip()
                    # Tolerate code fences or extra text around the JSON object.
                    start, end = raw_answer.find("{"), raw_answer.rfind("}")
                    if start != -1 and end > start:
                        raw_answer = raw_answer[start:end + 1]
                    missing_data = json.loads(raw_answer)
                    if not isinstance(missing_data, dict):
                        raise TypeError(
                            f"expected a JSON object, got {type(missing_data).__name__}"
                        )

                    # Update the missing fields
                    for field in missing_critical:
                        value = missing_data.get(field)
                        if isinstance(value, str) and value.strip():
                            word_data[field] = value
                            print(f"  ✓ Updated {field}")

                except Exception as e:
                    print(f"  ❌ Could not regenerate missing data: {e}")

        # Validate Japanese characters
        vocabulary_text = _as_text(word_data.get("vocabulary"))
        if vocabulary_text and not any(_is_japanese_char(c) for c in vocabulary_text):
            print(f"  ⚠️  Warning: '{vocab_word}' doesn't contain Japanese characters")

        # Check if hiragana is appropriate (read after the regeneration step,
        # so a freshly filled value is validated too)
        hiragana = _as_text(word_data.get("hiragana", ""))
        if hiragana and not all(_is_hiragana_char(c) or c in 'ー' for c in hiragana if c.strip()):
            print(f"  ⚠️  Warning: Hiragana field contains non-hiragana characters: {hiragana}")

        print(f"  ✓ Checked {vocab_word}")

    print(f"\n✓ Data quality check completed for {total} words")

    return deck_data

In [None]:
# Try out the generation node on its own.
# The starting state holds the merged vocabulary and no generated records yet.
initial_deck_data: AnkiDeckData = dict(
    vocabularies_with_notes=vocabularies_with_notes,
    ai_gen_data=[],
)

# Run the generation node directly
generated_deck_data = node_generate_anki_deck_data(initial_deck_data)

# Print the first generated record, field by field, under a banner
print("\n" + "=" * 50, "SAMPLE GENERATED DATA:", "=" * 50, sep="\n")
if generated_deck_data["ai_gen_data"]:
    sample_word = generated_deck_data["ai_gen_data"][0]
    for field, value in sample_word.items():
        print(f"{field}: {value}")

print(f"\nTotal words processed: {len(generated_deck_data['ai_gen_data'])}")


In [None]:
# Export the generated data to CSV for Anki import
import pandas as pd
import os

# Ensure output directory exists
os.makedirs("./output", exist_ok=True)

# Convert the generated data to a pandas DataFrame
if generated_deck_data["ai_gen_data"]:
    # Columns follow the field order of data_structure; fields absent from
    # every record become all-NaN columns, extra keys are dropped.
    ordered_columns = list(data_structure.keys())
    df = pd.DataFrame(generated_deck_data["ai_gen_data"]).reindex(columns=ordered_columns)

    # Save to CSV
    csv_path = "./output/anki_deck_data.csv"
    df.to_csv(csv_path, index=False, encoding='utf-8')

    print(f"✓ Anki deck data exported to: {csv_path}")
    print(f"✓ Total words: {len(df)}")
    print(f"✓ Fields: {', '.join(df.columns)}")

    # Display summary statistics
    print("\nData Summary:")
    for col in df.columns:
        # A cell counts as filled only when it is neither missing (None/NaN)
        # nor blank. Without the notna() mask, astype(str) would turn missing
        # cells into "None"/"nan" and report them as filled.
        non_empty = (df[col].notna() & df[col].astype(str).str.strip().ne('')).sum()
        print(f"  {col}: {non_empty}/{len(df)} entries filled ({non_empty/len(df)*100:.1f}%)")

    # Show first few rows
    print("\nFirst 3 rows preview:")
    print(df.head(3).to_string())

else:
    print("❌ No data to export. Please run the generation function first.")
