# Build Lexicon - AI Enrichment

This notebook uses the enrichment script to test and enrich Dutch words with linguistic metadata.

## Setup and Imports

In [None]:
import sys
from pathlib import Path
import pandas as pd

# Put the project root at the front of sys.path so the project-local packages
# imported below (scripts.*, core.*) resolve.
# NOTE(review): assumes the kernel's working directory is the project root — confirm.
project_root = str(Path.cwd())
if sys.path[:1] != [project_root]:
    # Guard keeps the cell idempotent: re-running it no longer stacks a duplicate
    # entry on every execution, while still guaranteeing the root has top priority.
    sys.path.insert(0, project_root)

from scripts.enrich_lexicon import enrich_word
from core.schemas import PartOfSpeech, CEFRLevel

# Check mark restored from its mis-decoded form so the confirmation reads cleanly.
print("✓ Setup complete")

## Test with Example Words

Test the enrichment with different parts of speech.

In [None]:
# Noun smoke test: enrich "huis" and print its noun-specific fields.
result_noun = enrich_word("huis", "house")
print("=== NOUN: huis ===")

# Each noun-only field falls back to 'N/A' when the result carries no noun metadata.
for label, field in (("Article", "article"), ("Plural", "plural")):
    shown = getattr(result_noun.noun_meta, field) if result_noun.noun_meta else 'N/A'
    print(f"{label}: {shown}")

print("Difficulty: " + format(result_noun.difficulty))
print("Tags: " + format(result_noun.tags))

# Dump the complete entry as indented JSON for manual inspection.
print()
print("Full output:")
print(result_noun.model_dump_json(indent=2))

In [None]:
# Verb smoke test: enrich "denken" and print its verb-specific fields.
result_verb = enrich_word("denken", "to think")
print("=== VERB: denken ===")

# Each verb-only field falls back to 'N/A' when the result carries no verb metadata.
verb_fields = (
    ("Past participle", "past_participle"),
    ("Auxiliary", "auxiliary"),
    ("Common prepositions", "common_prepositions"),
)
for label, field in verb_fields:
    shown = getattr(result_verb.verb_meta, field) if result_verb.verb_meta else 'N/A'
    print(f"{label}: {shown}")

print("Difficulty: " + format(result_verb.difficulty))

# Dump the complete entry as indented JSON for manual inspection.
print()
print("Full output:")
print(result_verb.model_dump_json(indent=2))

In [None]:
# Adjective smoke test: enrich "groot" and print its adjective-specific fields.
result_adj = enrich_word("groot", "big")
print("=== ADJECTIVE: groot ===")

# Each adjective-only field falls back to 'N/A' when the result carries no adjective metadata.
for label, field in (("Comparative", "comparative"), ("Superlative", "superlative")):
    shown = getattr(result_adj.adjective_meta, field) if result_adj.adjective_meta else 'N/A'
    print(f"{label}: {shown}")

print("Difficulty: " + format(result_adj.difficulty))

# Dump the complete entry as indented JSON for manual inspection.
print()
print("Full output:")
print(result_adj.model_dump_json(indent=2))

## Inspect Examples

Look at how examples are organized by tense/form.

In [None]:
# Examine verb examples in detail, grouped by tense.
# Uses `result_verb` produced by the verb test cell earlier in the notebook.
verb_meta = result_verb.verb_meta
if verb_meta:
    print("=== Verb Examples ===")
    print(f"Verb: {result_verb.lemma}")

    # The batch summary cell treats an empty/missing preposition list as 'none';
    # do the same here so join() is never handed a non-iterable.
    prepositions = ', '.join(verb_meta.common_prepositions) if verb_meta.common_prepositions else 'none'
    print(f"Prepositions: {prepositions}\n")

    # One (heading, examples) pair per tense replaces three copy-pasted loops.
    tense_sections = (
        ("Present tense:", verb_meta.examples_present),
        ("Past tense:", verb_meta.examples_past),
        ("Perfect tense:", verb_meta.examples_perfect),
    )
    for heading, examples in tense_sections:
        print(heading)
        # `or ()` tolerates a tense with no example list at all.
        for ex in examples or ():
            # Flag emoji restored from their mis-decoded form.
            print(f"  🇳🇱 {ex.dutch}")
            print(f"  🇬🇧 {ex.english}\n")

## Load Word List

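Load words from your CSV (`data/word_list.csv`). The batch step below reads the `dutch` and `english` columns; an optional `added_to_lexicon` column marks words that have already been processed.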

In [None]:
# Load the word list; the path is relative to the kernel's working directory.
word_list = pd.read_csv("data/word_list.csv")

print(f"Total words: {len(word_list)}")
print(f"Columns: {list(word_list.columns)}")

# Show words not yet enriched
if 'added_to_lexicon' in word_list.columns:
    # Treat every row that is not explicitly True as unenriched. Comparing with
    # `== False` silently dropped rows whose flag is blank (NaN), i.e. exactly the
    # words that were never marked.
    already_added = word_list['added_to_lexicon'].eq(True)
    unenriched = word_list[~already_added]
    print(f"\nWords not yet enriched: {len(unenriched)}")
    print("\nFirst 5 unenriched words:")
    print(unenriched.head())
else:
    # No bookkeeping column: just preview the list.
    print("\nFirst 5 words:")
    print(word_list.head())

## Batch Enrichment

Process multiple words. Start small to test!

In [None]:
# Enrich a small batch, collecting successes and failures separately.
enriched_entries = []  # dicts: {'dutch', 'english', 'result'}
errors = []            # dicts: {'dutch', 'english', 'error'}

# Process first 3 words as a test
batch_size = 3
batch = word_list.head(batch_size)
# The list may hold fewer than batch_size rows; report progress against the real count.
batch_total = len(batch)

# enumerate() yields a 1-based position that does not depend on the DataFrame's
# index labels (which are only 0..n-1 for a default, unfiltered index).
for position, (dutch, english) in enumerate(zip(batch['dutch'], batch['english']), start=1):
    print(f"\n[{position}/{batch_total}] Enriching: {dutch} ({english})...")

    try:
        result = enrich_word(dutch, english)
        enriched_entries.append({
            'dutch': dutch,
            'english': english,
            'result': result
        })
        print(f"  ✓ Success - POS: {result.pos}, Difficulty: {result.difficulty}")
    except Exception as e:
        # Deliberately best-effort: one failing word is recorded and the batch continues.
        errors.append({
            'dutch': dutch,
            'english': english,
            'error': str(e)
        })
        print(f"  ✗ Error: {e}")

print(f"\n\n{'='*50}")
print(f"Successfully enriched: {len(enriched_entries)} words")
print(f"Errors: {len(errors)} words")

## Inspect Batch Results

Quick summary of what was enriched.

In [None]:
# Summary of enriched words: one header line, one attribute line and, where
# available, one part-of-speech-specific detail line per entry.
for entry in enriched_entries:
    result = entry['result']
    print(f"\n{entry['dutch']:15} ({entry['english']:20})")
    # Show at most three tags; `or []` tolerates an entry without a tag list.
    shown_tags = ', '.join((result.tags or [])[:3])
    print(f"  POS: {result.pos:12} | Difficulty: {result.difficulty} | Tags: {shown_tags}")

    # Show POS-specific info (arrow restored from its mis-decoded form)
    if result.noun_meta:
        print(f"  → {result.noun_meta.article} {result.lemma} / {result.noun_meta.plural}")
    elif result.verb_meta:
        preps = ', '.join(result.verb_meta.common_prepositions) if result.verb_meta.common_prepositions else 'none'
        print(f"  → participle: {result.verb_meta.past_participle} | preps: {preps}")
    elif result.adjective_meta:
        print(f"  → {result.lemma} / {result.adjective_meta.comparative} / {result.adjective_meta.superlative}")

## Next Steps

Once satisfied with enrichment quality:

1. Increase `batch_size` to process more words
2. Create `scripts/import_to_mongo.py` to insert enriched entries into MongoDB
3. Update `word_list.csv` to mark words as `added_to_lexicon=TRUE`
4. Add retry logic for rate limits if processing large batches