# 10_ngrams.ipynb

**Frequent Bigrams, Trigrams & POS N‑grams with Metadata and Examples**

In [None]:
from pathlib import Path

def setup_project_paths():
    """Locate the corpus and result directories relative to the working directory.

    The project root is the parent of the current working directory when that
    directory is named ``codigo``; otherwise it is the working directory itself.
    The two output directories are created if they do not exist yet; the TEI
    directory is only computed, never created.

    Returns:
        A ``(tei_dir, csv_dir, ext_dir)`` tuple of ``Path`` objects.
    """
    here = Path.cwd()
    if here.name == 'codigo':
        root = here.parent
    else:
        root = here

    # Both output folders live under the same analysis results tree.
    analysis_root = root / 'resultados' / 'computational-analysis'
    csv_out = analysis_root / 'corpus_summary' / 'csv'
    ext_out = analysis_root / 'extensions'
    for out_dir in (csv_out, ext_out):
        out_dir.mkdir(parents=True, exist_ok=True)

    return root / 'corpus' / 'tei', csv_out, ext_out

# Resolve the project directories once at load time. Calling the helper also
# creates the CSV and extensions output folders when they are missing.
TEI_DIR, CSV_DIR, EXT_DIR = setup_project_paths()

In [None]:
import pandas as pd
import spacy
from collections import Counter
from lxml import etree
import matplotlib.pyplot as plt

# Load models and namespaces
# spaCy's small Spanish pipeline; its POS tags feed the n-gram counts.
nlp = spacy.load('es_core_news_sm')
# Prefix map so that find() can address elements in the TEI namespace.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}


def _element_text(element):
    """Return all text inside *element*, or '' when the element is missing.

    ``element.text`` only holds the text that precedes the first child element
    and is ``None`` when the element starts with a child, so a title such as
    ``<title>Oda <hi>primera</hi></title>`` would be truncated. Joining
    ``itertext()`` keeps the text of nested markup as well, and yields the
    same string as ``.text`` for elements without children.
    """
    if element is None:
        return ''
    return ''.join(element.itertext())


# Load texts and metadata
# raw_texts.csv is read for its 'filename' and 'text' columns further on.
texts_df = pd.read_csv(CSV_DIR / 'raw_texts.csv')
meta = []
for xml_file in TEI_DIR.glob('*.xml'):
    tree = etree.parse(str(xml_file))
    # find() returns the first matching element in document order, or None.
    title_el = tree.find('.//tei:title', namespaces=ns)
    author_el = tree.find('.//tei:author', namespaces=ns)
    meta.append({
        'filename': xml_file.name,
        'title': _element_text(title_el),
        'author': _element_text(author_el),
    })
# Explicit columns keep 'filename'/'title'/'author' addressable even when the
# TEI directory holds no XML files; an empty record list would otherwise give
# a DataFrame without any columns and break later lookups by column name.
meta_df = pd.DataFrame(meta, columns=['filename', 'title', 'author'])

# Collect global POS n-grams
all_pos_bi, all_pos_tri = [], []

# Per-poem counters
per_poem_records = []


def _pos_ngrams(tags, n):
    """Return every run of *n* consecutive tags joined with '_'.

    Yields an empty list when fewer than *n* tags are available.
    """
    return ['_'.join(tags[i:i + n]) for i in range(len(tags) - n + 1)]


for _, row in texts_df.iterrows():
    fn = row['filename']
    # pandas reads an empty CSV cell as NaN, and NaN is truthy, so
    # `row['text'] or ''` would pass a float to spaCy. Fall back to '' for
    # any value that is not a string.
    text = row['text']
    doc = nlp(text if isinstance(text, str) else '')
    # Only alphabetic tokens are kept, so n-grams span across punctuation
    # and numbers.
    tags = [tok.pos_ for tok in doc if tok.is_alpha]

    # Build each n-gram list once and reuse it for both tallies.
    bigrams = _pos_ngrams(tags, 2)
    trigrams = _pos_ngrams(tags, 3)

    # Global
    all_pos_bi.extend(bigrams)
    all_pos_tri.extend(trigrams)

    # Per-poem
    bi_counts = Counter(bigrams)
    tri_counts = Counter(trigrams)
    per_poem_records.append({
        'filename': fn,
        'pos_bigram_dist': bi_counts,
        'pos_trigram_dist': tri_counts,
    })

# Explicit columns so the frame keeps its schema even with no input rows.
per_poem_df = pd.DataFrame(
    per_poem_records,
    columns=['filename', 'pos_bigram_dist', 'pos_trigram_dist'],
)

# Global frequency counters
freq_pbi = Counter(all_pos_bi)
freq_ptri = Counter(all_pos_tri)

# Save global distributions
def save_global(counter, name, n=20):
    """Write the *n* most and *n* least frequent entries of *counter* to CSV.

    Two files are written under the module-level ``EXT_DIR``:
    ``global_top{n}_{name}.csv`` (most frequent first) and
    ``global_low{n}_{name}.csv`` (least frequent first). Both have the
    columns *name* and ``count``.

    Args:
        counter: A ``collections.Counter`` of n-gram frequencies.
        name: Label used for the n-gram column and inside both file names.
        n: Number of rows per file. Defaults to 20, which produces the
            ``global_top20_*`` / ``global_low20_*`` file names.

    Raises:
        ValueError: If *n* is negative.
    """
    if n < 0:
        raise ValueError(f'n must be non-negative, got {n}')

    top_rows = counter.most_common(n)
    # Reverse slice walks back from the tail of the full ranking, so the
    # least frequent entry comes first; n == 0 yields an empty list.
    low_rows = counter.most_common()[:-n - 1:-1]

    for label, rows in (('top', top_rows), ('low', low_rows)):
        out_path = EXT_DIR / f'global_{label}{n}_{name}.csv'
        pd.DataFrame(rows, columns=[name, 'count']).to_csv(out_path, index=False)

# Export the corpus-wide rankings: bigrams first, then trigrams.
save_global(counter=freq_pbi, name='pos_bigram')
save_global(counter=freq_ptri, name='pos_trigram')

# Save per-poem distributions
def save_per(counter_name, dist_col):
    """Write one CSV row per (poem, n-gram) pair, with title and author attached.

    Reads the module-level ``per_poem_df`` and ``meta_df`` frames and writes
    ``per_{counter_name}.csv`` under ``EXT_DIR`` with the columns
    ``filename``, ``title``, ``author``, *counter_name* and ``count``.

    Args:
        counter_name: Label used for the n-gram column and the file name.
        dist_col: Name of the ``per_poem_df`` column holding each poem's
            n-gram -> count mapping.
    """
    # Index the metadata by filename once instead of filtering the frame for
    # every poem. setdefault keeps the first row per filename, which is the
    # row a boolean filter followed by `.iloc[0]` would select.
    meta_lookup = {}
    if not meta_df.empty:
        for meta_row in meta_df.itertuples(index=False):
            meta_lookup.setdefault(meta_row.filename, (meta_row.title, meta_row.author))

    rows = []
    for poem in per_poem_df.itertuples(index=False):
        # A poem without a TEI record gets empty metadata instead of
        # aborting the export with an IndexError.
        title, author = meta_lookup.get(poem.filename, ('', ''))
        for ngram, count in getattr(poem, dist_col).items():
            rows.append({
                'filename': poem.filename,
                'title': title,
                'author': author,
                counter_name: ngram,
                'count': count,
            })

    # Explicit columns guarantee a header row even when there is nothing to write.
    columns = ['filename', 'title', 'author', counter_name, 'count']
    pd.DataFrame(rows, columns=columns).to_csv(EXT_DIR / f'per_{counter_name}.csv', index=False)

# Export the per-poem tables for both n-gram sizes.
save_per(counter_name='pos_bigram', dist_col='pos_bigram_dist')
save_per(counter_name='pos_trigram', dist_col='pos_trigram_dist')

# Extract examples for top 10 POS n-grams
top_bi = [ng for ng, _ in freq_pbi.most_common(10)]
top_tri = [ng for ng, _ in freq_ptri.most_common(10)]
# Sets give constant-time membership tests inside the token loops.
top_bi_set, top_tri_set = set(top_bi), set(top_tri)
bi_examples, tri_examples = [], []

# Index the metadata by filename once instead of filtering the frame for every
# text. setdefault keeps the first row per filename, which is the row a
# boolean filter followed by `.iloc[0]` would select.
meta_lookup = {}
if not meta_df.empty:
    for meta_row in meta_df.itertuples(index=False):
        meta_lookup.setdefault(
            meta_row.filename,
            {'title': meta_row.title, 'author': meta_row.author},
        )
# Used for texts without a TEI record, so a missing file does not abort the
# run with an IndexError.
no_meta = {'title': '', 'author': ''}

for _, row in texts_df.iterrows():
    fn = row['filename']
    # pandas reads an empty CSV cell as NaN, and NaN is truthy, so
    # `row['text'] or ''` would pass a float to spaCy. Fall back to '' for
    # any value that is not a string.
    text = row['text']
    doc = nlp(text if isinstance(text, str) else '')
    # Same alphabetic-token filter for both lists keeps words and tags aligned.
    words = [tok.text for tok in doc if tok.is_alpha]
    tags = [tok.pos_ for tok in doc if tok.is_alpha]
    md = meta_lookup.get(fn, no_meta)
    # bigrams
    for i, tag_pair in enumerate(zip(tags, tags[1:])):
        ng = '_'.join(tag_pair)
        if ng in top_bi_set:
            bi_examples.append({
                'filename': fn, 'title': md['title'], 'author': md['author'],
                'pos_bigram': ng, 'example': ' '.join(words[i:i + 2]),
            })
    # trigrams
    for i, tag_triple in enumerate(zip(tags, tags[1:], tags[2:])):
        ng = '_'.join(tag_triple)
        if ng in top_tri_set:
            tri_examples.append({
                'filename': fn, 'title': md['title'], 'author': md['author'],
                'pos_trigram': ng, 'example': ' '.join(words[i:i + 3]),
            })

# Explicit columns guarantee a header row even when no example was collected.
pd.DataFrame(
    bi_examples,
    columns=['filename', 'title', 'author', 'pos_bigram', 'example'],
).to_csv(EXT_DIR / 'pos_bigram_examples.csv', index=False)
pd.DataFrame(
    tri_examples,
    columns=['filename', 'title', 'author', 'pos_trigram', 'example'],
).to_csv(EXT_DIR / 'pos_trigram_examples.csv', index=False)

print("Saved pos_bigram_examples.csv and pos_trigram_examples.csv")