# 05_keyness_cultismo.ipynb
**Lexical Keyness & Cultismo Frequencies**

Strip extensions on token filenames when loading existing counts.

In [23]:
from pathlib import Path

def setup_project_paths():
    current_dir = Path().cwd()
    base_path = current_dir.parent if current_dir.name == 'codigo' else current_dir
    input_path = base_path / 'corpus' / 'tei'
    output_path = base_path / 'resultados' / 'computational-analysis'
    return base_path, input_path, output_path

BASE_PATH, INPUT_PATH, OUTPUT_PATH = setup_project_paths()

In [24]:
import pandas as pd
import re
import math
from sklearn.feature_extraction.text import CountVectorizer

# Paths
csv_folder = OUTPUT_PATH / 'corpus_summary' / 'csv'
ext_folder = OUTPUT_PATH / 'extensions'
ext_folder.mkdir(parents=True, exist_ok=True)

# Load clusters
clust_df = pd.read_csv(csv_folder / 'clustered_features.csv')
key_col = clust_df.columns[0]
clust_df = clust_df.rename(columns={key_col: 'raw_filename'})
clust_df['filename'] = clust_df['raw_filename'].str.replace(r'\.xml$|\.txt$', '', regex=True)

# Load or generate token counts
lex_path = csv_folder / 'corpus_lexical_counts.csv'
if not lex_path.exists():
    raw_df = pd.read_csv(csv_folder / 'raw_texts.csv').rename(columns={'Unnamed: 0':'raw_filename'})
    raw_df['filename'] = raw_df['raw_filename'].str.replace(r'\.xml$|\.txt$', '', regex=True)
    clean_texts = raw_df['text'].astype(str).apply(lambda x: re.sub(r'<[^>]+>', ' ', x)).str.lower()
    vec = CountVectorizer(max_features=2000, stop_words='spanish', token_pattern=r"(?u)\b\w\w+\b")
    X = vec.fit_transform(clean_texts)
    df_tokens = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
    df_tokens.insert(0, 'filename', raw_df['filename'])
    df_tokens.to_csv(lex_path, index=False)
    print("Generated", lex_path, "with shape", df_tokens.shape)
else:
    df_tokens = pd.read_csv(lex_path)
    # Strip extensions in existing token filenames
    if 'filename' in df_tokens.columns:
        df_tokens['filename'] = df_tokens['filename'].str.replace(r'\.xml$|\.txt$', '', regex=True)
    print("Loaded and normalized token counts:", df_tokens.shape)

# Diagnostics
print("Token filenames sample:", df_tokens['filename'].head())
print("Cluster filenames sample:", clust_df['filename'].head())

# Merge for keyness
merged = df_tokens.merge(clust_df[['filename','Cluster']], on='filename', how='inner')
print("Merged shape:", merged.shape)

terms = [c for c in merged.columns if c not in ['filename','Cluster']]
print("Terms count:", len(terms))

# Keyness
def ll_score(k, n, K, N):
    exp = K * n / N
    return 2 * (k * math.log(k/exp) if k>0 else 0)

N = merged.shape[0]
results=[]
for term in terms:
    total_K = merged[term].sum()
    if total_K <= 0: continue
    for cl in sorted(merged['Cluster'].unique()):
        sub = merged[merged['Cluster']==cl]
        k = sub[term].sum()
        n = sub.shape[0]
        ll = ll_score(k,n,total_K,N)
        if ll>0: results.append({'term':term,'Cluster':cl,'LL':ll})
ll_df = pd.DataFrame(results)
if ll_df.empty:
    print("⚠️ No keyness entries.")
else:
    ll_df = ll_df.sort_values(['Cluster','LL'], ascending=[True,False])
    ll_df.to_csv(ext_folder/'keyness_scores.csv', index=False)
    print("Saved keyness_scores.csv")

# Cultismos unchanged...


Loaded and normalized token counts: (26, 2002)
Token filenames sample: 0    Hurtadodemendoza_fabulaadonis
1               Polo_fabapolodafne
2                    Carrillo_acis
3                Cetina_amorpsique
4               Villamediana_fenix
Name: filename, dtype: object
Cluster filenames sample: 0      Barahona_acteon
1    Barahona_vertumno
2     Bermudez_narciso
3     Bocangel_leandro
4        Carrillo_acis
Name: filename, dtype: object
Merged shape: (26, 2003)
Terms count: 2001
Saved keyness_scores.csv
