# 05_keyness_cultismo.ipynb
**Lexical Keyness & Cultismo Frequencies**

Computes a log-likelihood keyness score for every term in every cluster, and builds a per-document cultismo count table (plus per-cluster means) from the supplied vocabulary list.

In [None]:
from pathlib import Path

def setup_project_paths():
    """Locate the project root and derive the corpus and results directories.

    When the current working directory is named ``codigo`` its parent is
    taken as the project root; otherwise the working directory itself is.

    Returns:
        tuple: ``(base_path, input_path, output_path)`` where ``input_path``
        is ``base_path/corpus/tei`` and ``output_path`` is
        ``base_path/resultados/computational-analysis``.
    """
    here = Path.cwd()
    if here.name == 'codigo':
        root = here.parent
    else:
        root = here
    tei_dir = root.joinpath('corpus', 'tei')
    results_dir = root.joinpath('resultados', 'computational-analysis')
    return root, tei_dir, results_dir

# Resolve the project directories once, at import time of this cell; the
# analysis cell below derives its input and output folders from OUTPUT_PATH.
BASE_PATH, INPUT_PATH, OUTPUT_PATH = setup_project_paths()

In [None]:
import pandas as pd
import math
from sklearn.feature_extraction.text import CountVectorizer

# Folders: pre-computed corpus tables (read from) and this notebook's outputs
# (written to). The output folder is created on demand.
csv_folder = OUTPUT_PATH / 'corpus_summary' / 'csv'
ext_folder = OUTPUT_PATH / 'extensions'
ext_folder.mkdir(parents=True, exist_ok=True)

# Cluster assignments. Whatever the first column is called, it is renamed to
# 'filename' so it can serve as the join key further down.
clustered_path = csv_folder / 'clustered_features.csv'
clust = pd.read_csv(clustered_path)
key_col = clust.columns[0]
clust = clust.rename(columns={key_col: 'filename'})

# Per-document token counts: reuse the cached CSV when it exists, otherwise
# build it from the raw texts and cache it for the next run.
lex_path = csv_folder / 'corpus_lexical_counts.csv'
if lex_path.exists():
    df_tokens = pd.read_csv(lex_path)
else:
    raw = pd.read_csv(csv_folder / 'raw_texts.csv')
    raw = raw.rename(columns={'Unnamed: 0': 'filename'})
    # Missing texts become empty strings so the vectorizer can process them.
    docs = raw['text'].fillna('').tolist()
    filenames = raw['filename']
    # Keep only the 2000 most frequent terms across the corpus.
    # NOTE(review): stop_words='english' filters English function words; the
    # Spanish directory names suggest the corpus may not be English - confirm
    # this is the intended stop list.
    vec = CountVectorizer(max_features=2000, stop_words='english')
    X = vec.fit_transform(docs)
    df_tokens = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
    df_tokens.insert(0, 'filename', filenames)
    df_tokens.to_csv(lex_path, index=False)

# Compute log-likelihood keyness per term
# First attach each document's cluster label to its token counts. merge()
# defaults to an inner join, so a document that appears in only one of the
# two tables is dropped from `df`.
df = df_tokens.merge(clust[['filename', 'Cluster']], on='filename')
def ll_score(k, n, K, N):
    """Return the observed-vs-expected log-likelihood term for one subset.

    The expected count is ``K * n / N`` and the score is
    ``2 * k * ln(k / expected)``. Only this single term is computed, so the
    result is signed: positive when ``k`` exceeds the expectation, negative
    when it falls short, and zero when they are equal.

    Args:
        k: Observed count of the term inside the subset.
        n: Size of the subset (the caller passes its number of documents).
        K: Observed count of the term overall.
        N: Overall size (the caller passes the total number of documents).

    Returns:
        The score as described above, or ``0`` when ``k`` is not positive.
    """
    # Evaluated unconditionally, so a zero N fails here even when k is 0.
    expected = K * n / N
    if k > 0:
        return 2 * (k * math.log(k / expected))
    return 0

# Score every (term, cluster) pair and save the table sorted by cluster, then
# by descending score.
terms = [c for c in df.columns if c not in ('filename', 'Cluster')]
N = df.shape[0]

# Corpus-wide totals and per-cluster sums do not depend on the term being
# scored, so compute them once instead of re-filtering `df` for every term.
term_totals = df[terms].sum()
cluster_stats = []
for cl in sorted(df['Cluster'].unique()):
    sub = df[df['Cluster'] == cl]
    # (cluster label, number of documents in it, per-term sums within it)
    cluster_stats.append((cl, sub.shape[0], sub[terms].sum()))

results = []
for term in terms:
    K = term_totals[term]
    for cl, n, cluster_sums in cluster_stats:
        k = cluster_sums[term]
        ll = ll_score(k, n, K, N)
        results.append({'term': term, 'Cluster': cl, 'LL': ll})

# Naming the columns explicitly keeps the sort below from raising KeyError
# when there is nothing to score (e.g. the merge above matched no filenames).
ll_df = pd.DataFrame(results, columns=['term', 'Cluster', 'LL'])
ll_df = ll_df.sort_values(['Cluster', 'LL'], ascending=[True, False])
ll_df.to_csv(ext_folder / 'keyness_scores.csv', index=False)
print("Keyness scores saved to", ext_folder / 'keyness_scores.csv')

# Build and save cultismo count table
vocab_path = csv_folder / 'cultismo_list.csv'
# The count table gets its own file. Writing it back to cultismo_list.csv
# would replace the vocabulary with counts, and the next run would then read
# the 'filename' column as the vocabulary.
# NOTE(review): anything downstream that expected the counts in
# cultismo_list.csv must now read cultismo_counts.csv.
counts_path = csv_folder / 'cultismo_counts.csv'
if vocab_path.exists():
    # The vocabulary is the first column of the list file; empty cells are
    # skipped rather than turned into the literal term 'nan'.
    vocab = pd.read_csv(vocab_path).iloc[:, 0].dropna().astype(str).tolist()
    # CountVectorizer rejects duplicate vocabulary entries, and lower-casing
    # can create them, so de-duplicate while preserving the original order.
    vocab_terms = list(dict.fromkeys(w.lower() for w in vocab))
    raw = pd.read_csv(csv_folder / 'raw_texts.csv').rename(columns={'Unnamed: 0':'filename'})
    docs = raw['text'].fillna('').str.lower().tolist()
    filenames = raw['filename']
    # With a fixed vocabulary the vectorizer counts only the listed terms.
    vec_cult = CountVectorizer(vocabulary=vocab_terms)
    Xc = vec_cult.fit_transform(docs)
    df_cult = pd.DataFrame(Xc.toarray(), columns=vec_cult.get_feature_names_out())
    df_cult.insert(0, 'filename', filenames)
    df_cult.to_csv(counts_path, index=False)
    print("Generated cultismo_counts.csv with counts.")
    # Cultismo density per cluster
    df_c = df_cult.merge(clust[['filename','Cluster']], on='filename')
    # Drop the string 'filename' column before averaging: on pandas >= 2.0
    # groupby().mean() raises TypeError when a non-numeric column is present.
    cult_density = df_c.drop(columns='filename').groupby('Cluster').mean().reset_index()
    cult_density.to_csv(ext_folder / 'cultismo_density.csv', index=False)
    print("Cultismo density saved to", ext_folder / 'cultismo_density.csv')
else:
    print("Vocabulary cultismo_list.csv not found; skipping cultismo counts.")