# 03_pca_clustering.ipynb
**Dimensionality Reduction & Clustering**

This notebook now merges only the numeric features of each CSV, so that metadata columns are not duplicated in the merged table.

In [None]:
from pathlib import Path

def setup_project_paths():
    """Locate the project root and its input/output directories.

    The root is the current working directory, unless that directory is
    named ``codigo``, in which case its parent is used.

    Returns:
        A ``(base_path, input_path, output_path)`` tuple of ``Path`` objects,
        where the input is ``corpus/tei`` and the output is
        ``resultados/computational-analysis`` under the root.
    """
    here = Path.cwd()
    if here.name == 'codigo':
        root = here.parent
    else:
        root = here
    tei_dir = root / 'corpus' / 'tei'
    results_dir = root / 'resultados' / 'computational-analysis'
    return root, tei_dir, results_dir

# Resolve the project directories once, relative to the current working
# directory at the time this cell runs.
BASE_PATH, INPUT_PATH, OUTPUT_PATH = setup_project_paths()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram

# Define paths
folder = OUTPUT_PATH / 'corpus_summary' / 'csv'
merged_path = folder / 'merged_features.csv'

# Build merged_features.csv from the per-topic CSVs only when it is missing;
# an existing file is reused as-is (delete it to force a rebuild).
if not merged_path.exists():
    print("Merged file not found; merging numeric columns only.")
    files = [
        'corpus_basic_statistics.csv',
        'corpus_entity_frequencies.csv',
        'corpus_linguistic_features.csv',
        'corpus_semantic_fields.csv',
        'corpus_stylometric_features.csv',
    ]
    df_merged = None
    merge_key = None
    for f in files:
        part = pd.read_csv(folder / f)
        # The first column of every file is treated as the row identifier.
        part_key = part.columns[0]
        if merge_key is None:
            # The first file decides the identifier name used in the output.
            merge_key = part_key
        already_merged = set() if df_merged is None else set(df_merged.columns)
        # Keep numeric features only. The identifier is excluded here so a
        # numeric id is not selected twice, and columns that an earlier file
        # already contributed are skipped so the merge never has to invent
        # `_x`/`_y` suffixes (which can collide once several files overlap).
        numeric_cols = [
            c for c in part.select_dtypes(include='number').columns
            if c not in (part_key, merge_key) and c not in already_merged
        ]
        # Join on one consistent key name even if a file labels its
        # identifier column differently from the first file.
        part = part[[part_key] + numeric_cols].rename(columns={part_key: merge_key})
        if df_merged is None:
            df_merged = part
        else:
            df_merged = df_merged.merge(part, on=merge_key, how='inner')
    df_merged.to_csv(merged_path, index=False)
    print("Saved merged numeric features to", merged_path)
else:
    print("Loading existing merged_features.csv")

df = pd.read_csv(merged_path)
key = df.columns[0]
# The first column is the row identifier; keep it out of the feature matrix
# even when it happens to be numeric.
nums = df.drop(columns=[key]).select_dtypes(include='number')
# Standardise the features before PCA.
X = StandardScaler().fit_transform(nums)

# PCA
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested 5 for small corpora or narrow feature tables.
n_components = min(5, *X.shape)
pca = PCA(n_components=n_components).fit(X)
var_ratio = pca.explained_variance_ratio_
cum_var = var_ratio.cumsum()
print("Explained variance ratios:", var_ratio)
print("Cumulative variance:", cum_var)

# savefig does not create missing directories, so make sure the target exists.
viz_dir = OUTPUT_PATH / 'corpus_summary' / 'visualizations'
viz_dir.mkdir(parents=True, exist_ok=True)

pc_numbers = range(1, n_components + 1)
plt.figure()
plt.plot(pc_numbers, var_ratio, marker='o', label='Individual')
plt.plot(pc_numbers, cum_var, marker='x', label='Cumulative')
plt.xlabel('PC'); plt.ylabel('Variance Ratio')
plt.title('PCA Variance & Cumulative Variance')
plt.legend(); plt.tight_layout()
plt.savefig(viz_dir / 'pca_variance_cumulative.png')
plt.show()

# Top loadings
# Rows are the original features, columns the principal components.
loadings = pd.DataFrame(
    pca.components_.T,
    index=nums.columns,
    columns=[f'PC{i}' for i in pc_numbers],
)
# Report the largest absolute loadings for the first two components (or only
# the first when a single component could be extracted).
for pc in loadings.columns[:2]:
    print(f"Top 10 loadings {pc}:", loadings[pc].abs().nlargest(10).to_dict())

# Silhouette analysis
# silhouette_score requires 2 <= n_clusters <= n_samples - 1, so the usual
# 2..6 sweep is capped for small corpora instead of raising.
max_k = min(6, X.shape[0] - 1)
scores = {
    k: silhouette_score(X, KMeans(n_clusters=k, random_state=42).fit_predict(X))
    for k in range(2, max_k + 1)
}

# savefig does not create missing directories, so make sure the target exists.
viz_dir = OUTPUT_PATH / 'corpus_summary' / 'visualizations'
viz_dir.mkdir(parents=True, exist_ok=True)

plt.figure()
plt.plot(list(scores.keys()), list(scores.values()), marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')
plt.tight_layout()
plt.savefig(viz_dir / 'silhouette.png')
plt.show()
print("Silhouette scores:", scores)

# Final clustering k=3
# KMeans cannot form more clusters than there are rows, so fall back to the
# row count when the corpus has fewer than 3 documents.
n_clusters = min(3, len(df))
df['Cluster'] = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X)

# savefig does not create missing directories, so make sure the target exists.
viz_dir = OUTPUT_PATH / 'corpus_summary' / 'visualizations'
viz_dir.mkdir(parents=True, exist_ok=True)

# Dendrogram
# Ward linkage on the same standardised matrix used for k-means; leaves are
# labelled with the identifier column.
linked = linkage(X, method='ward')
plt.figure(figsize=(10, 6))
dendrogram(linked, labels=df[key].tolist(), leaf_rotation=90)
plt.title('Hierarchical Clustering Dendrogram')
plt.tight_layout()
plt.savefig(viz_dir / 'dendrogram.png')
plt.show()

# Persist the merged features together with the assigned cluster labels.
df.to_csv(folder / 'clustered_features.csv', index=False)
print("Saved clustered_features.csv")