# In [None]:

import pandas as pd
from sklearn.decomposition import PCA
import umap

# Pipeline: load article embeddings, compress them with PCA, project the
# PCA output with UMAP, and save the result together with the article ids.

# Stage sizes and the shared seed, defined once so the reducer settings and
# the output column names cannot drift apart.
PCA_COMPONENTS = 50
UMAP_COMPONENTS = 16
RANDOM_STATE = 42

# Load embeddings: one 'article_id' column, every other column is treated as
# an embedding dimension.
embeddings_df = pd.read_parquet('./article_embeddings.parquet')
article_ids = embeddings_df['article_id']
embeddings = embeddings_df.drop(columns='article_id').to_numpy()

# Reduce dimensionality using PCA first to decrease the cost of the UMAP step.
# The seed keeps the result repeatable if scikit-learn selects a randomized
# SVD solver, matching the seeded UMAP stage below.
pca = PCA(n_components=PCA_COMPONENTS, random_state=RANDOM_STATE)
pca_result = pca.fit_transform(embeddings)
print(f"Explained Variance Ratio after PCA: {sum(pca.explained_variance_ratio_)}")

# Apply UMAP to reduce further to the final dimensionality.
umap_reducer = umap.UMAP(n_components=UMAP_COMPONENTS, random_state=RANDOM_STATE)
umap_result = umap_reducer.fit_transform(pca_result)

# Create a DataFrame for the UMAP output; column names follow the actual
# width of the result rather than a repeated literal.
umap_df = pd.DataFrame(
    umap_result,
    columns=[f'umap_{i}' for i in range(umap_result.shape[1])],
)
# Assign ids positionally. Assigning the Series itself would align on index
# labels, and umap_df has a fresh RangeIndex: if the source parquet carries a
# non-default index, ids would silently become NaN or land on the wrong rows.
umap_df['article_id'] = article_ids.to_numpy()

# Save the UMAP reduced embeddings
umap_df.to_parquet('./reduced_embeddings.parquet')

# Quick sanity check of the output
print(umap_df.head())
