In [None]:
from umap import UMAP
import numpy as np
import matplotlib.pyplot as plt
import holoviews as hv
# Load the HoloViews plotting backends used by the figures below (bokeh listed first, then plotly).
hv.extension(
    "bokeh",
    "plotly",
)

In [None]:
import faiss
from autofaiss import build_index

In [None]:
from tblenc import FECNetPL, FECNetDataset, FECNetConfig
import torch
import pickle
import pandas as pd
import numpy as np

# Input tables: `cm` is joined on CMTE_ID below; `fecdf` feeds the FECNetDataset.
cm = pd.read_parquet('./cm.parquet')
fecdf = pd.read_parquet("./fecpreprocd.parquet")

# NOTE(review): pickle.load can execute arbitrary code -- only open a meta.pkl you produced yourself.
with open("./meta.pkl", "rb") as meta_file:
    lbls = pickle.load(meta_file)

# One row per entry of lbls['ents'] (default 0..n-1 index), enriched with the matching `cm` row.
# `join(on=...)` matches the CMTE_ID column against cm's *index* --
# presumably cm is indexed by CMTE_ID; verify against how cm.parquet was written.
entity_ids = pd.DataFrame(lbls['ents'], columns=['CMTE_ID'])
cmdf = entity_ids.join(cm, on='CMTE_ID')
len(cmdf)

In [None]:
# Keep only entities that have a committee name; the original row index is preserved
# (it is used further down to select rows of the embedding matrix).
has_name = cmdf['CMTE_NM'].notna()
named_cmtes = cmdf[has_name]
named_cmtes.to_parquet('ncdf.parquet')

In [None]:
from pathlib import Path

# Dataset wrapper built from the label metadata and the preprocessed table.
fnds = FECNetDataset(lbls, fecdf)

# Directory that is searched (recursively) for model checkpoints.
cdir = Path('./fec-ckpt')

In [None]:
# Pick the most recently modified checkpoint anywhere under `cdir`.
ckpts = list(cdir.glob('**/*.ckpt'))
if not ckpts:
    # An empty directory used to surface as a bare IndexError from `sorted(...)[-1]`;
    # fail with a message that says what is missing and where it was looked for.
    raise FileNotFoundError(f'no *.ckpt files found under {cdir.resolve()}')
latest = max(ckpts, key=lambda c: c.stat().st_mtime)

model = FECNetPL.load_from_checkpoint(str(latest))

# Entity-embedding weight matrix, detached from autograd and moved to host memory as NumPy.
rawe = model.core.encoder.entity_embedding.weight.detach().cpu().numpy()

# Rows for the named committees only. This uses named_cmtes' index labels as row positions
# in the embedding matrix -- assumes row i of `rawe` corresponds to lbls['ents'][i]; TODO confirm.
toemb = rawe[named_cmtes.index.to_numpy()]

# Show which checkpoint was loaded.
latest

In [None]:
from sklearn.decomposition import PCA
# Linear baseline: project the selected embeddings onto their first two principal components.
pca = PCA(n_components=2)
pcaemb = pca.fit_transform(toemb)

w = h = 450
emb = pcaemb
df = named_cmtes.assign(x=emb[:, 0], y=emb[:, 1])
# Optional filter (disabled): df = df[df['CMTE_TP'] != 'N/A']
print(latest)

# One scatter per categorical column; the flag records which panels get a hover tool.
panel_specs = [
    ('CMTE_TP', True),
    ('CMTE_DSGN', False),
    ('CMTE_PTY_AFFILIATION', True),
    ('ORG_TP', False),
]
panels = []
for color_field, with_hover in panel_specs:
    pts = hv.Points(df, kdims=['x', 'y']).opts(width=w, height=h, color=color_field, cmap='Category20')
    if with_hover:
        pts = pts.opts(tools=['hover'])
    panels.append(pts)

# 2x2 grid of the four panels.
hv.Layout(panels).cols(2)

In [None]:
# 2-D UMAP projection of the selected embeddings under cosine distance.
# random_state pins the layout so a fresh Restart & Run All reproduces the same figure
# (and matches the seeded UMAP used for clustering further down). Note that umap-learn
# runs single-threaded when a seed is set.
dmap = UMAP(metric="cosine", min_dist=0.01, verbose=True, random_state=42)  # , n_neighbors=30)
dmap.fit(toemb)
uemb = dmap.embedding_

w = h = 450
emb = uemb
df = named_cmtes.assign(x=emb[:, 0], y=emb[:, 1])
# Optional filter (disabled): df = df[df['CMTE_TP'] != 'N/A']

# Record which checkpoint the figure was produced from (printed once; it used to be printed twice).
print(latest)

# One scatter per categorical column; only the first panel carries a hover tool.
panel_specs = [
    ("CMTE_TP", True),
    ("CMTE_DSGN", False),
    ("CMTE_PTY_AFFILIATION", False),
    ("ORG_TP", False),
]
panels = []
for color_field, with_hover in panel_specs:
    pts = hv.Points(df, kdims=["x", "y"]).opts(width=w, height=h, color=color_field, cmap="Category20")
    if with_hover:
        pts = pts.opts(tools=["hover"])
    panels.append(pts)

# 2x2 grid of the four panels.
hv.Layout(panels).cols(2)

In [None]:
# df.to_parquet('./visdf.parquet', compression='zstd')

In [None]:
# Work on a copy: faiss.normalize_L2 rescales the rows of its argument in place,
# and `rawe` must stay untouched for the other cells.
prep = rawe.copy()
faiss.normalize_L2(prep)
# prep = rawe  (alternative: skip normalisation)

# Persist the normalised embedding matrix next to the notebook.
np.save(file='./embeddings.npy', arr=prep)

In [None]:
# Build a CPU similarity index over the normalised vectors with the inner-product metric
# (on unit-length rows, inner product is cosine similarity). The second return value is kept as `iinf`.
index, iinf = build_index(
    prep,
    metric_type='ip',
    use_gpu=False,
)

In [None]:
# Sanity check: query the index with row 3 (kept 2-D via the 3:4 slice) and fetch its 10 best matches.
query = prep[3:4]
ds, ids = index.search(query, 10)
(ds, ids)

In [None]:
# Look up the committees returned for the first (only) query row.
# FAISS pads the id array with -1 when an index yields fewer than k results; -1 is not a
# label in cmdf, so drop the padding before the label-based lookup instead of raising KeyError.
hit_ids = ids[0]
hit_ids = hit_ids[hit_ids >= 0]
cmdf.loc[hit_ids]

In [None]:
from phate import PHATE

In [None]:
# PHATE projection of the selected embeddings, with cosine distance for both the kNN graph and the MDS step.
pop = PHATE(knn_dist='cosine', mds_dist='cosine')
# pop = PHATE()
pop.fit(toemb)
pemb = pop.transform(toemb)

w = h = 450
emb = pemb
df = named_cmtes.assign(x=emb[:, 0], y=emb[:, 1])
# Optional filter (disabled): df = df[df['CMTE_TP'] != 'N/A']
print(latest)

# One scatter per categorical column; only the first panel carries a hover tool.
panel_specs = [
    ('CMTE_TP', True),
    ('CMTE_DSGN', False),
    ('CMTE_PTY_AFFILIATION', False),
    ('ORG_TP', False),
]
panels = []
for color_field, with_hover in panel_specs:
    pts = hv.Points(df, kdims=['x', 'y']).opts(width=w, height=h, color=color_field, cmap='Category20')
    if with_hover:
        pts = pts.opts(tools=['hover'])
    panels.append(pts)

# 2x2 grid of the four panels.
hv.Layout(panels).cols(2)

In [None]:
# Committee table that carries a `desc` column; preview just the name and description columns.
cd = pd.read_parquet('./cmdesc.parquet')
preview_cols = ['CMTE_NM', 'desc']
cd[preview_cols]

In [None]:
from hdbscan import HDBSCAN

# Seeded 100-dimensional UMAP (cosine metric, min_dist=0) used as the input space for clustering.
# NOTE(review): recent umap-learn releases appear to force n_jobs=1 whenever random_state is set,
# so n_jobs=-1 probably has no effect here -- confirm against the installed version.
dcmap = UMAP(
    metric="cosine",
    verbose=True,
    n_components=100,
    min_dist=0.0,
    random_state=42,
    n_neighbors=30,
    n_jobs=-1,
)
dcemb = dcmap.fit_transform(toemb)

# Density-based clustering with default HDBSCAN settings; the fitted estimator is the cell output.
clusterer = HDBSCAN()
clusterer.fit(dcemb)

In [None]:
# Attach each committee's HDBSCAN label and list the members of every cluster.
cldf = named_cmtes.assign(cluster=clusterer.labels_)

# The loop variables deliberately avoid the name `df`: the plotting cells keep their working
# frame in the notebook-global `df`, and a loop variable of that name would overwrite it.
for cluster_id, members in cldf.groupby('cluster'):
    print(f'Cluster {cluster_id} has {len(members)} members')
    if cluster_id == -1:
        # -1 is HDBSCAN's noise label: report its size but do not list the members.
        continue
    print(members[['CMTE_NM', 'CMTE_ID']])
    print()

In [None]:
# Large single scatter of the 2-D UMAP layout (`uemb`), coloured by cluster label.
w = h = 900
emb = uemb
df = cldf.assign(x=emb[:, 0], y=emb[:, 1])

cluster_pts = hv.Points(df, kdims=['x', 'y']).opts(width=w, height=h, color='cluster', cmap='Category20')
cluster_pts.opts(tools=['hover'])

In [None]:
import sklearn
from nomic import atlas

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee that
# `sklearn.preprocessing` is loaded as an attribute of the package, so the previous
# `sklearn.preprocessing.normalize(...)` only worked if something else had imported it first.
from sklearn.preprocessing import normalize

# Upload the row-normalised embeddings and their metadata to Nomic Atlas.
# NOTE(review): `embeddings` comes from `toemb` (rows of named_cmtes) while `data` comes from `cd`;
# the two must have the same length and row order -- confirm cmdesc.parquet was derived from named_cmtes.
ap = atlas.map_embeddings(
    name='US Federal Election Commission PAC embeddings', 
    description='Embeddings created from the "Any transaction from one committee to another" dataset (itoth.txt) from the Federal Election Commission',
    id_field='CMTE_ID', 
    embeddings=normalize(toemb),
    # Alternative metadata source (disabled):
    # data=named_cmtes.assign(CMTE_NM=named_cmtes['CMTE_NM'].str.lower()).to_dict('records'),
    data=cd.to_dict('records'),
    topic_label_field='desc',
    colorable_fields=['CMTE_TP', 'CMTE_DSGN', 'CMTE_PTY_AFFILIATION', 'ORG_TP'],
    reset_project_if_exists=True,
    build_topic_model=True,
    projection_n_neighbors=30,
)

In [None]:
from sklearn.manifold import TSNE
import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removed the old name, so a
# hard-coded `n_iter=` raises TypeError on current releases. Pick whichever keyword the
# installed version accepts so the cell runs on both old and new scikit-learn.
_iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

# 2-D t-SNE of the selected embeddings under cosine distance, 1500 optimisation iterations.
tsop = TSNE(
    n_components=2,
    metric='cosine',
    n_jobs=-1,
    verbose=2,
    perplexity=50,
    **{_iter_kw: 1500},
)
temb = tsop.fit_transform(toemb)

In [None]:
# Plot the t-SNE layout, one panel per categorical column.
w = h = 450
emb = temb
df = named_cmtes.assign(x=emb[:, 0], y=emb[:, 1])
# Optional filter (disabled): df = df[df['CMTE_TP'] != 'N/A']
print(latest)

# The flag records which panels get a hover tool.
panel_specs = [
    ('CMTE_TP', True),
    ('CMTE_DSGN', False),
    ('CMTE_PTY_AFFILIATION', True),
    ('ORG_TP', False),
]
panels = []
for color_field, with_hover in panel_specs:
    pts = hv.Points(df, kdims=['x', 'y']).opts(width=w, height=h, color=color_field, cmap='Category20')
    if with_hover:
        pts = pts.opts(tools=['hover'])
    panels.append(pts)

# 2x2 grid of the four panels.
hv.Layout(panels).cols(2)