In [1]:
from interpac_embedding import FECDenoisingAutoEncoder, FECNetDataset
import pickle
import pandas as pd

cmh = pd.read_csv('./data/cm_header_file.csv')
cm = pd.read_csv('./data/cm.txt', sep='|', names=cmh.columns, quoting=3).set_index('CMTE_ID')

fecdf = pd.read_parquet("./fecpreprocd.parquet")

with open("./meta.pkl", "rb") as rf:
    lbls = pickle.load(rf)
cmdf = pd.DataFrame(lbls['ents'], columns=['CMTE_ID']).join(cm, on='CMTE_ID').fillna('N/A')
cmdf.head()

Unnamed: 0,CMTE_ID,CMTE_NM,TRES_NM,CMTE_ST1,CMTE_ST2,CMTE_CITY,CMTE_ST,CMTE_ZIP,CMTE_DSGN,CMTE_TP,CMTE_PTY_AFFILIATION,CMTE_FILING_FREQ,ORG_TP,CONNECTED_ORG_NM,CAND_ID
0,C00000000,,,,,,,,,,,,,,
1,C00000059,HALLMARK CARDS PAC,SARAH MOE,2501 MCGEE,MD #500,KANSAS CITY,MO,64108.0,U,Q,UNK,M,C,,
2,C00000422,AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...,"WALKER, KEVIN MR.","25 MASSACHUSETTS AVE, NW",SUITE 600,WASHINGTON,DC,200017400.0,B,Q,,M,M,DELAWARE MEDICAL PAC,
3,C00000489,D R I V E POLITICAL FUND CHAPTER 886,JERRY SIMS JR,3528 W RENO,,OKLAHOMA CITY,OK,73107.0,U,N,,Q,L,,
4,C00000547,KANSAS MEDICAL SOCIETY POLITICAL ACTION COMMITTEE,JERRY SLAUGHTER,623 SW 10TH AVE,,TOPEKA,KS,666121627.0,U,Q,UNK,Q,M,KANSAS MEDICAL SOCIETY,


In [2]:
fnds = FECNetDataset(lbls, fecdf)

from pathlib import Path
cdir = Path('./fec-d-ckpt')
latest = sorted((c for c in cdir.glob('*.ckpt')), key=lambda c: c.stat().st_mtime)[-1]
latest

Column types:
{'conts': ['amt_scaled', 'time_abs_scaled'], 'cats': ['ENTITY_TP', 'TRANSACTION_TP'], 'bins': ['amt_positive'], 'ents': ['CMTE_ID', 'OTHER_ID']}


PosixPath('fec-d-ckpt/epoch=149-step=6899.ckpt')

In [3]:
model = FECDenoisingAutoEncoder.load_from_checkpoint(
    str(latest),
    n_entities=len(lbls["ents"]),
    n_etypes=len(lbls["etype"]),
    n_txtypes=len(lbls["txtype"]),
    n_conts=len(fnds.contcols),
    n_bins=len(fnds.bincols),
    max_epochs=0,
)

In [4]:
rawe = model.entity_embedding.weight.detach().cpu().numpy()

In [5]:
from umap import UMAP
import numpy as np

In [6]:
import matplotlib.pyplot as plt

In [7]:
import holoviews as hv
hv.extension('bokeh')

In [8]:
dmap = UMAP(verbose=True, min_dist=0.01, n_neighbors=30)
dmap.fit(rawe);

UMAP(dens_frac=0.0, dens_lambda=0.0, min_dist=0.01, n_neighbors=30,
     verbose=True)
Construct fuzzy simplicial set
Wed Aug 25 00:11:47 2021 Finding Nearest Neighbors
Wed Aug 25 00:11:47 2021 Building RP forest with 10 trees
Wed Aug 25 00:11:47 2021 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	 6  /  14
	Stopping threshold met -- exiting after 6 iterations
Wed Aug 25 00:11:52 2021 Finished Nearest Neighbor Search
Wed Aug 25 00:11:54 2021 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Aug 25 00:11:59 2021 Finished embedding


In [9]:
w=450
h=w
df = cmdf.assign(x=dmap.embedding_[:,0], y=dmap.embedding_[:,1])
#df = df[df['CMTE_TP'] != 'N/A']
(
    hv.Points(df, kdims=['x', 'y']).opts(width=w, height=h, color='CMTE_TP', cmap='Category20').opts(tools=['hover']) +
    hv.Points(df, kdims=['x', 'y']).opts(width=w, height=h, color='CMTE_DSGN', cmap='Category20') +
    hv.Points(df, kdims=['x', 'y']).opts(width=w, height=h, color='CMTE_PTY_AFFILIATION', cmap='Category20')

).cols(2)