In [1]:
# Enable IPython's autoreload extension in mode 2, so imported project modules
# are re-imported before each cell runs and edits to them are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import fer.data as fecdata
from pathlib import Path
import torch.nn.functional as F
# Device the model is moved to before its checkpoint is loaded.
device = 'cpu'

In [3]:
# Fetch the transaction frame from the project's data module
# (presumably PAC-to-PAC transfers, judging by the function name).
df = fecdata.pac_to_pac_transactions()
# `prepare` returns three values: `dataset` (indexed by "src"/"dst"/"etype"/"ttype"
# when the model is sized), a replacement for `df`, and `labelers`, whose
# 'id_labeler' entry is used later to map integer ids back to committee ids.
dataset, df, labelers = fecdata.prepare(df)

In [4]:
from fer.model import Config, FECEncoder, TabDataset, TabularDenoiser
import torch

# Model hyper-parameters.
# NOTE(review): these presumably have to match the settings the checkpoint
# loaded later in the notebook was trained with -- confirm.
cfg = Config(
    embedding_init_std=1 / 512.0,
    tied_encoder_decoder_emb=True,
    entity_emb_normed=False,
    cos_sim_decode_entity=False,
    transformer_dim=384,
    transformer_heads=12,
    transformer_layers=8,
    entity_dim=384,
)

# Training settings; no other cell visible in this notebook reads them.
lr = 1e-3
n_epochs = 4

# Each vocabulary size is (largest integer id seen in the column) + 1.
# Entities share one id space across the "src" and "dst" columns.
model = TabularDenoiser(
    cfg,
    n_entities=max(dataset["src"].max(), dataset["dst"].max()) + 1,
    n_etype=dataset["etype"].max() + 1,
    n_ttype=dataset["ttype"].max() + 1,
)

# Dataset wrapper around the prepared columns.
tds = TabDataset(dataset)

Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda


In [5]:
from torch.utils.data import DataLoader, random_split
# Move the freshly built model to the target device, then wrap it with
# torch.compile *before* loading: the recorded output of this cell shows every
# checkpoint key matching while the compiled wrapper is in place.
# NOTE(review): re-running this cell without re-creating `model` first wraps the
# already-compiled module again, which will likely break key matching.
model = model.to(device)
model = torch.compile(model)
# map_location remaps tensors saved from another device (e.g. CUDA) onto
# `device`, so the checkpoint also loads on hosts without that device.
# SECURITY: torch.load unpickles the file -- only load checkpoints you trust
# (consider weights_only=True if the file holds nothing but tensors).
model.load_state_dict(torch.load('./rose-wind-106.bin', map_location=device))

<All keys matched successfully>

In [6]:
from umap import UMAP
import umap.plot as upl

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [7]:
import holoviews as hv
# Load the Bokeh plotting backend for HoloViews.
hv.extension('bokeh')

In [8]:
# Pull the encoder's entity-embedding weight matrix out of the model:
# detached from autograd, copied to host memory, and converted to a NumPy array.
entemb = model.encoder.entity_embeddings.weight.detach().cpu().numpy()
# Displayed as the cell output to sanity-check (rows, embedding width).
entemb.shape

(17567, 384)

In [9]:
import pandas as pd
# `classes_` of the id labeler's encoder; position i presumably holds the
# committee id that was encoded as integer i (assumes a scikit-learn style
# LabelEncoder -- TODO confirm against fecdata.prepare).
id2cid = labelers['id_labeler'].encoder.classes_
# One row per encoded id. The default RangeIndex therefore equals the row's
# position in `id2cid`.
idorder = pd.DataFrame({'CMTE_ID':id2cid})

In [10]:
def read_frame(header_file, data_file, dtypes=None):
    """Read a header-less, pipe-delimited file using column names from a header CSV.

    Parameters
    ----------
    header_file : str or path-like
        CSV file whose header row lists the column names of ``data_file``.
    data_file : str or path-like
        ``|``-separated data file that has no header row of its own.
    dtypes : mapping, optional
        Per-column dtype overrides. Every column not listed here is read as
        ``str``. ``None`` (the default) and an empty mapping both mean
        "no overrides".

    Returns
    -------
    pandas.DataFrame
        The contents of ``data_file`` with the header file's column names.
    """
    # Only the column names are needed, so do not parse any data rows.
    columns = pd.read_csv(header_file, nrows=0).columns

    # Default every column to str, then apply the caller's overrides (if any).
    column_dtypes = {name: str for name in columns}
    if dtypes:
        column_dtypes.update(dtypes)

    return pd.read_csv(data_file, sep="|", names=columns, dtype=column_dtypes)

def read_cm(year, basedir='./data'):
    """Load ``{basedir}/{year}/cm.txt`` for a single year.

    Column names come from ``{basedir}/cm_header_file.csv``. The four
    categorical committee columns are explicitly requested as ``"str"``.
    (Presumably this is the FEC committee master file -- inferred from the
    ``CMTE_*`` column names.)

    Parameters
    ----------
    year : int or str
        Sub-directory of ``basedir`` holding that year's ``cm.txt``.
    basedir : str
        Root data directory.

    Returns
    -------
    Whatever ``read_frame`` returns for those two paths.
    """
    string_columns = (
        "CMTE_DSGN",
        "CMTE_TP",
        "CMTE_PTY_AFFILIATION",
        "CMTE_FILING_FREQ",
    )
    return read_frame(
        f"{basedir}/cm_header_file.csv",
        f"{basedir}/{year}/cm.txt",
        dtypes=dict.fromkeys(string_columns, "str"),
    )

# Attach committee metadata to every encoded id:
#   1. stack the 2020/2022/2024 committee files,
#   2. keep the last record per CMTE_ID (i.e. the latest cycle in stacking order),
#   3. left-join onto `idorder`, preserving its integer index,
#   4. drop ids for which no committee name was found.
cmdf = (
    idorder
    .join(
        pd.concat([read_cm(year) for year in (2020, 2022, 2024)])
        .drop_duplicates(subset=['CMTE_ID'], keep='last')
        .set_index('CMTE_ID'),
        on='CMTE_ID',
    )
    .dropna(subset=['CMTE_NM'])
)

In [11]:
# Keep only the embedding rows for ids that survived the metadata join;
# `cmdf.index` still carries the integer positions inherited from `idorder`.
# NOTE(review): assumes embedding row i belongs to encoded id i -- confirm
# against how fecdata.prepare assigns entity ids.
namedemb = entemb[cmdf.index]
# Displayed as the cell output to show how many rows were kept.
namedemb.shape

(12886, 384)

In [12]:
import numpy as np
# Project the named embeddings to 2-D with UMAP using cosine distance.
# UMAP is stochastic; fixing random_state makes the layout reproducible across
# kernel restarts (at the cost of UMAP running single-threaded).
uop = UMAP(verbose=True, metric='cosine', random_state=42)
e2d = uop.fit_transform(namedemb)
# Fresh 0..n-1 index so the coordinates line up positionally with
# `cmdf.reset_index(drop=True)` when the two are joined for plotting.
eframe = pd.DataFrame(e2d, columns=['x', 'y'])

UMAP(angular_rp_forest=True, metric='cosine', verbose=True)
Tue Aug 22 01:44:37 2023 Construct fuzzy simplicial set
Tue Aug 22 01:44:37 2023 Finding Nearest Neighbors
Tue Aug 22 01:44:37 2023 Building RP forest with 11 trees
Tue Aug 22 01:44:41 2023 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	 6  /  14
	Stopping threshold met -- exiting after 6 iterations
Tue Aug 22 01:44:53 2023 Finished Nearest Neighbor Search
Tue Aug 22 01:44:55 2023 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Tue Aug 22 01:45:03 2023 Finished embedding


In [13]:
sz = 450
# Join the 2-D coordinates with the committee metadata once (both sides use a
# positional 0..n-1 index) instead of rebuilding the same frame for every panel.
points_frame = eframe.join(cmdf.reset_index(drop=True))
# One scatter panel per categorical field, coloured by that field and titled
# with it so the otherwise identical panels can be told apart; 2 panels per row.
hv.Layout([
    hv.Points(points_frame).opts(
        width=sz, height=sz, color=field, cmap='Category20', title=field,
    )
    for field in ('CMTE_PTY_AFFILIATION', 'CMTE_DSGN', 'CMTE_TP', 'ORG_TP')
]).cols(2)

  layout_plot = gridplot(
  layout_plot = gridplot(


In [14]:
def do_atlas(do_norm=True):
    """Upload the named committee embeddings and their metadata to Nomic Atlas.

    Reads the notebook globals ``namedemb`` (embedding matrix) and ``cmdf``
    (metadata frame, re-indexed positionally before upload).

    Parameters
    ----------
    do_norm : bool
        When true, the embeddings are passed through scikit-learn's
        ``normalize`` (default settings) and the project name gets a
        ``-norm`` suffix; otherwise the raw embeddings are uploaded.

    Notes
    -----
    ``reset_project_if_exists=True`` is passed to Atlas, so an existing project
    of the same name is reset. Nothing is returned.
    """
    # Imported lazily so the rest of the notebook runs without these packages.
    from nomic import atlas
    from sklearn.preprocessing import normalize

    if do_norm:
        vectors = normalize(namedemb)
        suffix = '-norm'
    else:
        vectors = namedemb
        suffix = ''

    atlas.map_embeddings(
        vectors,
        data=cmdf.reset_index(drop=True),
        name='fecentrep-2' + suffix,
        colorable_fields=['CMTE_TP', 'CMTE_DSGN', 'ORG_TP', 'CMTE_PTY_AFFILIATION'],
        id_field='CMTE_ID',
        topic_label_field='CMTE_NM',
        reset_project_if_exists=True,
    )

In [15]:
# Upload the normalized embeddings. do_atlas passes reset_project_if_exists=True,
# so an existing Atlas project with the same name is cleared first.
do_atlas(do_norm=True)

[32m2023-08-22 01:45:10.328[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m__init__[0m:[36m691[0m - [1mFound existing project `fecentrep-2-norm` in organization `aaron`. Clearing it of data by request.[0m
[32m2023-08-22 01:45:13.539[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m779[0m - [1mCreating project `fecentrep-2-norm` in organization `aaron`[0m
[32m2023-08-22 01:45:14.237[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
4it [00:09,  2.32s/it]                       
[32m2023-08-22 01:45:23.613[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1411[0m - [1mUpload succeeded.[0m
[32m2023-08-22 01:45:23.616[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-08-22 01:45:24.534[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1121[0

In [16]:
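# Optional: uncomment to also upload the un-normalized embeddings as a
# separate Atlas project (named without the '-norm' suffix).
# do_atlas(do_norm=False)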