In [21]:
from main import model, labelers
import pandas as pd
import torch
from umap import UMAP
import umap.plot as upl
from pathlib import Path
import os

In [22]:
# Class labels held by the "id_labeler" encoder imported from main.
# NOTE(review): presumably position i is the committee id the model encodes as
# integer i — confirm against main.py.
id2cid = labelers["id_labeler"].encoder.classes_
# One-column frame of those labels; its default positional index is what the
# later join keeps, so row positions line up with the encoder's ordering.
idorder = pd.DataFrame({"CMTE_ID": id2cid})


def read_frame(header_file, data_file, dtypes=None):
    """Read a headerless pipe-delimited file, taking column names from a CSV.

    Args:
        header_file: Path to a CSV whose header row supplies the column names.
        data_file: Path to the ``|``-separated data file (no header row).
        dtypes: Optional mapping of column name -> dtype. Columns not listed
            are read as ``str``.

    Returns:
        A ``pandas.DataFrame`` with the columns named in ``header_file``.
    """
    # Only the column names are needed, so do not parse any data rows.
    columns = pd.read_csv(header_file, nrows=0).columns
    # Default every column to str, then apply the caller's overrides.
    dt = {c: str for c in columns}
    if dtypes:
        dt.update(dtypes)
    return pd.read_csv(data_file, sep="|", names=columns, dtype=dt)


def read_cm(year, basedir="./data"):
    """Load ``{basedir}/{year}/cm.txt`` using ``{basedir}/cm_header_file.csv``.

    Args:
        year: Sub-directory of ``basedir`` that holds ``cm.txt``.
        basedir: Root data directory.

    Returns:
        The frame produced by ``read_frame`` for that year's file.
    """
    header_path = f"{basedir}/cm_header_file.csv"
    data_path = f"{basedir}/{year}/cm.txt"
    # Columns explicitly pinned to the "str" dtype.
    string_columns = (
        "CMTE_DSGN",
        "CMTE_TP",
        "CMTE_PTY_AFFILIATION",
        "CMTE_FILING_FREQ",
    )
    overrides = dict.fromkeys(string_columns, "str")
    return read_frame(header_path, data_path, dtypes=overrides)


# Stack the three yearly files; for a CMTE_ID seen more than once the row from
# the last file in the list is the one kept.
_cm_all_years = pd.concat([read_cm(cycle) for cycle in (2020, 2022, 2024)])
_cm_by_id = _cm_all_years.drop_duplicates(subset=["CMTE_ID"], keep="last").set_index(
    "CMTE_ID"
)

# Left-join onto idorder so idorder's positional index is preserved, drop ids
# with no matching row (no CMTE_NM), and fill the remaining gaps with "N/A".
_cm_joined = idorder.join(_cm_by_id, on="CMTE_ID")
_cm_named = _cm_joined.dropna(subset=["CMTE_NM"])
cmdf = _cm_named.fillna("N/A")

In [23]:
def k(p):
    """Sort key: return the last-modification timestamp of path ``p``."""
    return os.path.getmtime(p)
# Select the most recently modified *.bin file in the working directory.
_bin_files = list(Path('.').glob('*.bin'))
if not _bin_files:
    # Fail with a clear message instead of an IndexError on an empty listing.
    raise FileNotFoundError("no *.bin files found in the current directory")
# max() returns the same element as a descending sort's first item, without
# sorting the whole list.
latest = max(_bin_files, key=k)
print(latest)

jumping-fog-293-e15.bin


In [24]:
# Load the checkpoint on CPU. weights_only=True restricts unpickling to tensor
# data and plain containers, so a tampered file cannot execute arbitrary code.
_raw_state = torch.load(latest, map_location=torch.device("cpu"), weights_only=True)
# Strip the '_orig_mod.' marker from parameter names so the keys match `model`.
# NOTE(review): presumably the marker comes from saving a torch.compile-wrapped
# module — confirm against the training script.
sd = {name.replace('_orig_mod.', ''): tensor for name, tensor in _raw_state.items()}
model.load_state_dict(sd)
# Entity embedding matrix as a NumPy array.
entemb = model.encoder.entity_embeddings.weight.detach().cpu().numpy()
print(entemb.shape)
# Keep only the rows whose position appears in cmdf's index.
namedemb = entemb[cmdf.index]

(18629, 256)


In [25]:
# Fit UMAP on the selected embedding rows using cosine distance.
# NOTE(review): no random_state is passed, so the layout is expected to differ
# between runs — confirm that is acceptable.
uop = UMAP(
    metric='cosine',
    min_dist=0.01,
    n_jobs=-1,
    verbose=True,
)
# The plotting cells read the fitted `uop`; `p2d` holds the returned coordinates.
p2d = uop.fit_transform(namedemb)

UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.01, verbose=True)
Sat Dec  2 05:27:53 2023 Construct fuzzy simplicial set
Sat Dec  2 05:27:53 2023 Finding Nearest Neighbors
Sat Dec  2 05:27:53 2023 Building RP forest with 11 trees
Sat Dec  2 05:27:53 2023 NN descent for 14 iterations


	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14


In [None]:
# NOTE(review): presumably switches plot output to inline notebook rendering so
# the upl.show() calls below display in-cell — confirm against umap.plot.
upl.output_notebook()

In [None]:
# Interactive plot of the fitted UMAP model, labelled by the CMTE_DSGN column.
# Hover rows are re-indexed 0..n-1 so they match the embedding's row order.
_dsgn_hover = cmdf.reset_index(drop=True)
_dsgn_labels = cmdf["CMTE_DSGN"].values
p = upl.interactive(uop, labels=_dsgn_labels, hover_data=_dsgn_hover, point_size=2)
upl.show(p)

In [None]:
# Display the checkpoint path currently bound to `latest`.
latest

PosixPath('jumping-fog-293-e11.bin')

In [None]:
# Same interactive plot, this time labelled by the CMTE_PTY_AFFILIATION column.
_party_plot_kwargs = {
    "labels": cmdf["CMTE_PTY_AFFILIATION"].values,
    "hover_data": cmdf.reset_index(drop=True),  # rows aligned with the embedding
    "point_size": 2,
}
p = upl.interactive(uop, **_party_plot_kwargs)
upl.show(p)

In [None]:
# Intentionally disabled; uncomment to run main.py's `atlas` command on the selected checkpoint.
#!python main.py atlas {str(latest)}