In [2]:
import pyarrow as pa
from pyarrow import csv, feather, parquet, compute as pc
from pyarrow import parquet as pq
import pyarrow as pa
import json


# Data Schema

Deepscatter can't directly plot fields encoded as strings, but those fields can still be displayed and used for lookup.

Other fields should be float32 arrays to minimize size.

In [3]:
# Column types applied when the CSV is parsed. Text columns stay as strings,
# numeric columns are float32, and "Labels" is dictionary-encoded
# (int32 indices into a table of string values).
schema = pa.schema([
  pa.field("Title", pa.string()),
  pa.field("Journal", pa.string()),
  pa.field("PMID", pa.string()),
  pa.field("X_tfidf", pa.float32()),
  pa.field("Y_tfidf", pa.float32()),
  pa.field("X_bert", pa.float32()),
  pa.field("Y_bert", pa.float32()),
  pa.field("Year", pa.float32()),
  pa.field("Labels", pa.dictionary(pa.int32(), pa.string())),
])

In [4]:
# Load only the columns named in `schema`, parsing each one with the type
# declared there.
big = csv.read_csv(
  "/Users/benschmidt/Downloads/pubmed_dataset_v3.csv",
  convert_options=csv.ConvertOptions(
    column_types=schema,
    include_columns=schema.names,
  ),
)

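Rename the coordinate columns: the BERT coordinates become `x` and `y`, and the TF-IDF coordinates become `tfidf.x` and `tfidf.y`, because that format is easy to display in deepscatter. The y coordinates are also negated.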

In [6]:
# Replace the four raw coordinate columns with renamed copies appended at the
# end of the table, in the order x, y, tfidf.x, tfidf.y. Both y columns are
# multiplied by -1.
clean = (
  big
  .drop(["X_bert", "Y_bert", "X_tfidf", "Y_tfidf"])
  .append_column("x", big["X_bert"])
  .append_column("y", pc.multiply(big["Y_bert"], -1))
  .append_column("tfidf.x", big["X_tfidf"])
  .append_column("tfidf.y", pc.multiply(big["Y_tfidf"], -1))
)

Change the row ordering for display. The labeled rows come first, randomly shuffled, followed by the unlabeled rows (also shuffled).

In [7]:
from pyarrow import compute as pc

# Split the table in two on the "Labels" column: rows whose label is exactly
# "unlabeled" go to `unlab`, rows with any other label go to `lab`.
is_unlabeled = pc.equal(clean['Labels'], "unlabeled")
lab = clean.filter(pc.invert(is_unlabeled))
unlab = clean.filter(is_unlabeled)

In [8]:
import numpy as np
from numpy import random

# Seed the shuffle so that a fresh "Restart & Run All" reproduces the same row
# order (and therefore the same output file). Change the seed to get a
# different, but still repeatable, ordering.
SHUFFLE_SEED = 0
rng = np.random.default_rng(SHUFFLE_SEED)

# Independent random permutations of the row indices of each subset.
ix1 = rng.permutation(len(lab))
ix2 = rng.permutation(len(unlab))

# Reorder each subset by its permutation.
# NOTE: `lab` and `unlab` are rebound here, so re-running this cell without
# re-running the filtering cell shuffles the already-shuffled tables.
lab = lab.take(ix1)
unlab = unlab.take(ix2)

# Shuffled labeled rows first, then shuffled unlabeled rows.
joint = pa.concat_tables([lab, unlab])

Write the combined table out as a Parquet file, then run quadfeather's tiling operation on it so the result can be served through deepscatter.

In [10]:
# Persist the reordered table; the tiling step below reads this same path.
pq.write_table(joint, "/tmp/big.parquet")


: 

: 

In [1]:
import quadfeather as qf

# Invoke quadfeather with CLI-style arguments: the Parquet file written above
# is the input and the tiles are written under the destination directory.
qf.main([
  "--files", "/tmp/big.parquet",
  "--destination", "/Users/benschmidt/scrolly_tiles/pubmed",
])