## Pairwise

In [None]:
client = Client()
client

In [None]:
from dask.distributed import Client
import joblib
# from sklearn.metrics import pairwise_distances
from dask_ml.metrics.pairwise import pairwise_distances

with joblib.parallel_backend('dask'):
    pairwise_distances(wt_corpus, owtc_corpus)

## Another test

In [8]:
wt_docs = split_docs(np.load(wt_files[0]))

In [None]:
wt_docs

In [None]:
owtc_docs = split_docs(np.load(owtc_files[0]))

In [None]:
wt_docs[0]

In [27]:
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked

In [28]:
import editdistance

In [29]:
gen = pairwise_distances_chunked(wt_docs, owtc_docs, metric=editdistance.eval, n_jobs=8)

In [1]:
# HACK: use project root as the working directory 
from pathlib import Path

while Path.cwd().name != 'language-model-toxicity':
    %cd ..

/home/sam/research/language-model-toxicity


In [2]:
from typing import List
import logging
from pathlib import Path
import tempfile

import dask
import dask.array as da
from joblib import Memory
import numpy as np
from tqdm.auto import tqdm

from utils.constants import DATA_DIR, OUTPUT_DIR

# Disable logging from transformers
logging.disable(logging.CRITICAL)

# Create joblib memory
mem = Memory(OUTPUT_DIR / 'cache' / 'webtext_overlap')

In [3]:
EOS = 50256
vocab_size = EOS + 1

def load_meta(bpe_dir: Path):
    files = [file for file in bpe_dir.iterdir() if file.suffix == '.npy']
    meta = [(np.count_nonzero(array == EOS) - 1, array.dtype)
            for array 
            in tqdm(map(np.load, files), total=len(files), desc='Loading meta')]
    shapes, dtypes = zip(*meta)
    return files, shapes, dtypes[0]

# Cache calls to load_meta
load_meta = mem.cache(load_meta)

## Load metadata

In [4]:
wt_dir = DATA_DIR / 'webtext'
wt_meta = load_meta(wt_dir)

In [5]:
owtc_dir = DATA_DIR / 'openwebtext_bpe'
owtc_meta = load_meta(owtc_dir)

## Load corpus

In [6]:
def split_docs(tokens: np.array) -> np.array:
    idx = np.nonzero(tokens == EOS)[0]
    docs = np.split(tokens, idx)
    docs = [doc[1:] for doc in docs if len(doc) > 1]
    return np.array(docs)

def load_corpus_into_memory(files: List[Path]):
    corpus = []
    for shard in tqdm(map(np.load, files), total=len(files)):
        corpus.extend(split_docs(shard))
    return corpus

delayed_load = dask.delayed(lambda f: split_docs(np.load(f)))

def load_corpus(meta):
    files, shapes, dtype = meta
    
    # Create delayed arrays
    delayed_arrays = list(map(delayed_load, files))
        
    # Concatenate arrays
    corpus = da.concatenate([da.from_delayed(array, shape=(shape,), dtype=dtype) 
                             for array, shape in zip(delayed_arrays, shapes)])

    return corpus

In [7]:
# Load OWTC into memory
# owtc_corpus = []
# for shard in tqdm(map(np.load, owtc_files), total=len(owtc_files)):
#     owtc_corpus.extend(split_docs(shard))

In [8]:
wt_corpus = load_corpus(wt_meta)
wt_corpus

Unnamed: 0,Array,Chunk
Bytes,33.13 MB,1.66 MB
Shape,"(8282020,)","(414101,)"
Count,60 Tasks,20 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 33.13 MB 1.66 MB Shape (8282020,) (414101,) Count 60 Tasks 20 Chunks Type int32 numpy.ndarray",8282020  1,

Unnamed: 0,Array,Chunk
Bytes,33.13 MB,1.66 MB
Shape,"(8282020,)","(414101,)"
Count,60 Tasks,20 Chunks
Type,int32,numpy.ndarray


In [9]:
owtc_corpus = load_corpus(owtc_meta)
owtc_corpus

Unnamed: 0,Array,Chunk
Bytes,16.01 MB,803.01 kB
Shape,"(8003003,)","(401504,)"
Count,60 Tasks,20 Chunks
Type,uint16,numpy.ndarray
"Array Chunk Bytes 16.01 MB 803.01 kB Shape (8003003,) (401504,) Count 60 Tasks 20 Chunks Type uint16 numpy.ndarray",8003003  1,

Unnamed: 0,Array,Chunk
Bytes,16.01 MB,803.01 kB
Shape,"(8003003,)","(401504,)"
Count,60 Tasks,20 Chunks
Type,uint16,numpy.ndarray


In [10]:
# Ideas for computing distance:
# - Locality-based hashing
# - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html
# - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# - CosineSimilarity

## Features

In [13]:
# import dask.bag as db
# owtc_bag = db.from_sequence(owtc_corpus)

In [14]:
from sklearn.feature_extraction import FeatureHasher

In [15]:
h = FeatureHasher()

In [16]:
owtc_corpus.map_blocks(h.transform).compute()

ValueError: `dtype` inference failed in `map_blocks`.

Please specify the dtype explicitly using the `dtype` kwarg.

Original error is below:
------------------------
AttributeError("'numpy.uint16' object has no attribute 'items'")

Traceback:
---------
  File "/home/sam/anaconda3/envs/lm-toxicity/lib/python3.7/site-packages/dask/array/core.py", line 343, in apply_infer_dtype
    o = func(*args, **kwargs)
  File "/home/sam/anaconda3/envs/lm-toxicity/lib/python3.7/site-packages/sklearn/feature_extraction/_hash.py", line 155, in transform
    self.alternate_sign, seed=0)
  File "sklearn/feature_extraction/_hashing_fast.pyx", line 51, in sklearn.feature_extraction._hashing_fast.transform
  File "/home/sam/anaconda3/envs/lm-toxicity/lib/python3.7/site-packages/sklearn/feature_extraction/_hash.py", line 150, in <genexpr>
    raw_X = (_iteritems(d) for d in raw_X)
  File "/home/sam/anaconda3/envs/lm-toxicity/lib/python3.7/site-packages/sklearn/feature_extraction/_hash.py", line 24, in _iteritems
    return d.iteritems() if hasattr(d, "iteritems") else d.items()


In [27]:
owtc_corpus.blocks[0]

Unnamed: 0,Array,Chunk
Bytes,801.58 kB,801.58 kB
Shape,"(400792,)","(400792,)"
Count,61 Tasks,1 Chunks
Type,uint16,numpy.ndarray
"Array Chunk Bytes 801.58 kB 801.58 kB Shape (400792,) (400792,) Count 61 Tasks 1 Chunks Type uint16 numpy.ndarray",400792  1,

Unnamed: 0,Array,Chunk
Bytes,801.58 kB,801.58 kB
Shape,"(400792,)","(400792,)"
Count,61 Tasks,1 Chunks
Type,uint16,numpy.ndarray


In [24]:
owtc_corpus[0].compute()

array([  477,   588,   284,   787,  1257,   286, 21204, 33902,   557,
         290,   262,   588,  1690,    13,   887,  3737,   356,  3588,
         470,  3501,   777,  1022,    12,  1169,    12, 26968,  7638,
        1576,  3884,    11,   780,  3360,   511,   898,  9188,   389,
         826,  1306,   284,   606,    13,   198,   198, 40640,   262,
         886,   286,   262,   717,  2278,  1022,   262, 25479,   290,
         262, 12469,    11,   309, 15571,   338,  1022,    12,  1169,
          12, 26968,   582,  7760, 19130, 12022,  5495,   257,  1310,
         410,   570,  5857,   319,  5780,   520,    13,  5593,  1022,
       32262,   829,    13,   632,   373,   534,  3210,  1057,    12,
        1659,    12,  1169,    12, 17805,   989,    11,  9593,  9640,
         286,   520,    13,  5593,   572,    12,   290,   319,    12,
        1169,    12,   501,   290, 19130, 12022,  2111,   284,  3197,
       17909,  1022,   262,   734,    13,   198,   198,  1026,   338,
        1310,  4240,

In [25]:
x = _

In [28]:
400792 * 20

8015840