In [1]:
# HACK: use project root as the working directory 
from pathlib import Path

while Path.cwd().name != 'language-model-toxicity':
    %cd ..

/homes/gws/sgehman/language-model-toxicity


In [2]:
from typing import List
import logging
from pathlib import Path
import tempfile

import dask
import dask.array as da
from joblib import Memory, Parallel, delayed
import numpy as np
from tqdm.auto import tqdm

from utils.constants import DATA_DIR, OUTPUT_DIR

# Create joblib memory: an on-disk function-result cache rooted at
# OUTPUT_DIR / 'cache' / 'webtext_overlap'. Functions wrapped with `mem.cache`
# reuse stored results instead of recomputing them.
mem = Memory(OUTPUT_DIR / 'cache' / 'webtext_overlap')

In [3]:
# Token id that this notebook treats as the document separator in BPE shards
# (presumably GPT-2's end-of-text id — TODO confirm against the tokenizer).
EOS = 50256
# One more than EOS; assumes EOS is the largest token id — TODO confirm.
vocab_size = EOS + 1

def bpe_files(bpe_dir: Path) -> List[Path]:
    """Return the `.npy` files directly inside `bpe_dir`, sorted by path.

    `Path.iterdir()` yields entries in arbitrary, filesystem-dependent order.
    Sorting makes the shard order (and therefore the row order of anything
    built by concatenating shards) reproducible across runs and machines.

    Args:
        bpe_dir: Directory to scan (not recursive).

    Returns:
        Sorted list of paths whose suffix is exactly '.npy'.
    """
    return sorted(file for file in bpe_dir.iterdir() if file.suffix == '.npy')

def load_meta(bpe_dir: Path):
    """Scan every shard in `bpe_dir` and collect per-shard metadata.

    Each shard is loaded in full, one at a time, to count its EOS tokens.

    Returns:
        A `(files, shapes, dtype)` triple where `files` is the list from
        `bpe_files`, `shapes` is a tuple holding (number of EOS tokens - 1)
        for each shard, and `dtype` is the dtype of the first shard.
    """
    files = bpe_files(bpe_dir)

    meta = []
    shards = tqdm(map(np.load, files), total=len(files), desc='Loading meta')
    for tokens in shards:
        eos_count = np.count_nonzero(tokens == EOS)
        meta.append((eos_count - 1, tokens.dtype))

    # Transpose the list of (count, dtype) pairs into two parallel tuples.
    shapes, dtypes = zip(*meta)
    first_dtype = dtypes[0]
    return files, shapes, first_dtype

# Cache calls to load_meta: rebind the name to the wrapper returned by
# `mem.cache`, so later calls go through the joblib cache held by `mem`.
load_meta = mem.cache(load_meta)

## Load metadata

In [4]:
# Shard metadata for the directory DATA_DIR / 'webtext'.
# load_meta returns a (files, shapes, dtype) triple.
wt_dir = DATA_DIR / 'webtext'
wt_meta = load_meta(wt_dir)

In [5]:
# Shard metadata for the directory DATA_DIR / 'openwebtext_bpe'.
# load_meta returns a (files, shapes, dtype) triple.
owtc_dir = DATA_DIR / 'openwebtext_bpe'
owtc_meta = load_meta(owtc_dir)

In [6]:
# First element of the load_meta result: the list of shard file paths.
owtc_files = owtc_meta[0]

## Load corpus

In [7]:
def split_docs(tokens: np.ndarray, eos=None) -> np.ndarray:
    """Split a flat token array into documents at every EOS token.

    Args:
        tokens: 1-D array of token ids.
        eos: Separator token id. Defaults to the module-level `EOS`.

    Returns:
        A 1-D object array with one entry per non-empty document; each entry
        is a 1-D array of that document's tokens with the separator removed.
    """
    if eos is None:
        eos = EOS

    boundaries = np.nonzero(tokens == eos)[0]
    chunks = np.split(tokens, boundaries)

    docs = []
    # Only the leading chunk can lack an EOS prefix (it is empty when the
    # array starts with EOS), so keep it whole rather than dropping its
    # first real token.
    if len(chunks[0]) > 0:
        docs.append(chunks[0])
    # Every later chunk starts with the EOS it was split on; strip it and
    # skip chunks that contain nothing else.
    docs.extend(chunk[1:] for chunk in chunks[1:] if len(chunk) > 1)

    # Fill an object array element by element. `np.array(docs)` raises on
    # ragged input in recent NumPy and collapses equal-length documents into
    # a 2-D array, so the result shape would depend on the data.
    result = np.empty(len(docs), dtype=object)
    for i, doc in enumerate(docs):
        result[i] = doc
    return result

def load_corpus_into_memory(files: List[Path]):
    """Eagerly load every shard and return all documents as one flat list.

    Shards are loaded one at a time (with a progress bar) and each is split
    into documents with `split_docs`; documents keep shard order.
    """
    shards = tqdm(map(np.load, files), total=len(files))
    return [doc for shard in shards for doc in split_docs(shard)]

# Lazy counterpart of "load one shard file and split it into documents":
# calling delayed_load(f) builds a dask task instead of reading the file.
delayed_load = dask.delayed(lambda f: split_docs(np.load(f)))

def load_corpus(meta):
    """Build a lazy dask array covering all shards described by `meta`.

    Args:
        meta: A `(files, shapes, dtype)` triple as returned by `load_meta`.

    Returns:
        The concatenation of one dask chunk per shard; each chunk wraps a
        `delayed_load` task and is declared with length `shape` and `dtype`.
    """
    files, shapes, dtype = meta

    # NOTE(review): each chunk is produced by split_docs, so the declared
    # `dtype` (taken from the raw shard) may not match what the chunk actually
    # holds — confirm before relying on dtype-sensitive dask operations.
    chunks = []
    for path, n_docs in zip(files, shapes):
        pending = delayed_load(path)
        chunks.append(da.from_delayed(pending, shape=(n_docs,), dtype=dtype))

    return da.concatenate(chunks)

In [8]:
from itertools import chain
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer

def _load_shard(file: Path):
    """Load one shard from disk and return its documents (via `split_docs`).

    Prints a progress line before loading and before splitting.
    """
    shard_name = file.stem
    print("Loading shard:", shard_name)
    tokens = np.load(file)
    print("Splitting documents:", shard_name)
    return split_docs(tokens)

def load_corpus_vectors(files: List[Path], n_jobs: int):
    """Load shards in parallel and count-vectorize their documents.

    Args:
        files: Shard files to load with `_load_shard`.
        n_jobs: Number of joblib workers; one pool is reused for both stages.

    Returns:
        A list with one `CountVectorizer.transform` result per shard, in the
        same order as `files`.
    """
    with Parallel(n_jobs=n_jobs) as pool:
        print("Loading shards...")
        load_jobs = (delayed(_load_shard)(path) for path in files)
        shards = pool(load_jobs)

        print("CountVectorizing...")
        # Documents are already token sequences, so the analyzer passes them
        # through untouched; the vocabulary is every id from 0 through EOS.
        passthrough = lambda x: x
        vectorizer = CountVectorizer(vocabulary=range(EOS + 1), analyzer=passthrough)

        transform_jobs = (delayed(vectorizer.transform)(docs) for docs in shards)
        vectorized_docs = pool(transform_jobs)

    return vectorized_docs

In [9]:
from scipy.sparse import vstack, save_npz

In [10]:
# Vectorize every shard listed in owtc_files using 20 joblib workers;
# the result is a list with one matrix per shard.
vectorized_docs = load_corpus_vectors(owtc_files, n_jobs=20)
# Stack the per-shard matrices row-wise into a single sparse matrix.
# Note: this rebinds `vectorized_docs` from a list to the stacked matrix.
vectorized_docs = vstack(vectorized_docs)
# Persist the stacked matrix under OUTPUT_DIR with the base name 'vecs'.
save_npz(OUTPUT_DIR / 'vecs', vectorized_docs)

Loading shards...
CountVectorizing...
