In [4]:
# HACK: use project root as the working directory 
from pathlib import Path

while Path.cwd().name != 'language-model-toxicity':
    %cd ..

/homes/gws/sgehman/language-model-toxicity


In [21]:
import logging
from typing import List
import pickle
import re

from joblib import Memory, Parallel, delayed, dump, load
from lsh import cache, minhash
import numpy as np
from itertools import chain, islice
from tqdm.auto import tqdm

from utils.constants import DATA_DIR, OUTPUT_DIR
from utils.webtext import load_meta, delayed_corpus, split_docs

# joblib memoisation store located at OUTPUT_DIR / 'cache'; the document loaders below are wrapped with it.
mem = Memory(OUTPUT_DIR / 'cache')

## Load candidate data from disk

In [6]:
# Directory holding the outputs of one LSH run. The integers in its name are parsed
# further down as the char n-gram size, the number of seeds and the number of bands.
duplicates_dir = OUTPUT_DIR / 'lsh_duplicates' / 'char_ngram_5_seeds_100_bands_10_str_2'
# Shell escape: list what the directory contains.
!ls {duplicates_dir}

all_duplicates.joblib  doc_ids.joblib  fingerprints.npy
all_duplicates.pkl     doc_ids.pkl


In [8]:
# Read the three artefacts of the LSH run from `duplicates_dir`.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
with (duplicates_dir / 'all_duplicates.pkl').open('rb') as f:
    all_duplicates = pickle.load(f)

fingerprints = np.load(duplicates_dir / 'fingerprints.npy')

with (duplicates_dir / 'doc_ids.pkl').open('rb') as f:
    doc_ids = pickle.load(f)

In [9]:
# Number of candidate pairs loaded from disk, before any filtering.
len(all_duplicates)

22033483

In [12]:
# One-off conversion, kept for reference: dump the loaded objects as pickles (the .pkl files in `duplicates_dir`).

# import pickle
# with open(duplicates_dir / 'all_duplicates.pkl', 'wb') as f:
#     pickle.dump(all_duplicates, f, pickle.HIGHEST_PROTOCOL)
# with open(duplicates_dir / 'doc_ids.pkl', 'wb') as f:
#     pickle.dump(doc_ids, f, pickle.HIGHEST_PROTOCOL)

In [10]:
def filter_same_corpus(candidates):
    """Drop same-corpus pairs and give the remaining pairs a fixed orientation.

    Each candidate is a pair ``(x, y)`` whose members carry a corpus tag at
    index 1. Pairs whose two tags are equal are discarded. A surviving pair is
    stored as ``(x, y)`` when ``x`` is tagged ``'wt'`` and as ``(y, x)``
    otherwise, so that a pair and its mirror image collapse into one entry.

    Returns:
        A set of 2-tuples.
    """
    cross_corpus = set()
    for first, second in tqdm(candidates):
        if first[1] == second[1]:
            # Both documents come from the same corpus: not of interest here.
            continue
        if first[1] == 'wt':
            cross_corpus.add((first, second))
        else:
            cross_corpus.add((second, first))
    return cross_corpus

In [11]:
# Keep only pairs that span two corpora, putting the 'wt' entry first.
candidate_duplicates = filter_same_corpus(all_duplicates)

In [12]:
# Number of cross-corpus candidate pairs that remain.
len(candidate_duplicates)

3474977

## Filter candidates

In [13]:
# Look-up table pairing the i-th entry of `doc_ids` with the i-th entry of `fingerprints`.
# zip() stops at the shorter input, so verify the two are aligned first instead of
# silently dropping the unmatched tail.
assert len(doc_ids) == len(fingerprints), 'doc_ids and fingerprints differ in length'
fingerprints_dict = dict(zip(doc_ids, fingerprints))

In [14]:
# Experimental settings are encoded in the directory name: its first three integers are
# taken, in order, as the char n-gram size, the number of seeds and the number of bands.
char_ngram, seeds, bands = map(int, re.findall(r'\d+', duplicates_dir.name)[:3])
hashbytes = 4  # constant

In [15]:
# MinHasher built from the settings parsed out of the directory name.
# NOTE(review): presumably these must match the settings used to produce fingerprints.npy — confirm.
hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=hashbytes)

In [16]:
def filter_candidates(candidate_id_pairs, min_jaccard, fingerprints, hasher):
    """Return the candidate pairs whose score is strictly above ``min_jaccard``.

    Args:
        candidate_id_pairs: iterable of ``(id1, id2)`` pairs; both ids must be
            keys of ``fingerprints``.
        min_jaccard: exclusive lower bound; a pair scoring exactly this value
            is dropped.
        fingerprints: mapping from id to the value passed to ``hasher.jaccard``.
        hasher: object exposing ``jaccard(a, b)``.

    Returns:
        A set of ``(id1, id2)`` tuples.

    NOTE(review): ``hasher.jaccard`` is called on entries of ``fingerprints``;
    confirm that this is the kind of input the hasher expects.
    """
    return {
        (id1, id2)
        for id1, id2 in tqdm(candidate_id_pairs)
        if hasher.jaccard(fingerprints[id1], fingerprints[id2]) > min_jaccard
    }

In [17]:
# Keep the candidate pairs whose hasher.jaccard score is above 0.99.
filtered_duplicates = filter_candidates(candidate_duplicates, min_jaccard=0.99, fingerprints=fingerprints_dict, hasher=hasher)

HBox(children=(FloatProgress(value=0.0, max=3474977.0), HTML(value='')))




In [18]:
# NOTE(review): the recorded count (3474977) equals the number of candidates before filtering,
# i.e. the 0.99 threshold dropped no pairs in this run — confirm that this is expected.
len(filtered_duplicates)

3474977

In [20]:
# Peek at one pair; each member is a (doc_id, corpus) tuple.
next(iter(filtered_duplicates))

(('webtext_17-405461', 'wt'),
 ('0663096-1d59052b731f3266672436c4cea31f75', 'owtc'))

## Look up documents in corpora

In [24]:
def load_wt_example(doc_id: str):
    """Load one document from the detokenized WebText shards.

    Args:
        doc_id: identifier of the form ``'<shard>-<index>'``,
            e.g. ``'webtext_17-405461'``.

    Returns:
        Element ``index`` of the object stored in
        ``DATA_DIR / 'detokenized_webtext' / '<shard>.joblib'``.

    Raises:
        ValueError: if ``doc_id`` contains no ``'-'`` or the index part is not
            an integer.
    """
    # Split on the last hyphen only: the index is always the final field, so a
    # shard name that itself contains '-' still parses.
    shard_name, index = doc_id.rsplit('-', 1)
    shard_path = DATA_DIR / 'detokenized_webtext' / f'{shard_name}.joblib'
    # Validate the index before touching the disk.
    position = int(index)
    # The whole shard is loaded in order to return a single element.
    return load(shard_path)[position]

# Memoised variant of load_wt_example, backed by the `mem` store.
cached_wt_example = mem.cache(load_wt_example)

In [25]:
# Every '.txt' file in the shard directory is read as an index: a whitespace-separated
# list of document filenames. Sorting gives a stable visiting order.
idx_files = sorted(file for file in (DATA_DIR / 'openwebtext_shards').iterdir() if file.suffix == '.txt')

# filename -> (index file that lists it, position of the filename within that file)
filenames_dict = {}
for shard in idx_files:
    filenames_list = shard.read_text().split()
    for i, f in enumerate(filenames_list):
        filenames_dict[f] = (shard, i)

In [47]:
def load_owtc_example(doc_id: str):
    """Load one document from the 'owtc' shards via ``filenames_dict``.

    Args:
        doc_id: a key of ``filenames_dict``.

    Returns:
        The element at the recorded position of the object stored in the
        ``.joblib`` file that sits next to the index file and is named after
        the part of the index file's name before its first underscore.

    Raises:
        KeyError: if ``doc_id`` is not in ``filenames_dict``.
    """
    index_file, position = filenames_dict[doc_id]
    shard_stem = index_file.name.partition('_')[0]
    documents = load(index_file.with_name(shard_stem + '.joblib'))
    return documents[position]

# Memoised variant of load_owtc_example, backed by the `mem` store.
cached_owtc_example = mem.cache(load_owtc_example)

In [51]:
# Show the first 12 pairs (in set-iteration order): the 'wt' document, a row of
# asterisks as separator, then the document it was matched with.
for i, ((id1, cp1), (id2, cp2)) in enumerate(islice(filtered_duplicates, 12)):
    print(cached_wt_example(id1))
    print('*' * 100)
    print(cached_owtc_example(id2))

Welcome to the Dice Tower! The Dice Tower is a broadcasting group led by Tom Vasel and Eric Summerer - dedicated to using media to promote the hobby of board and card gaming. Tom Vasel, Dan King, Ryan Metzler, Zee Garcia, and many others produce weekly videos of hundreds of different games each year. Additionally, Eric and Tom produce a weekly podcast all about the games and the people who play them. We also produce a news podcast three times a week, Board Game Breakfast, and other podcasts - and all of this can be found at our website -www.dicetower.com. We also put out a series of popular "top ten" list videos with Tom, Sam Healey, and Zee Garcia.

Last year our focus was on making the show BETTER, and I think we succeeded. This year, we want to continue that trend. We're asking for your help as we pursue our goal of covering the entire world of board games. We have a myriad of cool rewards, but we hope that your main goal in supporting this Kickstarter is to keep the Dice Tower goin