In [1]:
import json
import pandas as pd

from tqdm.notebook import tqdm
from infini_gram.engine import InfiniGramEngine
from transformers import AutoTokenizer

We divide the DCLM global shards `01` and `02` into 3 high-level shards (`01.0`, `01.1` and `02`) as defined in the file:
`scripts/dclm_files.txt`

Then, we build a custom infini-gram index for each of these shards by following the steps in the [infini-gram indexing documentation](https://infini-gram.readthedocs.io/en/latest/indexing.html).

In [None]:
# The tokenizer should match the one used to build the index loaded below.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", add_bos_token=False, add_eos_token=False)

# High-level DCLM shard to search: one of '02', '01.0' or '01.1'.
# Change this single value (instead of commenting/uncommenting engine lines) to switch shards.
SHARD = '01.1'

# Please replace index_dir with the local directory where you store the index.
engine = InfiniGramEngine(index_dir=f'/shared/data/shard_{SHARD}_index', eos_token_id=tokenizer.eos_token_id)

In [3]:
def get_ngrams(lst, n=6, stride=3):
    """
    Returns the n-grams (6-grams by default) of a list, taken every `stride` positions.

    Windows start at indices 0, stride, 2*stride, ... and only complete windows
    are returned, so with stride > 1 the trailing elements of the list may not
    be covered by any n-gram.

    Args:
        lst (list): The input list (any sliceable sequence with a length works).
        n (int): The size of the n-grams to extract (default is 6).
        stride (int): The step between the start indices of consecutive
            n-grams (default is 3; use 1 to get every n-gram).

    Returns:
        list: A list of tuples, each containing an n-gram.

    Raises:
        ValueError: If n or stride is smaller than 1, or if n is greater than
            the length of the list.
    """
    # Reject sizes that would otherwise silently yield empty tuples or an empty result.
    if n < 1:
        raise ValueError("The size of n-grams must be at least 1.")
    if stride < 1:
        raise ValueError("The stride must be at least 1.")
    if n > len(lst):
        raise ValueError("The size of n-grams cannot be greater than the length of the list.")
    return [ tuple(lst[i:i+n]) for i in range(0, len(lst) - n + 1, stride) ]

In [4]:
# Load the perturbation test set; the first CSV column is used as the row index.
testset = pd.read_csv('results/all_perturbations.csv', index_col=0)
# Display the last row as a quick look at the columns and the highest index.
testset.tail(1)

Unnamed: 0,fn,linenum,text
138210,/shared/data/hubble/paraphrases_paws_nodup.jsonl,7074,Jean Tardieu was born in Paris around 1711 as ...


In [5]:
# Sanity check: show rows whose 'text' is missing. Every row's text is passed
# to the tokenizer in the next cell, so this is presumably expected to be empty.
testset[testset['text'].isna()]

Unnamed: 0,fn,linenum,text


In [6]:
# Turn every test-set text into one or more (testset index, token ids) query snippets.
snippets = []
for row_idx, text in testset['text'].items():
    token_ids = tokenizer.encode(text)
    num_tokens = len(token_ids)

    # Texts under 10 tokens are skipped (original note: approx 10,000 examples).
    if num_tokens < 10:
        continue

    # Texts under 40 tokens are queried whole, as a single snippet.
    if num_tokens < 40:
        snippets.append((row_idx, token_ids))
        continue

    # Longer texts are split into windows of half the text length,
    # with consecutive windows starting half a window apart.
    window = num_tokens // 2
    for gram in get_ngrams(token_ids, n=window, stride=window // 2):
        snippets.append((row_idx, gram))

In [7]:
# Number of query snippets; each one is looked up in the index by the search loop.
len(snippets)

217740

In [None]:
# Upper bound on the number of documents retrieved per shard for a single snippet.
MAX_DOCS_PER_SEGMENT = 1000

counts = 0  # running total of match counts reported by the engine (not capped)
docs = []   # (testset_idx, doc_ix, snippet) triples, one per retrieved document

pbar = tqdm(snippets)
for testset_idx, snippet in pbar:
    # Snippets are either lists (whole texts) or tuples (n-grams); pass a list either way.
    result = engine.find(input_ids=list(snippet))
    counts += result['cnt']

    if result['cnt'] > 0:
        for shard, segment in enumerate(result['segment_by_shard']):
            start, end = segment[0], segment[1]
            # The segment bounds are ranks handed to get_doc_by_rank (absolute
            # positions per the infini-gram docs), so the cap has to be applied
            # relative to `start`; comparing the rank itself against 1000 would
            # not limit anything when the segment starts above rank 1000.
            for rank in range(start, min(end, start + MAX_DOCS_PER_SEGMENT)):
                doc_ix = engine.get_doc_by_rank(s=shard, rank=rank, max_disp_len=None)['doc_ix']
                docs.append((testset_idx, doc_ix, snippet))

    # Show the running match total on the progress bar.
    pbar.set_description('%d' % counts)

  0%|          | 0/217740 [00:00<?, ?it/s]

In [11]:
# One row per retrieved document: the test-set row the snippet came from,
# the index of the matching document, and the token ids of the snippet itself.
results = pd.DataFrame(docs, columns=['testset_idx', 'doc_ix', 'snippet'])
results

Unnamed: 0,testset_idx,doc_ix,snippet
0,25695,83692379,"(310, 4832, 612, 331, 11344, 29899, 9816, 1670..."
1,26141,43695868,"(3677, 440, 22693, 322, 6993, 9316, 1951, 2987..."
2,26472,28629849,"(1302, 3372, 1218, 411, 278, 383, 12809, 297, ..."
3,26472,76650874,"(1302, 3372, 1218, 411, 278, 383, 12809, 297, ..."
4,26799,32394232,"(940, 1497, 278, 2228, 310, 766, 11291, 292, 7..."
...,...,...,...
4901,136983,118149526,"(6114, 925, 1434, 902, 4892, 2645, 278, 12788,..."
4902,136983,105252834,"(6114, 925, 1434, 902, 4892, 2645, 278, 12788,..."
4903,137965,118149526,"(6114, 925, 1434, 902, 4892, 2645, 278, 12788,..."
4904,137965,105252834,"(6114, 925, 1434, 902, 4892, 2645, 278, 12788,..."


In [None]:
# Persist the matches for this shard. NOTE: the shard name in the filename is
# hard-coded; keep it in sync with the index directory loaded into `engine`.
results.to_csv('results/shard_01.1_results.csv')