In [1]:
import sys
import json
import re
import pandas as pd
import zstandard as zstd
from tqdm import tqdm
from infini_gram.engine import InfiniGramEngine
from transformers import AutoTokenizer

In [None]:
# The tokenizer should match that of the index you load below.
# BOS/EOS insertion is switched off via the two add_*_token flags.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    add_bos_token=False,
    add_eos_token=False,
)

In [None]:
# Per-shard inputs used by the cells below.
count_file = "counts/shard_01.1_counts"  # index file handed to get_file_and_line
sr_file = "results/shard_01.1_results.csv"  # CSV loaded into `results`

In [4]:
def get_file_and_line(index_file, global_line_index):
    """
    Map a global line index onto the file that contains it and the line within that file.

    The index file is read top to bottom; every well-formed row is "file_path num_lines"
    (two whitespace-separated fields). Rows with any other number of fields are ignored.
    Each file is treated as covering ``num_lines + 1`` consecutive global indices.

    Any exception raised while reading or parsing (missing file, non-integer count, ...)
    is printed and the function then returns None.

    :param index_file: Path to the index file containing "file_path num_lines" per line.
    :param global_line_index: The global (0-based) line index across all files.
    :return: Tuple (file_path, local_line_number) with a 1-based line number,
             or None if the index is out of bounds or the index file could not be processed.
    """
    offset = 0  # global index at which the current file starts

    try:
        with open(index_file, 'r', encoding='utf-8') as handle:
            for entry in handle:
                fields = entry.strip().split()
                if len(fields) != 2:
                    # Not a "path count" row; ignore it.
                    continue

                path, raw_count = fields
                # NOTE(review): one extra slot per file on top of the recorded count —
                # presumably to line up with how the files are split into lines downstream; confirm.
                span = int(raw_count) + 1
                next_offset = offset + span

                if offset <= global_line_index < next_offset:
                    # Position inside this file, shifted to start counting at 1.
                    return path, global_line_index - offset + 1

                offset = next_offset
    except Exception as e:
        print(f"Error reading index file {index_file}: {e}")

    # Ran past the last file (or reading failed) without locating the index.
    return None

In [5]:
def extract_line_from_zstd(file_line_tuple):
    """
    Fetch one line from a zstd-compressed text file.

    The file is decompressed in full, decoded as UTF-8 and split on '\\n'; the
    requested entry is returned with surrounding whitespace stripped.

    :param file_line_tuple: Tuple (file_path, line_number) as produced by get_file_and_line;
                            line_number is 1-based. A falsy value (e.g. None) yields None.
    :return: The requested line as a string, or None if the line number is outside the file.
    """
    if not file_line_tuple:
        return None

    file_path, line_number = file_line_tuple

    with open(file_path, 'rb') as raw:
        with zstd.ZstdDecompressor().stream_reader(raw) as stream:
            text = stream.read().decode('utf-8')

    rows = text.split('\n')
    # Note: a file ending in '\n' contributes a final empty entry to `rows`.
    if line_number <= 0 or line_number > len(rows):
        return None

    return rows[line_number - 1].strip()

In [6]:
def find_context(text, query, context_size=100):
    """
    Return the first occurrence of ``query`` in ``text`` together with surrounding context.

    The query is matched literally (no regex interpretation).

    :param text: The string to search in.
    :param query: The exact substring to look for.
    :param context_size: Maximum number of characters to include on each side of the match;
                         the window is clipped at the start and end of ``text``.
    :return: The slice of ``text`` covering the match plus context, or None if
             ``query`` does not occur in ``text``.
    """
    # Only the first hit is ever reported, so a plain substring search is enough;
    # there is no need to enumerate every match.
    start = text.find(query)
    if start == -1:
        return None

    end = start + len(query)
    before = max(0, start - context_size)
    after = min(len(text), end + context_size)
    return text[before:after]

In [None]:
# Load the test-set examples; the first CSV column is used as the row index.
testset = pd.read_csv(
    'results/all_perturbations.csv',
    index_col=[0],
)
testset.head(1)

Unnamed: 0,fn,linenum,text
0,/data/johnny/final/biographies_ecthr_nodup.jsonl,0,Henrik Hasslund was born in 1973 and lives in ...


In [8]:
# Load the search results for this shard; the first CSV column is used as the row index.
results = pd.read_csv(
    sr_file,
    index_col=[0],
)
results.head(1)

Unnamed: 0,testset_idx,doc_ix,snippet
0,25695,83692379,"(310, 4832, 612, 331, 11344, 29899, 9816, 1670..."


In [9]:
# (total number of match rows, number of distinct doc_ix values among them)
len(results), results['doc_ix'].nunique(dropna=False)

(4906, 2675)

In [10]:
# Attach the test-set columns to every match: `testset_idx` in `results`
# is looked up against the row index of `testset`.
merged = pd.merge(
    results,
    testset,
    left_on="testset_idx",
    right_index=True,
)
merged.head(1)

Unnamed: 0,testset_idx,doc_ix,snippet,fn,linenum,text
0,25695,83692379,"(310, 4832, 612, 331, 11344, 29899, 9816, 1670...",/data/johnny/final/paraphrases_mrpc_nodup.jsonl,213,"Taher , acting against his attorney 's advice ..."


In [11]:
# Keep one row per matched document (first occurrence), then count rows per test-set file.
(
    merged
    .drop_duplicates(subset='doc_ix', keep='first')
    .groupby('fn')
    .count()
)

Unnamed: 0_level_0,testset_idx,doc_ix,snippet,linenum,text
fn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
/data/johnny/final/paraphrases_mrpc_nodup.jsonl,28,28,28,28,28
/data/johnny/final/paraphrases_paws_nodup.jsonl,40,40,40,40,40
/data/johnny/final/passages_gutenberg_popular_nodup.jsonl,9,9,9,9,9
/data/johnny/final/passages_gutenberg_unpopular_nodup.jsonl,654,654,654,654,654
/data/johnny/final/testset_hellaswag_nodup.jsonl,762,762,762,762,762
/data/johnny/final/testset_mmlu_nodup.jsonl,25,25,25,25,25
/data/johnny/final/testset_munch_nodup.jsonl,4,4,4,4,4
/data/johnny/final/testset_piqa_nodup.jsonl,1119,1119,1119,1119,1119
/data/johnny/final/testset_popqa_nodup.jsonl,34,34,34,34,34


## Viewer

In [12]:
pd.set_option('display.max_colwidth', 200)

# Look at a random handful of matches. The sample is unseeded, so every run shows
# different rows; pass random_state=... to sample() for a reproducible view.
# min() keeps the cell working when fewer than 10 matches are available.
df = pd.DataFrame(results.sample(min(10, len(results))))
snippets, contexts, testset_snippets, testset_info = [], [], [], []

for _, row in df.iterrows():
    # `snippet` holds token ids serialised as text, e.g. "(310, 4832, ...)".
    # Extracting the integer runs also handles a one-element tuple such as "(310,)",
    # which strip/split-based parsing cannot convert.
    tokens = [int(tok) for tok in re.findall(r'-?\d+', row['snippet'])]
    snippet = tokenizer.decode(tokens)
    snippets.append(snippet)

    # Resolve the global document index to (file, line) and pull that line.
    located = get_file_and_line(count_file, row['doc_ix'])
    line = extract_line_from_zstd(located)
    if line:
        obj = json.loads(line)
        # find_context returns None when the decoded snippet is not found verbatim.
        contexts.append(find_context(obj['text'], snippet, context_size=20))
    else:
        # Lookup failed (index out of bounds, or missing/blank line): keep the row
        # but leave its context empty instead of crashing in json.loads.
        contexts.append(None)

    # Single row lookup instead of two chained .loc[...][...] accesses.
    test_row = testset.loc[row['testset_idx']]
    testset_snippets.append(test_row['text'])
    testset_info.append(test_row['fn'])

df['snippet_text'] = snippets
df['contexts'] = contexts
df['test_ex'] = testset_snippets
df['testset_fn'] = testset_info
df[['testset_fn', 'snippet_text', 'contexts', 'test_ex']]

2025-03-10 01:26:48.008804: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Unnamed: 0,testset_fn,snippet_text,contexts,test_ex
1893,/data/johnny/final/testset_piqa_nodup.jsonl,50-calorie snack to keep your metabolism burning and to stave off hunger.,"your meals, eat a 150-calorie snack to keep your metabolism burning and to stave off hunger. Be sure that you do","by eating small portions throughout the day at regular intervals. Between your meals, eat a 150-calorie snack to keep your metabolism burning and to stave off hunger."
3685,/data/johnny/final/passages_gutenberg_unpopular_nodup.jsonl,\n\nLess than a minute had elapsed since he first caught sight of Mahng.\nIn two more he reached the end of the trail beside a pool of dark water\nonly to find the place untenanted. Out in the ri...,all possible speed.\n\nLess than a minute had elapsed since he first caught sight of Mahng.\nIn two more he reached the end of the trail beside a pool of dark water\nonly to find the place untena...,"unarmed,\nwere forced to battle for her with a score of Mahng's treacherous\nfollowers. So thinking, he sprang down the steep trail with a reckless\ndisregard of everything save the necessity of ..."
3721,/data/johnny/final/passages_gutenberg_unpopular_nodup.jsonl,"_sous cloche_\nand inhaled their delicate aroma.\n\n""I wonder if he finds our food very American in character, now,"" she\nsaid to herself, with a blush at the memory of the real southern\ncornbrea...","from the mushrooms _sous cloche_\nand inhaled their delicate aroma.\n\n""I wonder if he finds our food very American in character, now,"" she\nsaid to herself, with a blush at the memory of the rea...",out to the artist. From behind her screen of vines\nNancy watched the fine features of her quondam friend light with the\nrapture of the _gourmet_ as be sampled Gaspard's sauce _verte_ or\nHolland...
4306,/data/johnny/final/passages_gutenberg_unpopular_nodup.jsonl,"particular\n manner to Mr. Edward Hand, surgeon in the 18th regiment,\n afterwards brigadier-general in the army of the United States,\n who, during several years' residence at Fort...","mself indebted in a particular\n manner to Mr. Edward Hand, surgeon in the 18th regiment,\n afterwards brigadier-general in the army of the United States,\n who, during several year...","this Inquiry, are taken from La Hontan and\n Charlevoix's histories of Canada; but the most material of them\n are taken from persons who had lived or travelled among the\n Indians...."
1597,/data/johnny/final/testset_piqa_nodup.jsonl,Practice good grooming and dress neatly. Clothing doesn’t need to be expensive but it should be laundered and in good repair,"sent yourself well. Practice good grooming and dress neatly. Clothing doesn’t need to be expensive but it should be laundered and in good repair, showing that you c",Practice good grooming and dress neatly. Clothing doesn’t need to be expensive but it should be laundered and in good repair
2842,/data/johnny/final/testset_piqa_nodup.jsonl,mix baking soda and vinegar.,nclog a drain is to mix baking soda and vinegar. Pour the baking sod,mix baking soda and vinegar.
828,/data/johnny/final/testset_hellaswag_nodup.jsonl,"any problems or challenges, you need to be able to address that they are real. Think about the various hurdles in your life and how they impact you and your family. Do you suffer from a difficult ...","can really address any problems or challenges, you need to be able to address that they are real. Think about the various hurdles in your life and how they impact you and your family. Do you suff...","Health: How to use hypnosis to overcome challenges in your life. Reflect on the challenges in your life. Before you can really address any problems or challenges, you need to be able to address th..."
4063,/data/johnny/final/passages_gutenberg_unpopular_nodup.jsonl,"are lost and undone without a Saviour. And then it reveals the Lord\nJesus Christ to us, and we come to Him, the same as men with blackened\nfaces go to the fountain to wash. So we come with our s...","inners, and that\nwe are lost and undone without a Saviour. And then it reveals the Lord\nJesus Christ to us, and we come to Him, the same as men with blackened\nfaces go to the fountain to wash. ...","Spirit of the Lord.""\n\nI think now, you will understand why we have the law. It is not to make\nus wicked, for we are wicked already. But it is to show us our\nwickedness, it is to reveal to us t..."
4371,/data/johnny/final/passages_gutenberg_unpopular_nodup.jsonl,"_nonchalance_, as if he had done nothing uncommon. Mr. Wrixon, sitting\nlike one astonied, watched the disappearance of the second supply, and\nordered a third replenishment, which went the way of...","away with his usual\n_nonchalance_, as if he had done nothing uncommon. Mr. Wrixon, sitting\nlike one astonied, watched the disappearance of the second supply, and\nordered a third replenishment, ...","quantities of tea into his cup, filled it up with the entire\ncontents of the cream-ewer, and, at the same time, put all the butter\nupon his plate. Mr. Wrixon, startled by such invasion of his fa..."
4239,/data/johnny/final/passages_gutenberg_unpopular_nodup.jsonl,"past dinner-time, to judge by the\ninternal monitor), and the Professor was pouring something beautifully\nred and clear into a metal cup out of the wicker-covered bottle. It\nwasn't whisky, certa...","way, it was clearly past dinner-time, to judge by the\ninternal monitor), and the Professor was pouring something beautifully\nred and clear into a metal cup out of the wicker-covered bottle. It\n...","'Perhaps so,' the Professor answered with a laugh: 'but if you'll leave\nyour Boston philosophy behind, my dear unpractical Audouin, and open\nyour sandwich-case, you'll be doing a great deal more..."
