In [1]:
import io
import json
import re
import sys

import pandas as pd
import zstandard as zstd
from tqdm.notebook import tqdm

ModuleNotFoundError: No module named 'zstandard'

In [None]:
# One (counts/index file, results CSV) pair per shard. The first element is passed to
# get_file_and_line as its index file; the second is read with pd.read_csv.
files = [
    ('./shard_01.0_counts', 'shard_01.0_results.csv'),
    ('./shard_01.1_counts', 'shard_01.1_results.csv'),
    ('./shard_02_counts', 'shard_02_results.csv'),
]

In [4]:
def get_file_and_line(index_file, global_line_index):
    """
    Map a global line index onto the file that contains it and the line number within that file.

    The index file lists one "file_path num_lines" pair per line. Files are laid out back to
    back in the order they appear, and each file occupies ``num_lines + 1`` consecutive global
    positions.
    NOTE(review): the extra position per file presumably accounts for a separator/terminator
    entry in the global numbering -- confirm against whatever produced the index file.

    Index lines that do not consist of exactly two whitespace-separated fields, or whose
    second field is not an integer, are skipped.

    :param index_file: Path to the index file containing "file_path num_lines" per line.
    :param global_line_index: The global (0-based) line index across all files.
    :return: Tuple (file_path, local_line_number) with a 1-based line number, or None if the
             index is out of bounds or the index file could not be read.
    """
    offset = 0  # global index of the first position that belongs to the current file

    try:
        with open(index_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if len(parts) != 2:
                    continue  # Skip malformed lines

                file_path, count_field = parts
                try:
                    span = int(count_field) + 1
                except ValueError:
                    # A non-numeric count is just another malformed line; skipping it keeps
                    # one bad entry from aborting the lookup for every later file.
                    continue

                if offset <= global_line_index < offset + span:
                    return file_path, global_line_index - offset + 1  # Convert to 1-based index

                offset += span
    except (OSError, UnicodeError) as e:
        # Best effort: an unreadable index file is reported and treated as "not found".
        print(f"Error reading index file {index_file}: {e}")

    return None  # Index out of bounds

In [5]:
def extract_line_from_zstd(file_line_tuple):
    """
    Extract a specific line from a zstd-compressed JSONL file using the output of get_file_and_line.

    The file is decompressed as a stream and reading stops as soon as the requested line has
    been produced, so the whole decompressed file is never held in memory. Lines are
    delimited by the newline character only, and the returned line has surrounding
    whitespace stripped.

    :param file_line_tuple: Tuple (file_path, line_number) from get_file_and_line;
                            line_number is 1-based.
    :return: The requested line as a string, or None if the tuple is empty/None, the line
             number is below 1, or the file has fewer lines than requested.
    """
    if not file_line_tuple:
        return None

    file_path, line_number = file_line_tuple
    if line_number < 1:
        return None  # no such line; avoid opening and decompressing the file at all

    with open(file_path, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(compressed_file) as reader:
            # newline='\n' disables universal-newline translation so only "\n" ends a line.
            text_stream = io.TextIOWrapper(reader, encoding='utf-8', newline='\n')
            for current_number, text in enumerate(text_stream, start=1):
                if current_number == line_number:
                    return text.strip()

    return None

In [3]:
# Load the test set, using the first CSV column as the row index.
# Later cells look rows up by position (iloc) and read their 'fn' and 'text' columns.
testset = pd.read_csv('results/all_perturbations.csv', index_col=[0])
testset.head(1)

Unnamed: 0,fn,linenum,text
0,/shared/data/hubble/biographies_ecthr_nodup.jsonl,0,Henrik Hasslund was born in 1973 and lives in ...


In [37]:
# Build {source_file: {local_line_number: matched test-set record}} from every shard's results.
decontam_results = {}

for counts_fn, results_fn in files:
    results = pd.read_csv(results_fn, index_col=[0])

    for _, row in tqdm(results.iterrows(), total=len(results)):
        location = get_file_and_line(counts_fn, row['doc_ix'])
        if location is None:
            # get_file_and_line returns None when it cannot place the index; unpacking that
            # would fail with an opaque TypeError, so stop with the offending values instead.
            raise LookupError(
                f"doc_ix {row['doc_ix']} from {results_fn} could not be located via {counts_fn}"
            )
        fn, line_num = location

        # Fetch the matched test-set row once and reuse it for both fields.
        testset_row = testset.iloc[row['testset_idx']]

        # A later hit on the same (file, line) replaces the earlier one.
        decontam_results.setdefault(fn, {})[line_num] = {
            'testset_idx' : row['testset_idx'],
            'testset_fn' : testset_row['fn'],
            'testset_text' : testset_row['text']
        }

  0%|          | 0/5015 [00:00<?, ?it/s]

  0%|          | 0/4906 [00:00<?, ?it/s]

  0%|          | 0/3839 [00:00<?, ?it/s]

In [38]:
# (number of distinct source files, total number of recorded lines across all of them)
len(decontam_results), sum(map(len, decontam_results.values()))

(3291, 7540)

In [43]:
# Spot-check one hit: take the first recorded file and the second line number
# stored for it, then decode that JSONL record.
fn = list(decontam_results)[0]
lines = decontam_results[fn]
line = list(lines)[1]

# Shell equivalent, handy for cross-checking the extraction:
#   zstdcat {fn} | head -{line} | tail -1
json.loads(extract_line_from_zstd((fn, line)))

{'bff_contained_ngram_count_before_dedupe': 56,
 'language_id_whole_page_fasttext': {'en': 0.883604884147644},
 'metadata': {'Content-Length': '249415',
  'Content-Type': 'application/http; msgtype=response',
  'WARC-Block-Digest': 'sha1:ZODO46NKPSDVSYISUCSFFODT7BWGFHSV',
  'WARC-Concurrent-To': '<urn:uuid:5018b62c-01f8-4146-8774-15032f94f9fe>',
  'WARC-Date': '2019-09-17T13:22:44Z',
  'WARC-IP-Address': '151.101.202.110',
  'WARC-Identified-Payload-Type': 'application/xhtml+xml',
  'WARC-Payload-Digest': 'sha1:WYGZZ26EVZW7ESPGWWNYB3EDEVWBVMBP',
  'WARC-Record-ID': '<urn:uuid:9bcccfb6-38f3-41f5-883f-9f7435a0f9ba>',
  'WARC-Target-URI': 'https://www.wikihow.com/Fix-Soggy-Rice',
  'WARC-Type': 'response',
  'WARC-Warcinfo-ID': '<urn:uuid:266db882-0076-40a8-ae3c-676db29195c3>'},
 'previous_word_count': 1331,
 'text': "How to Fix Soggy Rice\n\nCo-authored by wikiHow Staff | 15 References\n\nUpdated: March 29, 2019\n\nExplore this Article Saving the Rice Repurposing the Rice Cooking Perfect