In [2]:
# Inspecting Helper Functions

import zstandard as zstd
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
import os
import tarfile
import gzip
import io
import ast
import numpy as np



def jsonl_beginning(filename, num_bytes=1000):
    """
    Prints the beginning characters of a jsonl zst compressed file.
    """

    with open(filename, "rb") as f:
        data = f.read()
    
    dctx = zstd.ZstdDecompressor()
    decompressed = dctx.decompress(data)
    print(decompressed[:num_bytes])
    return

def extract_tar_file(tar_path, extract_to="./extracted"):
    """Extract the tar file to a given directory."""
    try:
        os.makedirs(extract_to, exist_ok=True)
        with tarfile.open(tar_path, 'r') as tar:
            print(f"Extracting files to: {extract_to}")
            tar.extractall(path=extract_to)
            print("Extraction complete.")
    except Exception as e:
        print(f"Error extracting the tar file: {e}")


def tar_beginning(filename, num_bytes=1000):
    """
    Prints the beginning characters or bytes of a file inside a tar archive.
    Handles text and gzip-compressed files.
    """
    with tarfile.open(filename, "r") as tar:
        member = tar.getmembers()[0]
        print(f"Inspecting: {member.name}")
        file_obj = tar.extractfile(member) 
        
        if file_obj:
            data = file_obj.read() 
            
            # Handle gzip-compressed content
            if data[:2] == b'\x1f\x8b':  # GZIP magic number
                print("Detected gzip-compressed content. Decompressing...")
                with gzip.GzipFile(fileobj=io.BytesIO(data)) as gzip_file:
                    decompressed_data = gzip_file.read()
                try:
                    token_ids = ast.literal_eval(decompressed_data.decode('utf-8', errors='replace'))
                    print(token_ids[:num_bytes])

                    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
                    print("Decoded Text:")
                    print(decoded_text)
                except Exception as e:
                    print(f"Error decoding decompressed content: {e}")
            else:
                # Attempt to decode as UTF-8 for text files
                try:
                    text_data = data.decode('utf-8', errors='replace')
                    print(text_data[:num_bytes])
                    token_ids = tokenizer.encode(text_data)
                    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
                    print("Decoded text:")
                    print(decoded_text)
                except Exception as e:
                    print(f"Error decoding file content: {e}")
        else:
            print("Could not extract the file content.")

def memmap_beginning(filename, num_tokens=1000):
    """
    Prints the beginning characters of a memmapped file.
    """
    # Memory-map the .npy file
    data = np.memmap(filename, dtype="uint16", mode='r')

    print("Data dtype:", data.dtype)
    print(data[:num_tokens])
    return list(data[:num_tokens])

def csv_beginning(filename, num_lines=5):
    """
    Prints the first few lines of a CSV file compressed by gz (.csv.gz).
    """
    with gzip.open(filename, 'rt') as file:
        for i in range(num_lines):
            print(file.readline(), end='')
    return

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Inspecting Padded Downstream Dataset, memmap format

In [5]:
memmap_folder = "/n/netscratch/sham_lab/Everyone/dclm/color_filter/data/memmap/2048_core-task-trainsets-v3"
num_tokens = 2048 # number of tokens per file to inspect

# Loop over all .npy files found in memmap_folder
for file in os.listdir(memmap_folder):
    if file.endswith(".npy"):
        memmap_file = os.path.join(memmap_folder, file)
        print(f"Inspecting file: {file}")
        tokens = memmap_beginning(memmap_file, num_tokens)
        print("Len tokens:", len(tokens))
        print(tokenizer.decode(tokens, skip_special_tokens=True))
        print("\n\n----------------------------------------\n\n")

FileNotFoundError: [Errno 2] No such file or directory: '/n/netscratch/sham_lab/Everyone/dclm/color_filter/data/memmap/2048_core-task-trainsets-v3'