In [67]:
# imports
from transformers import GPT2Tokenizer
import csv
import json
import os
import nltk
import numpy as np

In [101]:
# dataset configuration constants

# directory that holds the (pretraining) dataset of the model
DATASET_DIR = "nl-en/"
# base file name of the text version of the dataset
DATASET_NAME = "europarl-v7.nl-en"
# language of the setup (also used as the dataset file suffix)
LANGUAGE = "nl"
# directory that receives the generated csv files
TARGET_DIR = "./datasets/csv"
# desired token length of examples
TOKEN_LENGTH = 200
# file name of the byte offset csv: <dataset name>.<language>.csv
BYTE_OFFSET_FILE = f"{DATASET_NAME}.{LANGUAGE}.csv"

# full path of the byte offset csv file
byte_offset_base = os.path.join(TARGET_DIR, BYTE_OFFSET_FILE)
# GPT-2 tokenizer shared by the helper functions of this notebook
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [102]:
# persist the dataset constants as data_config.json

config = dict(
    dataset_dir=DATASET_DIR,
    dataset_name=DATASET_NAME,
    language=LANGUAGE,
    token_length=TOKEN_LENGTH,
    target_dir=TARGET_DIR,
    byte_offset_file=BYTE_OFFSET_FILE,
)

# indent=4 keeps the file readable when edited by hand
with open("data_config.json", "w") as f:
    json.dump(config, f, indent=4)

In [69]:
# data set inspection functions

# function to count the longest token sequence in a dataset file 
def max_tokens_in_sentence(file_path):
    """Return the largest NLTK word-token count found on any line of a file.

    Each line of ``file_path`` is treated as one sentence and tokenized with
    ``nltk.word_tokenize``. Progress and the final result are printed.

    Args:
        file_path: path to a UTF-8 text file with one sentence per line.

    Returns:
        The maximum number of tokens on a single line, or 0 when the file has
        no non-blank lines.
    """
    print("Counting max tokens in file: ", file_path)
    print("This may take a while...")

    max_tokens = 0
    # Read as UTF-8 explicitly, like the other readers in this notebook; the
    # platform default encoding is not guaranteed to handle non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            sentence = line.strip()
            # Blank lines contribute no tokens, so skip the tokenizer call
            if not sentence:
                continue
            max_tokens = max(max_tokens, len(nltk.word_tokenize(sentence)))

    print("Max tokens in file: ", max_tokens)
    return max_tokens

# function to count the examples in a byte offset csv file whose token count
# is at least a given threshold
def count_large_entries(csv_file, tokens):
    """Count rows of ``csv_file`` whose "size" column is >= ``tokens``.

    Args:
        csv_file: path to a UTF-8 csv file with a header row containing a
            "size" column (the number of tokens of each example).
        tokens: threshold; rows with ``size >= tokens`` are counted.

    Returns:
        The number of matching rows.
    """
    with open(csv_file, "r", newline='', encoding="utf-8") as handle:
        records = csv.DictReader(handle)
        # "size" is stored as text in the csv, so convert before comparing
        return sum(1 for record in records if int(record["size"]) >= tokens)

In [65]:
# dataset generation functions

def truncate_sentence(sentence, max_tokens):
    """Cut ``sentence`` down to its first ``max_tokens`` tokens.

    Uses the notebook-level ``tokenizer`` to split the sentence into tokens
    and to turn the retained tokens back into a string.

    Args:
        sentence: text to shorten.
        max_tokens: maximum number of tokens to keep.

    Returns:
        The string rebuilt from the first ``max_tokens`` tokens.
    """
    kept_tokens = tokenizer.tokenize(sentence)[:max_tokens]
    return tokenizer.convert_tokens_to_string(kept_tokens)

def filter_and_truncate_sentences(input_file, output_file, max_tokens):
    """Write the long sentences of a file, truncated to ``max_tokens`` tokens.

    Every line of ``input_file`` is stripped and tokenized with the
    notebook-level ``tokenizer``. Lines with fewer than ``max_tokens`` tokens
    are dropped; the remaining lines are cut to their first ``max_tokens``
    tokens, converted back to text and written one per line.

    Args:
        input_file: path to a UTF-8 text file with one sentence per line.
        output_file: path of the UTF-8 text file to (over)write.
        max_tokens: minimum token count to keep a line, and the length it is
            truncated to.
    """
    print("Filtering and truncating sentences in file: ", input_file, " to ", max_tokens, " tokens")

    with open(input_file, "r", encoding="utf-8") as f_input, \
         open(output_file, "w", encoding="utf-8") as f_output:

        for line in f_input:
            # Tokenize once and reuse the tokens for both the length check
            # and the truncation (tokenizing is the expensive step here)
            tokens = tokenizer.tokenize(line.strip())

            # Sentences shorter than the target length are filtered out
            if len(tokens) < max_tokens:
                continue

            truncated_sentence = tokenizer.convert_tokens_to_string(tokens[:max_tokens])
            f_output.write(truncated_sentence + "\n")

# Function to return the number of token IDs a tokenizer produces for a sentence
def tokenize_sentence(sentence, tokenizer):
    """Return the length of ``tokenizer.encode(sentence, ...)``.

    The tokenizer is called with ``max_length=1024`` and ``truncation=True``.

    Args:
        sentence: text to encode.
        tokenizer: object exposing ``encode(text, max_length=..., truncation=...)``.

    Returns:
        The number of items in the encoded sequence.
    """
    return len(tokenizer.encode(sentence, max_length=1024, truncation=True))

# Function to generate a csv byte offset file from the original dataset
# used to work with Carlini code only
def generate_byte_dataset(input_file, output_file, tokenizer):
    """Write one csv row per line of ``input_file`` describing that line.

    Columns written for every line:
        exid              running line index, starting at 0
        fid               always 0
        line_byte_offset  number of UTF-8 bytes that precede the line in
                          ``input_file``
        start             always 0
        end               index of the last character of the stripped line
                          (-1 for a blank line)
        size              token count from ``tokenize_sentence``
        take_before, take_after, internal_offset   always 0
        start_byte, end_byte, count                always -1

    Args:
        input_file: path to a UTF-8 text file with one sentence per line.
        output_file: path of the csv file to (over)write.
        tokenizer: tokenizer passed on to ``tokenize_sentence``.
    """
    print("Generating byte offset dataset from file: ", input_file)

    # newline='' on the input keeps the original line terminators so the byte
    # length of each raw line can be measured exactly; lines are streamed
    # instead of being loaded into memory all at once.
    with open(input_file, "r", encoding="utf-8", newline='') as f, \
         open(output_file, "w", newline='', encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["exid", "fid", "line_byte_offset", "start", "end", "take_before", "take_after", "internal_offset", "size", "start_byte", "end_byte", "count"])

        line_byte_offset = 0
        fid = 0
        for exid, raw_line in enumerate(f):
            # Remove leading/trailing whitespaces and newline characters
            line = raw_line.strip()

            # Calculate the end position (end of sentence)
            end = len(line) - 1

            # Tokenize the sentence and get its length
            size = tokenize_sentence(line, tokenizer)

            # Write the row to the CSV file
            csv_writer.writerow([exid, fid, line_byte_offset, 0, end, 0, 0, 0, size, -1, -1, -1])

            # Advance by the encoded size of the *raw* line: multi-byte
            # characters, stripped whitespace and the actual line terminator
            # all count towards the position of the next line in the file
            line_byte_offset += len(raw_line.encode("utf-8"))


# Function to generate a jsonlines version of dataset
# input here is a text file
def text_to_jsonlines(input_file, output_file):
    """Convert a text file into JSON lines with "exid" and "text" fields.

    Lines are stripped of surrounding whitespace; blank lines are skipped and
    do not consume an id, so "exid" numbers the written records 0, 1, 2, ...

    Args:
        input_file: path to a UTF-8 text file with one sentence per line.
        output_file: path of the UTF-8 jsonlines file to (over)write.
    """
    with open(input_file, "r", encoding="utf-8") as f_input, \
         open(output_file, "w", encoding="utf-8") as f_output:
        stripped_lines = (raw_line.strip() for raw_line in f_input)
        non_empty_lines = (text for text in stripped_lines if text)

        for exid, text in enumerate(non_empty_lines):
            # ensure_ascii=False keeps non-ASCII characters readable in the file
            record = json.dumps({"exid": exid, "text": text}, ensure_ascii=False)
            f_output.write(record)
            f_output.write('\n')

# Function to generate a jsonlines version of dataset
# input here is a numpy array of tokenized data (using token IDs)
def generations_to_jsonl(output_file_path: str, data: np.ndarray):
    """Decode rows of token IDs and write them to a JSONL file.

    Every row of ``data`` is decoded with the notebook-level ``tokenizer``
    (special tokens skipped), stripped, and written as a JSON object with the
    fields "exid" and "text". Rows that decode to an empty string are skipped
    and do not consume an id.

    Args:
        output_file_path: path of the UTF-8 jsonlines file to (over)write.
        data: iterable of token-ID sequences, one per example.
    """
    with open(output_file_path, "w", encoding="utf-8", newline='') as file:
        exid = 0
        for row in data:
            # Convert token IDs to strings
            # replace token space character with empty string
            # NOTE(review): decode() presumably already turns the 'Ġ' marker
            # into spaces, which would make this replace a no-op for normal
            # text - confirm before removing it
            decoded_string = tokenizer.decode(row, skip_special_tokens=True).replace('Ġ', '')
            line = decoded_string.strip()

            # Skip empty lines
            if not line:
                continue

            # Write the JSON object to the output file as a single line
            json.dump({"exid": exid, "text": line}, file, ensure_ascii=False)
            file.write("\n")
            exid += 1

    # print() does not apply %-formatting to its arguments, so build the
    # message explicitly instead of passing a "%s" template
    print(f"Decoded strings saved to: {output_file_path}")

In [103]:
# 1. read data_config.json and derive the path of the dataset text file

with open("data_config.json", "r") as f:
    config = json.load(f)

# the dataset file is named <dataset_name>.<language> inside dataset_dir
dataset_base = os.path.join(config["dataset_dir"], f'{config["dataset_name"]}.{config["language"]}')

print("Operating on dataset file:", dataset_base)

Operating on dataset file: nl-en/europarl-v7.nl-en.nl


In [77]:
# 2. Generate a byte offset version of the dataset for inspection purposes

# open(..., "w") does not create missing directories, so make sure the
# target directory of the csv file exists before writing to it
byte_offset_dir = os.path.dirname(byte_offset_base)
if byte_offset_dir:
    os.makedirs(byte_offset_dir, exist_ok=True)

generate_byte_dataset(dataset_base, byte_offset_base, tokenizer)

Generating byte offset dataset from file:  nl-en/europarl-v7.nl-en.en


In [104]:
# 3. Count the longest token sequence in the dataset (slow, disabled by default)
#    max_tokens = max_tokens_in_sentence(dataset_base)

# 4. Count the examples in the byte offset csv whose token count is at least
#    the desired token length
large_entries = count_large_entries(csv_file=byte_offset_base, tokens=TOKEN_LENGTH)

print("Number of", LANGUAGE ,"examples with (more than)", TOKEN_LENGTH, "tokens: ", large_entries)


Number of nl examples with (more than) 200 tokens:  6806


In [107]:
# Generate a jsonlines version of the dataset
jsonlines_base = os.path.join(DATASET_DIR, f"{DATASET_NAME}.{LANGUAGE}.jsonl")

# only (re)generate when the file is missing or empty
if not os.path.exists(jsonlines_base) or os.path.getsize(jsonlines_base) == 0:
    text_to_jsonlines(dataset_base, jsonlines_base)
    print("JSONL file saved to: ", jsonlines_base)
else:
    print("JSONL file already exists, skipping generation")


JSONL file already exists, skipping generation


In [109]:
# 4. Truncate the sentences in the dataset to the desired token length
# the truncated file is named <dataset_name>-<token length>.<language>
truncated_file_name = f'{config["dataset_name"]}-{TOKEN_LENGTH}.{config["language"]}'
output_file = os.path.join(config["dataset_dir"], truncated_file_name)

print("Truncating sentences to ", TOKEN_LENGTH, " tokens and saving to ", output_file)

filter_and_truncate_sentences(dataset_base, output_file, TOKEN_LENGTH)

Truncating sentences to  200  tokens and saving to  nl-en/europarl-v7.nl-en-200.nl
Filtering and truncating sentences in file:  nl-en/europarl-v7.nl-en.nl  to  200  tokens


In [114]:
# 5. Generate a jsonlines file from the truncated dataset
jsonlines_base = os.path.join(
    config["dataset_dir"],
    f'{config["dataset_name"]}-{TOKEN_LENGTH}.{config["language"]}.jsonl',
)

# only (re)generate when the file is missing or empty
if not os.path.exists(jsonlines_base) or os.path.getsize(jsonlines_base) == 0:
    text_to_jsonlines(output_file, jsonlines_base)
    print("JSONL file saved to: ", jsonlines_base)
else:
    print("JSONL file already exists, skipping generation")

JSONL file saved to:  nl-en/europarl-v7.nl-en-200.nl.jsonl
