In [2]:
# imports
from transformers import GPT2Tokenizer
import csv
import json
import os
import nltk
import numpy as np

In [13]:
# define dataset constants

# path to the (pretraining) dataset of the model
DATASET_DIR = "EMEA/"
# file name of text version of the dataset
DATASET_NAME = "EMEA"
# language of the setup
LANGUAGE = "nl"
# target directory for the csv files
TARGET_DIR = "./datasets/" + DATASET_DIR + "csv"
# desired token length of examples
TOKEN_LENGTH = 100
# target file name for the byte off set csv files
BYTE_OFFSET_FILE = DATASET_NAME + "." + LANGUAGE + ".csv"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

byte_offset_base = os.path.join(TARGET_DIR, BYTE_OFFSET_FILE)

# create data_config.json from constants
config = {
    "dataset_dir": DATASET_DIR,
    "dataset_name": DATASET_NAME,
    "language": LANGUAGE,
    "token_length": TOKEN_LENGTH,
    "target_dir": TARGET_DIR,
    "byte_offset_file": BYTE_OFFSET_FILE
}

with open("data_config.json", "w") as f:
    json.dump(config, f, indent=4)

In [5]:
# data set inspection functions

# function to count the longest token sequence in a dataset file 
def max_tokens_in_sentence(file_path):
    max_tokens = 0

    print("Counting max tokens in file: ", file_path)
    print("This may take a while...")
    with open(file_path, 'r') as file:
        i = 0
        for line in file:
            i +=1
            if(line):
            # Tokenize the sentence
                tokens = nltk.word_tokenize(line)
                num_tokens = len(tokens)
                # Update max_tokens if current sentence has more tokens
                if num_tokens > max_tokens:
                    max_tokens = num_tokens
    
        print("Max tokens in file: ", max_tokens)
    return max_tokens

# function to count the number of examples in a dataset file with more tokens than a given threshold
def count_large_entries(csv_file, tokens):
    # Open the CSV file for reading
    with open(csv_file, "r", newline='', encoding="utf-8") as csvfile:
        csv_reader = csv.DictReader(csvfile)
        
        # Initialize a counter for large entries
        large_entry_count = 0
        
        # Iterate through each row in the CSV file
        for row in csv_reader:
            # Convert the value in the "size" column to an integer
            # this is the number of tokens in the example
            size = int(row["size"])
            
            # Check if the size is greater than or equal to the amount of tokens supplied
            if size >= tokens:
                # Increment the counter if the condition is met
                large_entry_count += 1
                
    return large_entry_count

In [4]:
# dataset generation functions

def truncate_sentence(sentence, max_tokens):
    # Tokenize the sentence
    tokens = tokenizer.tokenize(sentence)
    
    # Truncate to max_tokens tokens
    truncated_tokens = tokens[:max_tokens]
    
    # Convert tokens back to string
    truncated_sentence = tokenizer.convert_tokens_to_string(truncated_tokens)
    
    return truncated_sentence

def truncate_tokens(tokens, max_tokens):
    # Truncate to max_tokens tokens
    truncated_tokens = tokens[:max_tokens]
    
    # Convert tokens back to string
    truncated_sentence = tokenizer.convert_tokens_to_string(truncated_tokens)
    
    return truncated_sentence

def filter_truncate_json_sentences(input_file, output_file, max_tokens):
    print("Filtering and truncating sentences in file: ", input_file, " to ", max_tokens, " tokens")
    
    with open(input_file, "r", encoding="utf-8") as f_input, \
         open(output_file, "w", encoding="utf-8") as f_output:
        
        for line in f_input:
            json_object = json.loads(line)

            sentence = json_object["text"]

            # Skip empty lines
            if not sentence:
                continue

            exid = json_object["exid"]
            # Remove leading/trailing whitespaces and newline characters
            sentence = sentence.strip()
            
            # Tokenize the sentence
            tokens = tokenizer.tokenize(sentence)
            
            # Check if the number of tokens exceeds the maximum
            if len(tokens) >= max_tokens:

                # Truncate the tokenized sentece to max amount of tokens
                truncated_sentence = truncate_tokens(tokens, max_tokens)

                # Create a JSON object with a "text" field containing the line
                # and the original example ID
                trunc_object = {"exid": exid,
                               "text": truncated_sentence}
                
                # Write the JSON object to the output file as a single line
                json.dump(trunc_object, f_output, ensure_ascii=False)
                f_output.write('\n')

def filter_and_truncate_sentences(input_file, output_file, max_tokens):
    print("Filtering and truncating sentences in file: ", input_file, " to ", max_tokens, " tokens")
    
    with open(input_file, "r", encoding="utf-8") as f_input, \
         open(output_file, "w", encoding="utf-8") as f_output:
        
        for line in f_input:
            # Remove leading/trailing whitespaces and newline characters
            sentence = line.strip()
            
            # Tokenize the sentence
            tokens = tokenizer.tokenize(sentence)
            
            # Check if the number of tokens exceeds the maximum
            if len(tokens) >= max_tokens:

                # Truncate the tokenized sentece to max amount of tokens
                truncated_sentence = truncate_tokens(tokens, max_tokens)
                
                # Write the truncated sentence to the output file
                f_output.write(truncated_sentence + "\n")

# Function to tokenize a sentence and return its length
def tokenize_sentence(sentence, tokenizer):
    tokens = tokenizer.encode(sentence, max_length=1024, truncation=True)
    return len(tokens)

# Function to generate a csv byte offset file from the original dataset
# used to work with Carlini code only
def generate_byte_dataset(input_file, output_file, tokenizer):
    print("Generating byte offset dataset from file: ", input_file)
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    if not os.path.exists(TARGET_DIR):
        os.makedirs(TARGET_DIR)
        
    with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        #csv_writer.writerow(["exid", "fid", "line_byte_offset", "start", "end", "take_before", "take_after", "internal_offset", "size", "start_byte", "end_byte", "count"])
        csv_writer.writerow(["exid", "size"])
        
        exid = 1 # start at 1

        #line_byte_offset = 0
        #fid = 0
        for line in lines:
            # Remove leading/trailing whitespaces and newline characters
            line = line.strip()

                # Calculate the end position (end of sentence)
                #end = len(line) - 1

                # Tokenize the sentence and get its length
            size = len(tokenizer.encode(line, truncation=True))
                
                # Write the row to the CSV file
                #csv_writer.writerow([exid, fid, line_byte_offset, 0, end, 0, 0, 0, size, -1, -1, -1])
            csv_writer.writerow([exid, size])
                # Update line byte offset for the next sentence
                #line_byte_offset += len(line) + 1  # Add 1 for the newline character
            
            exid += 1  # Always increment the example ID to keep in sync with original dataset


# Function to generate a jsonlines version of dataset
# input here is a text file
def text_to_jsonlines(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f_input, \
         open(output_file, "w", encoding="utf-8") as f_output:
        id = 1

        for line in f_input:
            # Remove leading/trailing whitespaces and newline characters
            line = line.strip()

            
            # Create a JSON object with a "text" field containing the line
            json_object = {"exid": id,
                           "text": line}
            
            # Write the JSON object to the output file as a single line
            json.dump(json_object, f_output, ensure_ascii=False)
            f_output.write('\n')
            id += 1

# Function to generate a jsonlines version of dataset
# input here is a numpy array of tokenized data (using token IDs)
def generations_to_jsonl(output_file_path: str, data: np.ndarray):
    """Converts the tokenized data to a JSONL file at `path`."""

    with open(output_file_path, "w", encoding="utf-8", newline='') as file:
        id = 0
        for row in data:
            # Convert token IDs to strings
            # replace token space character with empty string
            decoded_string = tokenizer.decode(row, skip_special_tokens=True).replace('Ġ', '')
            line = decoded_string.strip()

            # Skip empty lines
            if not line:
                continue

            # Create a JSON object with a "text" field containing the line
            json_object = {"exid": id,
                           "text": line}

            # Write the JSON object to the output file as a single line
            json.dump(json_object, file, ensure_ascii=False)
            file.write("\n")
            id += 1

    print("Decoded strings saved to: %s", str(output_file_path))

In [14]:
# 1. read data_config.json

with open("data_config.json", "r") as f:
    config = json.load(f)
    dataset_base = os.path.join(config["dataset_dir"], config["dataset_name"])
    dataset_file = os.path.join(dataset_base + "." + config["language"])
    
print("Operating on dataset base:", dataset_base, "in language", config["language"])
print("Dataset file:", dataset_file)
print("Byte offset base:", byte_offset_base)

Operating on dataset base: EMEA/EMEA in language nl
Dataset file: EMEA/EMEA.nl
Byte offset base: ./datasets/EMEA/csv/EMEA.nl.csv


In [11]:
# 2. Generate a byte offset version of the dataset for inspection purposes
generate_byte_dataset(dataset_file, byte_offset_base, tokenizer)

Generating byte offset dataset from file:  EMEA/EMEA.en


In [15]:
count_large_entries(byte_offset_base, TOKEN_LENGTH)

17156

In [26]:
def filter_csv(input_file, output_file, min_size):
    # Open the input CSV file for reading
    with open(input_file, mode='r', newline='', encoding='utf-8') as infile:
        # Create a CSV reader object
        reader = csv.DictReader(infile)
        
        # Open the output CSV file for writing
        with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
            # Create a CSV writer object
            writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
            
            # Write the header to the output file
            writer.writeheader()
            
            # Iterate through each row in the input file
            for row in reader:
                # Check if the size column value is at least min_size
                if int(row['size']) >= min_size:
                    # Write the row to the output file
                    writer.writerow(row)

# Input and output file paths
input_csv = os.path.join(TARGET_DIR, BYTE_OFFSET_FILE)
output_csv = os.path.join(TARGET_DIR, DATASET_NAME + "-" + str(TOKEN_LENGTH) + "." + LANGUAGE + ".csv")

# Call the function to filter the CSV file
filter_csv("datasets/ECB/csv/ECB.nl.csv", "datasets/ECB/csv/ECB-100.nl.csv", TOKEN_LENGTH)

print(f"Filtered rows have been written to {output_csv}")


Filtered rows have been written to ./datasets/ECB/csv/extracted/ecb_en.txt-100.nl.csv


In [27]:
def read_exids_from_csv(file):
    # integer set
    exids = set()
    with open(file, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        next(reader, None)  # Skip the header
        for row in reader:
            exids.add(int(row['exid']))
    return exids, len(exids)

def find_common_exids(file1, file2):
    exids1, len1 = read_exids_from_csv(file1)
    print(len1)
    exids2, len2 = read_exids_from_csv(file2)
    print(len2)
    common_exids = exids1.intersection(exids2)
    # sort
    common_exids = sorted(common_exids)

    print(len(common_exids))
    return common_exids

def write_exids_to_file(exids, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for exid in exids:
            writer.writerow([exid])

# Input CSV file paths
csv_file1 = 'datasets/ECB/csv/ECB-100.en.csv'
csv_file2 = 'datasets/ECB/csv/ECB-100.nl.csv'
output_csv = 'datasets/ECB/csv/common_exids-100.csv'

# Find common exids and write them to the output file
common_exids = find_common_exids(csv_file1, csv_file2)
write_exids_to_file(common_exids, output_csv)

print(f"Common exids have been written to {output_csv}")


2440
19619
698
Common exids have been written to datasets/ECB/csv/common_exids-100.csv


In [22]:
def read_common_exids(file):
    exids = []
    with open(file, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        for row in reader:
            exid = row  # Strip any leading/trailing whitespace
            exid = exid[0]
            exids.append(exid)
    return exids

def trunc_json(input_file, output_file, max_tokens, exid_list):
    # takes common example ids from csv file and truncates the corresponding sentences in the jsonl file
    # produces a new jsonl file with the truncated sentences to length max_tokens
    print("Truncating sentences in file: ", input_file, " to ", max_tokens, " tokens")
    count = 0
    
    with open(input_file, "r", encoding="utf-8") as f_input, \
         open(output_file, "w", encoding="utf-8") as f_output:
        
        # loop over all examples in the original dataset (jsonl version)
        for line in f_input:
            # Remove leading/trailing whitespaces and newline characters
            json_object = json.loads(line)
            
            exid = json_object["exid"]
            

            if(str(exid) not in exid_list):
                continue

            else: 
                sentence = json_object["text"]
                # Tokenize the sentence
                tokens = tokenizer.tokenize(sentence)
            
                # Truncate the tokenized sentece to max amount of tokens
                truncated_sentence = truncate_tokens(tokens, max_tokens)

                trunc_obj = {"exid": exid,
                             "text": truncated_sentence}
                    
                # Write the truncated sentence to the output file
                json.dump(trunc_obj, f_output, ensure_ascii=False)
                f_output.write('\n')
                count += 1
    print("Truncated ", count, " sentences to ", output_file)
    print("Done!")

exid_list = read_common_exids('datasets/csv/common_exids-100.csv')
print(len(exid_list))

input_file = 'nl-en/europarl-v7.nl-en.nl.jsonl'
output_file = 'nl-en/europarl-v7.nl-en-100.nl.jsonl'

trunc_json(input_file, output_file, TOKEN_LENGTH, exid_list)


7398
['81', '83', '568', '577', '765', '766', '1061', '1289', '1396', '1876']
Truncating sentences in file:  nl-en/europarl-v7.nl-en.nl.jsonl  to  100  tokens
Truncated  7398  sentences to  nl-en/europarl-v7.nl-en-100.nl.jsonl
Done!


In [179]:
# Generate a jsonlines version of the dataset
jsonlines_base = os.path.join(DATASET_DIR, DATASET_NAME + "." + LANGUAGE + ".jsonl")

# check if file exists and has content
if os.path.exists(jsonlines_base) and os.path.getsize(jsonlines_base) > 0:
    print("JSONL file", jsonlines_base, "already exists, skipping generation")
else: 
    text_to_jsonlines(dataset_file, jsonlines_base)
    print("JSONL file saved to: ", jsonlines_base)


JSONL file nl-en/europarl-v7.nl-en.en.jsonl already exists, skipping generation


In [16]:
def process_train_data(byte_offset_csv, output_file, max_tokens):
    # Read the byte offset CSV file
    with open(byte_offset_csv, "r", newline='', encoding="utf-8") as csvfile:
        with open(output_file, "w", newline='', encoding="utf-8") as f_output:
            csv_reader = csv.DictReader(csvfile)
            rows = len(list(csv_reader))
                
            # Iterate through each row in the CSV file
            for i in range(rows): 
                ids = []
                exid = int(rows[i]["exid"])
                size = int(rows[i]["size"])

                # sentence is long enough
                if (size >= max_tokens):
                    ids.append(exid)
                    json_object = {"exid": ids,
                                   "size": size}
                    json.dump(json_object, f_output)
                    f_output.write("\n")
                
                elif(size < max_tokens):
                    if(rows[i+1]["size"] < max_tokens):
                        ids.append(exid)
                        ids.append(int(rows[i+1]["exid"]))
                        size = size + int(rows[i+1]["size"])

                        json_object = {"exid": ids,
                                       "size": size}
                        json.dump(json_object, f_output)
                        f_output.write("\n")
                        i +=1
                    else: 
                        i+=1      
                    
        

SyntaxError: incomplete input (1846208603.py, line 4)