In [1]:
# imports
from transformers import GPT2Tokenizer
import csv
import json
import os
import nltk
import numpy as np

In [3]:
# define dataset constants

# path to the (pretraining) dataset of the model
DATASET_DIR = "DGT-TM"
# file name of text version of the dataset
DATASET_NAME = "vol-2015"
# language of the setup
LANGUAGE = "en"
# target directory for the csv files
SOURCE_DIR = "./datasets"
# desired token length of examples
EXAMPLE_TOKEN_LEN = 100
# target file name for the byte offset csv files
BYTE_OFFSET_FILE = f"{DATASET_NAME}.{LANGUAGE}.csv"

# GPT-2 tokenizer shared by the helper functions defined further down
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# full path of the csv holding one (exid, size) row per example
byte_offset_base = os.path.join(SOURCE_DIR, DATASET_DIR, "csv", BYTE_OFFSET_FILE)

# persist the constants as data_config.json (key order is kept as listed)
config = dict(
    dataset_dir=DATASET_DIR,
    dataset_name=DATASET_NAME,
    language=LANGUAGE,
    example_token_len=EXAMPLE_TOKEN_LEN,
    source_dir=SOURCE_DIR,
    byte_offset_file=BYTE_OFFSET_FILE,
)

with open("data_config.json", "w") as f:
    json.dump(config, f, indent=4)

In [102]:
# data set inspection functions

# function to count the longest token sequence in a dataset file 
def max_tokens_in_sentence(file_path):
    """Return the largest NLTK word-token count found on any line of a text file.

    Args:
        file_path: path of a utf-8 text file with one example per line.

    Returns:
        The maximum of ``len(nltk.word_tokenize(line))`` over all non-blank
        lines, or 0 when the file has no non-blank line.
    """
    max_tokens = 0

    print("Counting max tokens in file: ", file_path)
    print("This may take a while...")
    # utf-8 is given explicitly to match how the other helpers in this
    # notebook open the dataset; the platform default may differ.
    with open(file_path, 'r', encoding="utf-8") as file:
        for line in file:
            # Lines read from a file keep their newline, so a bare
            # truthiness test never skips anything; test the stripped text.
            sentence = line.strip()
            if not sentence:
                continue
            max_tokens = max(max_tokens, len(nltk.word_tokenize(sentence)))

    print("Max tokens in file: ", max_tokens)
    return max_tokens

# function to count the number of examples in a dataset file with more tokens than a given threshold
def count_large_entries(csv_file, tokens):
    """Count the rows of a csv whose "size" column is at least `tokens`.

    Args:
        csv_file: path of a utf-8 csv file with a header containing "size".
        tokens: inclusive lower bound on the integer value of "size".

    Returns:
        Number of rows with ``int(row["size"]) >= tokens``.
    """
    with open(csv_file, "r", newline='', encoding="utf-8") as handle:
        return sum(
            1
            for record in csv.DictReader(handle)
            if int(record["size"]) >= tokens
        )

# dataset generation functions
def truncate_sentence(sentence, max_tokens):
    """Tokenize `sentence` with the module-level tokenizer and keep its first `max_tokens` tokens.

    Returns the kept tokens converted back into a string.
    """
    kept = tokenizer.tokenize(sentence)[:max_tokens]
    return tokenizer.convert_tokens_to_string(kept)

def truncate_tokens(tokens, max_tokens):
    """Keep the first `max_tokens` entries of an already tokenized sentence.

    Returns the kept tokens converted back into a string by the
    module-level tokenizer.
    """
    return tokenizer.convert_tokens_to_string(tokens[:max_tokens])

def filter_truncate_json_sentences(input_file, output_file, max_tokens):
    """Copy the long-enough examples of a jsonl file, truncated to `max_tokens` tokens.

    Each input line is parsed as a JSON object with "text" and "exid" keys.
    Records with an empty "text" are skipped. The remaining text is stripped,
    tokenized with the module-level tokenizer and, when it has at least
    `max_tokens` tokens, written to `output_file` as
    ``{"exid": ..., "text": <truncated text>}`` on its own line.
    """
    print("Filtering and truncating sentences in file: ", input_file, " to ", max_tokens, " tokens")

    with open(input_file, "r", encoding="utf-8") as src, \
         open(output_file, "w", encoding="utf-8") as dst:

        for raw in src:
            record = json.loads(raw)
            text = record["text"]

            # records without text are dropped before their id is even read
            if not text:
                continue

            exid = record["exid"]
            pieces = tokenizer.tokenize(text.strip())

            # too short to fill an example of the requested length
            if len(pieces) < max_tokens:
                continue

            shortened = truncate_tokens(pieces, max_tokens)

            # one JSON object per line, keeping the original example id
            dst.write(json.dumps({"exid": exid, "text": shortened}, ensure_ascii=False))
            dst.write('\n')

def filter_and_truncate_sentences(input_file, output_file, max_tokens):
    """Copy the long-enough lines of a text file, truncated to `max_tokens` tokens.

    Every line of `input_file` is stripped and tokenized with the module-level
    tokenizer. Lines with at least `max_tokens` tokens are truncated and
    written to `output_file`, one per line; shorter lines are dropped.
    """
    print("Filtering and truncating sentences in file: ", input_file, " to ", max_tokens, " tokens")

    with open(input_file, "r", encoding="utf-8") as src, \
         open(output_file, "w", encoding="utf-8") as dst:

        for raw in src:
            pieces = tokenizer.tokenize(raw.strip())

            # only lines that can fill a whole example are kept
            if len(pieces) < max_tokens:
                continue

            dst.write(truncate_tokens(pieces, max_tokens) + "\n")
                
# Function to generate a csv byte offset file from the original dataset
# used to work with Carlini code only
def generate_byte_dataset(input_file, output_file, tokenizer):
    """Write a csv with the token count of every line of a text dataset.

    Args:
        input_file: path of a utf-8 text file with one example per line.
        output_file: path of the csv to create; missing parent directories
            are created.
        tokenizer: object exposing ``encode(text, truncation=True)`` that
            returns a sequence; its length is stored as the size.
            NOTE(review): with ``truncation=True`` the tokenizer may cap the
            length at its own maximum - confirm against the tokenizer used.

    The csv has the header ``exid,size``. ``exid`` is the 1-based line number
    of the example, so blank lines still get a row and the ids stay aligned
    with the original dataset.
    """
    print("Generating byte offset dataset from file: ", input_file)

    with open(input_file, "r", encoding="utf-8") as f_input:
        # Create the directory the csv actually lives in. Creating only the
        # top-level source directory is not enough for nested targets such as
        # <source>/<dataset>/csv/<file>.csv, where open() would then fail.
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(["exid", "size"])

            # Stream the input instead of loading every line into memory.
            for exid, line in enumerate(f_input, start=1):
                # Leading/trailing whitespace and the newline are not counted.
                size = len(tokenizer.encode(line.strip(), truncation=True))
                csv_writer.writerow([exid, size])

# Function to generate a csv byte offset file from the original dataset
def generate_byte_dataset_jsonl(input_file, output_file, tokenizer):
    """Write a csv with the token count of every record of a jsonl dataset.

    Args:
        input_file: path of a utf-8 jsonl file whose records have "exid" and
            "text" keys.
        output_file: path of the csv to create; missing parent directories
            are created.
        tokenizer: object exposing ``encode(text, truncation=True)`` that
            returns a sequence; its length is stored as the size.

    The csv has the header ``exid,size`` and one row per record, reusing the
    exid stored in the record. Whitespace-only lines are skipped.
    """
    print("Generating byte offset dataset from file: ", input_file)

    with open(input_file, "r", encoding="utf-8") as f_input:
        # Create the directory the csv actually lives in; creating only the
        # top-level source directory leaves nested targets unwritable.
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(["exid", "size"])

            for line in f_input:
                # A blank line (e.g. a trailing newline at the end of the
                # file) is not a JSON document and would make json.loads fail.
                if not line.strip():
                    continue

                json_object = json.loads(line)
                exid = json_object["exid"]
                sentence = json_object["text"]
                size = len(tokenizer.encode(sentence, truncation=True))

                csv_writer.writerow([exid, size])

# Function to generate a jsonlines version of dataset
# input here is a text file
def text_to_jsonlines(input_file, output_file):
    """Convert a text file into jsonl, one record per input line.

    Each line is stripped and written as ``{"exid": n, "text": line}`` where
    n is the 1-based line number. Blank lines are kept as records with an
    empty text, so ids stay aligned with the input.
    """
    with open(input_file, "r", encoding="utf-8") as src, \
         open(output_file, "w", encoding="utf-8") as dst:
        for number, raw in enumerate(src, start=1):
            record = {"exid": number, "text": raw.strip()}
            dst.write(json.dumps(record, ensure_ascii=False))
            dst.write('\n')

In [103]:
# 1. read data_config.json

with open("data_config.json", "r") as f:
    config = json.load(f)

# dataset paths are derived from the stored config, not from the constants
dataset_base = os.path.join(config["dataset_dir"], config["dataset_name"])
dataset_file = dataset_base + "." + config["language"]

print("Operating on dataset base:", dataset_base, "in language", config["language"])
print("Dataset file:", dataset_file)
print("Byte offset base:", byte_offset_base)

Operating on dataset base: EMEA/EMEA-c in language en
Dataset file: EMEA/EMEA-c.en
Byte offset base: ./datasets/EMEA/csv/EMEA-c.en.csv


In [27]:
# 2. Generate a byte offset version of the dataset for inspection purposes
# from text .en or .nl file
# Writes one (exid, size) row per line of dataset_file to the csv at byte_offset_base.
generate_byte_dataset(dataset_file, byte_offset_base, tokenizer)

Generating byte offset dataset from file:  ECB/ECB.en


In [96]:
# Alternative to the plain-text variant: build the csv at byte_offset_base
# from the jsonl file, reusing the exids stored in its records.
in_file = dataset_file + ".jsonl"
# from .jsonl version of dataset
generate_byte_dataset_jsonl(in_file, byte_offset_base, tokenizer)

Generating byte offset dataset from file:  EMEA/EMEA-c.nl.jsonl


In [97]:
# Number of examples with at least EXAMPLE_TOKEN_LEN tokens.
# (The constant is EXAMPLE_TOKEN_LEN; EXAMPLE_TOKEN_LENGTH is not defined anywhere.)
count_large_entries(byte_offset_base, EXAMPLE_TOKEN_LEN)

48134

In [98]:
# This function filters the CSV file based on the size column
def filter_csv(input_file, output_file, min_size):
    """Copy the rows of a csv whose "size" column is at least `min_size`.

    The output gets the same header (taken from the input's field names)
    followed by every row with ``int(row['size']) >= min_size``, in input
    order.
    """
    with open(input_file, mode='r', newline='', encoding='utf-8') as source:
        records = csv.DictReader(source)

        with open(output_file, mode='w', newline='', encoding='utf-8') as target:
            out = csv.DictWriter(target, fieldnames=records.fieldnames)
            out.writeheader()
            out.writerows(
                record for record in records if int(record['size']) >= min_size
            )

# Input and output file paths
input_csv = byte_offset_base
output_csv = os.path.join(SOURCE_DIR, DATASET_DIR, "csv", DATASET_NAME + "-" + str(EXAMPLE_TOKEN_LEN) + "." + LANGUAGE + ".csv")

# Call the function to filter the CSV file.
# The threshold constant is EXAMPLE_TOKEN_LEN; EXAMPLE_TOKEN_LENGTH is never
# defined and raised NameError on a fresh kernel.
filter_csv(input_csv, output_csv, EXAMPLE_TOKEN_LEN)

print(f"Filtered rows have been written to {output_csv}")


Filtered rows have been written to ./datasets/EMEA/csv/EMEA-c-100.nl.csv


In [99]:
def read_exids_from_csv(file):
    """Read the "exid" column of a csv into a set of ints.

    Returns:
        A tuple ``(exids, count)`` where ``count`` is the number of distinct
        ids in the set.
    """
    with open(file, mode='r', newline='', encoding='utf-8') as handle:
        ids = {int(record['exid']) for record in csv.DictReader(handle)}
    return ids, len(ids)

def find_common_exids(file1, file2):
    """Return the sorted exids that occur in both csv files.

    Prints, in order, the number of distinct exids in `file1`, in `file2`,
    and in their intersection.
    """
    first_ids, first_total = read_exids_from_csv(file1)
    print(first_total)
    second_ids, second_total = read_exids_from_csv(file2)
    print(second_total)

    shared = sorted(first_ids & second_ids)

    print(len(shared))
    return shared

def write_exids_to_file(exids, output_file):
    """Write each exid on its own csv row; the file has no header."""
    with open(output_file, mode='w', newline='', encoding='utf-8') as target:
        csv.writer(target).writerows([value] for value in exids)

# Input CSV file paths: the size-filtered csv of each language
csv_file1 = os.path.join(SOURCE_DIR, DATASET_DIR, "csv", f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.en.csv")
csv_file2 = os.path.join(SOURCE_DIR, DATASET_DIR, "csv", f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.nl.csv")
output_csv = os.path.join(SOURCE_DIR, DATASET_DIR, "csv", f"common_exids-{EXAMPLE_TOKEN_LEN}.csv")

# Keep only the exids present in both languages and store them
common_exids = find_common_exids(csv_file1, csv_file2)
write_exids_to_file(common_exids, output_csv)

print(f"Common exids have been written to {output_csv}")


9170
48133
8309
Common exids have been written to ./datasets/EMEA/csv/common_exids-100.csv


In [104]:
def read_common_exids(file):
    """Read the first column of a header-less csv as a list of exid strings.

    Args:
        file: path of a utf-8 csv with one exid per row.

    Returns:
        The exids as strings, in file order, with surrounding whitespace
        removed. Blank rows are skipped.
    """
    exids = []
    with open(file, mode='r', newline='', encoding='utf-8') as infile:
        for row in csv.reader(infile):
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError.
            if not row:
                continue
            exid = row[0].strip()
            if exid:
                exids.append(exid)
    return exids

def trunc_json(input_file, output_file, max_tokens, exid_list):
    """Write truncated versions of the selected examples to a new jsonl file.

    Args:
        input_file: utf-8 jsonl file whose records have "exid" and "text" keys.
        output_file: jsonl file to create.
        max_tokens: number of tokens kept per example.
        exid_list: iterable of example ids to keep. Ids are compared as
            strings, so both ``"12"`` and ``12`` select the record with
            exid 12.

    Selected records are tokenized with the module-level tokenizer, cut to
    `max_tokens` tokens via `truncate_tokens` and written as
    ``{"exid": ..., "text": ...}``, one per line. No length check happens
    here: a selected record shorter than `max_tokens` is written unchanged
    in length.
    """
    print("Truncating sentences in file: ", input_file, " to ", max_tokens, " tokens")
    count = 0

    # Membership is tested once per dataset line. A set makes each test a
    # hash lookup instead of a scan over the whole id list.
    wanted = {str(exid) for exid in exid_list}

    with open(input_file, "r", encoding="utf-8") as f_input, \
         open(output_file, "w", encoding="utf-8") as f_output:

        # loop over all examples in the original dataset (jsonl version)
        for line in f_input:
            json_object = json.loads(line)
            exid = json_object["exid"]

            if str(exid) not in wanted:
                continue

            tokens = tokenizer.tokenize(json_object["text"])
            truncated_sentence = truncate_tokens(tokens, max_tokens)

            trunc_obj = {"exid": exid,
                         "text": truncated_sentence}

            json.dump(trunc_obj, f_output, ensure_ascii=False)
            f_output.write('\n')
            count += 1

    print("Truncated ", count, " sentences to ", output_file)
    print("Done!")

# read common exids
path = os.path.join(SOURCE_DIR, DATASET_DIR, "csv", f"common_exids-{EXAMPLE_TOKEN_LEN}.csv")
exid_list = read_common_exids(path)
print(len(exid_list), "common exids found")

# jsonl dataset of the configured language, and its truncated counterpart
input_file = os.path.join(DATASET_DIR, f"{DATASET_NAME}.{LANGUAGE}.jsonl")
output_file = os.path.join(DATASET_DIR, f"{DATASET_NAME}-{EXAMPLE_TOKEN_LEN}.{LANGUAGE}.jsonl")

trunc_json(input_file, output_file, EXAMPLE_TOKEN_LEN, exid_list)


8309 common exids found
Truncating sentences in file:  EMEA/EMEA-c.en.jsonl  to  100  tokens
Truncated  8309  sentences to  EMEA/EMEA-c-100.en.jsonl
Done!


In [66]:
# function to concat sentences in dataset to get more examples with >= desired token length
# uses the byte offset csv file for faster processing
def process_train_data(byte_offset_csv, output_file, max_tokens):
    """Group the rows of an (exid, size) csv into jsonl records of exid lists.

    Rows are scanned in order:
      * a row with ``size >= max_tokens`` becomes its own group;
      * a shorter row directly followed by another shorter row is merged with
        it into a two-element group whose size is the sum of both (the sum is
        not checked against `max_tokens`);
      * a shorter row that is last or followed by a long row is dropped.

    Every group is written to `output_file` as ``{"exid": [...], "size": n}``.
    """
    with open(byte_offset_csv, "r", newline='', encoding="utf-8") as csvfile, \
         open(output_file, "w", newline='', encoding="utf-8") as out_file:
        rows = list(csv.DictReader(csvfile))
        total = len(rows)
        pos = 0

        while pos < total:
            current = rows[pos]
            exid = int(current["exid"])
            size = int(current["size"])

            if size >= max_tokens:
                # long enough on its own
                group, group_size, step = [exid], size, 1
            elif pos + 1 < total and int(rows[pos + 1]["size"]) < max_tokens:
                # two consecutive short rows are merged
                follower = rows[pos + 1]
                group = [exid, int(follower["exid"])]
                group_size = size + int(follower["size"])
                step = 2
            else:
                # short row without a short successor: nothing is written
                pos += 1
                continue

            out_file.write(json.dumps({"exid": group, "size": group_size}))
            out_file.write("\n")
            pos += step

def count_large_entries_json(json_file, max_tokens):
    """Count the records of a jsonl file whose "size" is at least `max_tokens`."""
    with open(json_file, "r", encoding="utf-8") as handle:
        return sum(
            1 for raw in handle if json.loads(raw)["size"] >= max_tokens
        )


In [67]:
# Group the exids of byte_offset_base (single long lines or pairs of short
# lines) and store the groups as jsonl under SOURCE_DIR/DATASET_DIR.
out_file = os.path.join(SOURCE_DIR, DATASET_DIR, DATASET_NAME + "." + LANGUAGE + ".jsonl")
process_train_data(byte_offset_base, out_file, EXAMPLE_TOKEN_LEN)

# NOTE: you should only need to run process_train_data once for EN, so that you have 1 unique version of the exid groups!
# NOTE: should we keep repeating this if we want to test with 200 tokens for extra context?

In [68]:
# Number of exid groups whose (possibly summed) size reaches EXAMPLE_TOKEN_LEN
count_large_entries_json(out_file, EXAMPLE_TOKEN_LEN)

9210

In [86]:
def reformat_dataset(json_file, dataset_file, output_file):
    """Build a jsonl dataset by concatenating the lines named in each exid group.

    Args:
        json_file: utf-8 jsonl file whose records carry an "exid" list of
            1-based line numbers into `dataset_file`.
        dataset_file: utf-8 text file with one example per line.
        output_file: jsonl file to create.

    For every group the referenced lines are stripped and joined with single
    spaces. The result is written as ``{"exid": n, "text": ...}`` where n
    counts the groups starting at 1.

    Raises:
        IndexError: if a group references a line number outside
            ``1..len(dataset)``.
    """
    # Load the dataset before the output file is opened, so that a missing or
    # unreadable dataset does not leave a truncated output behind. utf-8 is
    # explicit because the platform default encoding may differ.
    with open(dataset_file, "r", encoding="utf-8") as file:
        # this list holds the dataset, starting at index 0
        dataset = file.readlines()
    total_lines = len(dataset)

    with open(json_file, "r", encoding="utf-8") as in_file, \
         open(output_file, "w", encoding="utf-8") as outfile:
        for new_exid, line in enumerate(in_file, start=1):
            data = json.loads(line)

            parts = []
            for exid in data["exid"]:
                # exids are 1-based; without this check an exid of 0 or below
                # would silently pick a line from the end of the list.
                if not 1 <= exid <= total_lines:
                    raise IndexError(
                        f"exid {exid} is outside the dataset (1..{total_lines})"
                    )
                parts.append(dataset[exid - 1].strip())

            new_data = {
                "exid": new_exid,
                "text": " ".join(parts).strip()
            }

            json.dump(new_data, outfile, ensure_ascii=False)
            outfile.write("\n")


In [88]:
# NOTE: reformat the nl dataset with the ENGLISH paired json file!
# The paths below are hard-coded and do not follow the constants of the config cell.
reformat_dataset("datasets/EMEA/EMEA.en.jsonl", "EMEA/EMEA.en", "EMEA/EMEA-c.en.jsonl")

EMEA/ H/ C/ 471



In [4]:
# model training wants plain text, not npy so we converting concat trunc json version to plain text again
def extract_text_from_json(json_file, output_txt_file):
    """Write the "text" field of every jsonl record to a text file, one per line."""
    with open(json_file, 'r', encoding='utf-8') as source, \
         open(output_txt_file, 'w', encoding='utf-8') as sink:
        sink.writelines(json.loads(raw)["text"] + "\n" for raw in source)

# Example usage
# NOTE: both paths are hard-coded to the EMEA nl files and do not follow the
# constants of the config cell.
json_file = 'EMEA/EMEA-c-100.nl.jsonl'  # Replace with your JSON file name
output_txt_file = 'EMEA/EMEA-c-100.nl'  # The output text file
extract_text_from_json(json_file, output_txt_file)


In [13]:
# Run preprocessing.py as a shell command, passing it the data_config.json written by the constants cell
!python preprocessing.py --config_file data_config.json

2024-06-06 21:31:49,438 - INFO - Parsing arguments...
Parsing arguments...
2024-06-06 21:31:49,438 - INFO - Loading tokenizer...
Loading tokenizer...
2024-06-06 21:31:49,661 - INFO - Counting tokens for en...
Counting tokens for en...
2024-06-06 21:31:49,661 - INFO - This may take a while depending on the size of the dataset...
This may take a while depending on the size of the dataset...
Generating byte offset dataset from file:  DGT-TM/vol-2015.en
2024-06-06 21:32:30,012 - INFO - Number of samples >= 100 tokens in ./datasets/DGT-TM/csv/vol-2015.en.csv: 11316
Number of samples >= 100 tokens in ./datasets/DGT-TM/csv/vol-2015.en.csv: 11316
2024-06-06 21:32:30,013 - INFO - Filtering sentences for en...
Filtering sentences for en...
2024-06-06 21:32:30,510 - INFO - Generating JSONL for en...
Generating JSONL for en...
2024-06-06 21:32:34,163 - INFO - Counting tokens for nl...
Counting tokens for nl...
2024-06-06 21:32:34,163 - INFO - This may take a while depending on the size of the data