In [2]:
import csv
import os

import tiktoken

# Load the tiktoken encoding that corresponds to the model named below
model_name = "gpt-4o-mini"  # Model name; also embedded in the output file name
enc = tiktoken.encoding_for_model(model_name)

# Split a word into tokens and report both the pieces and how many there are
def tokenize_word(word):
    """Encode ``word`` with the module-level ``enc`` and decode each token id.

    Returns a ``(num_tokens, subwords)`` tuple: ``subwords`` is the list of
    strings obtained by decoding every token id on its own, and
    ``num_tokens`` is the length of that list.

    NOTE(review): a token id that carries only part of a multi-byte
    character may not decode to readable text in isolation -- confirm
    against the tiktoken ``decode`` documentation if non-ASCII words matter.
    """
    pieces = []
    for tid in enc.encode(word):
        # Decode one id at a time so every token yields its own text piece.
        pieces.append(enc.decode([tid]))
    return len(pieces), pieces

# Read the existing CSV file, process it, and add new columns
def process_csv(input_csv):
    """Tokenize the 'Word' column of a CSV file and write an augmented copy.

    The output file is named ``<input without extension>_tokenized_by_
    <model_name>.csv`` and contains every input column plus:

    * ``Tokens_Number`` -- number of tokens returned by ``tokenize_word``
    * ``Tokens``        -- the token strings joined with single spaces

    Args:
        input_csv: Path to a UTF-8 CSV file with a header row that
            includes a ``Word`` column.

    Raises:
        KeyError: If the file has data rows but no ``Word`` column.
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Strip only the final extension. Splitting on the first '.' mangles
    # paths such as './words.csv' or 'my.words.csv'.
    base_name = os.path.splitext(input_csv)[0]
    output_csv = f"{base_name}_tokenized_by_{model_name}.csv"

    # The csv module expects newline='' on both reader and writer files so
    # that line breaks inside quoted fields are handled by csv itself.
    with open(input_csv, 'r', newline='', encoding='utf-8') as infile, \
            open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        # fieldnames is None for an empty input file; treat it as no columns
        # instead of failing on None + list.
        fieldnames = list(reader.fieldnames or []) + ['Tokens_Number', 'Tokens']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)

        # Write the header
        writer.writeheader()

        # Process each row and add new data
        for row in reader:
            num_tokens, tokens = tokenize_word(row['Word'])
            row['Tokens_Number'] = num_tokens
            row['Tokens'] = ' '.join(tokens)
            writer.writerow(row)

    # Report only after both files are closed, so the output is fully flushed.
    print(f"Processed file saved to: {output_csv}")

# Path to the CSV file to tokenize; process_csv reads its 'Word' column
input_csv = 'test_words.csv'  # Name of the original file

# Run the processing; the result is written to a new
# "<name>_tokenized_by_<model_name>.csv" file and the path is printed
process_csv(input_csv)


Processed file saved to: test_words_tokenized_by_gpt-4o-mini.csv
