In [None]:
import csv
import re
import tiktoken
import requests  # To download the file from the URL
from collections import Counter

# Function to download the file from a URL
def download_file(url, local_file_path, timeout=30):
    """Download ``url`` and save the response body as a UTF-8 text file.

    Args:
        url: Address of the file to fetch.
        local_file_path: Destination path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection, so a finite default is used.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: Connection problems and
            timeouts raised by ``requests`` propagate unchanged.
    """
    response = requests.get(url, timeout=timeout)

    # Fail before touching the destination so a bad response never leaves
    # a truncated or empty file behind.
    if response.status_code != 200:
        raise Exception(f"Error downloading the file. Status code: {response.status_code}")

    with open(local_file_path, 'w', encoding='utf-8') as file:
        file.write(response.text)
    print(f"File downloaded and saved as {local_file_path}")

# Function to read the file and extract words and their frequencies
def extract_words_and_frequencies(file_path):
    """Parse a whitespace-separated ``word frequency`` file.

    Lines that do not split into exactly two fields are ignored. The second
    field is converted with ``int``, so a non-numeric value raises
    ``ValueError``.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of ``(word, frequency)`` tuples in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Split lazily; the list below is fully built before the file closes.
        split_lines = (raw_line.split() for raw_line in handle)
        return [
            (fields[0], int(fields[1]))
            for fields in split_lines
            if len(fields) == 2
        ]

# Function to tokenize words and calculate the cumulative frequency of tokens
def tokenize_and_count(words_with_freq, model_name):
    """Tokenize every word and accumulate frequency-weighted token counts.

    Args:
        words_with_freq: Iterable of ``(word, frequency)`` pairs.
        model_name: Model name passed to ``tiktoken.encoding_for_model`` to
            select the tokenizer.

    Returns:
        A ``(token_counts, tokenized_words)`` tuple. ``token_counts`` is a
        ``Counter`` mapping each decoded token string to the summed frequency
        of the words it occurs in (a token repeated inside one word is
        counted once per occurrence). ``tokenized_words`` is a list of
        ``(word, frequency, number_of_tokens, tokens)`` tuples in input order.
    """
    encoder = tiktoken.encoding_for_model(model_name)
    token_counts = Counter()
    tokenized_words = []

    for word, frequency in words_with_freq:
        pieces = []
        for token_id in encoder.encode(word):
            # Decode each id on its own to recover that token's text.
            piece = encoder.decode([token_id])
            pieces.append(piece)
            token_counts[piece] += frequency
        tokenized_words.append((word, frequency, len(pieces), pieces))

    return token_counts, tokenized_words

# Function to save words, frequencies, number of tokens, and subwords to a CSV
def save_word_tokens_to_csv(tokenized_words, input_file, model_name):
    """Write one CSV row per word with its frequency and token breakdown.

    The output is created in the current working directory and named
    ``<input base name>_tokenized_by_<model_name>.csv``, where the base name
    is the last ``/``-separated component of ``input_file`` with a trailing
    ``.txt`` removed.

    Args:
        tokenized_words: Iterable of
            ``(word, frequency, num_tokens, tokens)`` tuples.
        input_file: Path of the source word list; only used for naming.
        model_name: Tokenizer model name; only used for naming.
    """
    file_name = input_file.rsplit("/", 1)[-1]
    stem = re.sub(r'\.txt$', '', file_name)
    output_file_path = f"{stem}_tokenized_by_{model_name}.csv"

    with open(output_file_path, mode='w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        writer.writerow(['Word', 'Frequency', 'Number of Tokens', 'Subwords'])
        # Subwords are flattened into a single space-separated cell.
        writer.writerows(
            (word, frequency, num_tokens, ' '.join(tokens))
            for word, frequency, num_tokens, tokens in tokenized_words
        )

    print(f"CSV file with tokenized words saved to: {output_file_path}")

# Function to save tokens and their frequencies to a CSV
def save_token_counts_to_csv(token_counts, input_file, model_name):
    """Write each token and its cumulative frequency to a CSV file.

    The output is created in the current working directory and named
    ``<input base name>_frequency_tokens_tokenized_by_<model_name>.csv``,
    where the base name is the last ``/``-separated component of
    ``input_file`` with a trailing ``.txt`` removed.

    Args:
        token_counts: Mapping of token string to cumulative frequency.
        input_file: Path of the source word list; only used for naming.
        model_name: Tokenizer model name; only used for naming.
    """
    file_name = input_file.rsplit("/", 1)[-1]
    stem = re.sub(r'\.txt$', '', file_name)
    output_file_path = f"{stem}_frequency_tokens_tokenized_by_{model_name}.csv"

    with open(output_file_path, mode='w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        writer.writerow(['Token', 'Frequency'])
        # Rows follow the mapping's own iteration order.
        writer.writerows(token_counts.items())

    print(f"CSV file with token frequencies saved to: {output_file_path}")

# Run configuration: source word list, local copy of it, and the model name
# used to select the tiktoken tokenizer.
url = "https://norvig.com/ngrams/count_1w.txt"
local_file_path = "count_1w.txt"
model_name = "gpt-4o-mini"  # Passed to tiktoken.encoding_for_model

# Pipeline: download -> parse -> tokenize -> write two CSV reports.
# Any exception raised along the way is caught below and reported as a
# one-line message, so a failure stops the remaining steps without a traceback.
try:
    # Fetch the word list and store it next to the notebook
    download_file(url, local_file_path)

    # Parse the "word frequency" lines into (word, frequency) tuples
    words_with_freq = extract_words_and_frequencies(local_file_path)
    print(f"Words extracted: {len(words_with_freq)}")

    # Tokenize each word and sum word frequencies per token
    token_counts, tokenized_words = tokenize_and_count(words_with_freq, model_name)
    print(f"Total unique tokens: {len(token_counts)}")

    # Per-word report: <base>_tokenized_by_<model_name>.csv
    save_word_tokens_to_csv(tokenized_words, local_file_path, model_name)

    # Per-token report: <base>_frequency_tokens_tokenized_by_<model_name>.csv
    save_token_counts_to_csv(token_counts, local_file_path, model_name)
except Exception as e:
    # Top-level boundary: report the failure message only.
    print(f"Error: {e}")


File downloaded and saved as count_1w.txt
Words extracted: 333333
Total unique tokens: 22245
CSV file with tokenized words saved to: count_1w_tokenized_by_gpt-4o-mini.csv
CSV file with token frequencies saved to: count_1w_frequency_tokens_tokenized_by_gpt-4o-mini.csv
