In [None]:
import csv
import re
import os
import requests  # To download the file from the URL
from collections import Counter
from transformers import AutoTokenizer
from huggingface_hub import login
from dotenv import load_dotenv

# Read the Hugging Face token from the environment; load_dotenv() runs first so
# that a local .env file (if one exists) can supply HUGGINGFACE_TOKEN.
load_dotenv()
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

# Authenticate only when a non-empty token was found; otherwise just report it
# and continue (loading the tokenizer below is still attempted).
if not huggingface_token:
    print("Hugging Face token is not set in the .env file.")
else:
    login(huggingface_token)
    print("Logged in successfully!")

# Model whose tokenizer is analysed; change the identifier to study another model.
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to remove the special character '▁' from the first token of each word
def clean_special_character(tokens):
    """
    Returns the tokens of a word with a leading '▁' stripped from the first token.

    Only the first token is inspected, and only a single leading '▁' is
    removed from it; every other token is passed through unchanged. The input
    sequence is never modified: a new list is built, so a caller that keeps a
    reference to the tokenizer's output does not see it change.

    Args:
        tokens (list of str): List of tokens for a word.

    Returns:
        list of str: A new list with the special character removed from the
        first token. An empty (or None) input is returned as-is.
    """
    if not tokens:
        # Nothing to clean; hand the empty/None value straight back.
        return tokens

    first, *remaining = tokens
    if first.startswith("▁"):
        first = first[1:]  # Drop only the marker character itself
    return [first, *remaining]

# Function to read the file and extract words and their frequencies
def extract_words_and_frequencies(file_path):
    """
    Reads a "word frequency" text file into a list of (word, frequency) pairs.

    Each line is split on whitespace. Lines that do not yield exactly two
    fields (blank lines, lines with extra columns, ...) are ignored.

    Args:
        file_path (str): Path of the UTF-8 encoded text file to read.

    Returns:
        list of tuple: (word, frequency) pairs in file order, with the
        frequency converted to int.

    Raises:
        ValueError: If the second field of a two-field line is not an integer.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        split_lines = (raw_line.split() for raw_line in handle)
        return [
            (fields[0], int(fields[1]))
            for fields in split_lines
            if len(fields) == 2  # Keep only well-formed "word frequency" lines
        ]

# Function to tokenize words and calculate the cumulative frequency of tokens
def tokenize_and_count(words_with_freq, tokenizer):
    """
    Tokenizes every word and accumulates frequency-weighted token counts.

    Each word is passed to ``tokenizer.tokenize`` and the result is cleaned
    with ``clean_special_character``. Every token occurrence of a word then
    adds that word's frequency to the token's running total.

    Args:
        words_with_freq (iterable of tuple): (word, frequency) pairs.
        tokenizer: Object exposing a ``tokenize(word)`` method that returns
            the list of tokens for the word.

    Returns:
        tuple: ``(token_counts, tokenized_words)`` where ``token_counts`` is a
        Counter mapping token -> cumulative frequency, and ``tokenized_words``
        is a list of ``(word, frequency, num_tokens, tokens)`` tuples in input
        order.
    """
    per_word_records = []
    cumulative = Counter()

    for word, frequency in words_with_freq:
        pieces = clean_special_character(tokenizer.tokenize(word))
        per_word_records.append((word, frequency, len(pieces), pieces))
        # A token that occurs twice in one word receives the frequency twice.
        for piece in pieces:
            cumulative[piece] += frequency

    return cumulative, per_word_records

# Function to save words, frequencies, number of tokens, and subwords to a CSV
def save_word_tokens_to_csv(tokenized_words, input_file, model_name):
    """
    Writes one CSV row per word: word, frequency, token count and subwords.

    The output file is created in the current working directory and named
    ``<input base name>_tokenized_by_<model name>.csv``, where the base name is
    the part of ``input_file`` after the last '/' without a trailing ``.txt``,
    and every '/' in ``model_name`` is replaced by '_'.

    Args:
        tokenized_words (iterable of tuple): (word, frequency, num_tokens,
            tokens) entries; ``tokens`` is joined with single spaces.
        input_file (str): Path of the input file the words came from.
        model_name (str): Name of the model whose tokenizer was used.
    """
    file_name = input_file.split("/")[-1]
    stem = re.sub(r'\.txt$', '', file_name)  # Drop the .txt extension, if any
    model_tag = model_name.replace('/', '_')
    output_file_path = f"{stem}_tokenized_by_{model_tag}.csv"

    with open(output_file_path, mode='w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(['Word', 'Frequency', 'Number of Tokens', 'Subwords'])
        sheet.writerows(
            [word, frequency, num_tokens, ' '.join(tokens)]
            for word, frequency, num_tokens, tokens in tokenized_words
        )

    print(f"CSV file with tokenized words saved to: {output_file_path}")

# Function to save tokens and their frequencies to a CSV
def save_token_counts_to_csv(token_counts, input_file, model_name):
    """
    Writes one CSV row per token with its cumulative frequency.

    The output file is created in the current working directory and named
    ``<input base name>_frequency_tokens_tokenized_by_<model name>.csv``, where
    the base name is the part of ``input_file`` after the last '/' without a
    trailing ``.txt``, and every '/' in ``model_name`` is replaced by '_'.

    Args:
        token_counts (mapping): Token -> cumulative frequency; rows are written
            in the mapping's iteration order.
        input_file (str): Path of the input file the words came from.
        model_name (str): Name of the model whose tokenizer was used.
    """
    file_name = input_file.split("/")[-1]
    stem = re.sub(r'\.txt$', '', file_name)  # Drop the .txt extension, if any
    model_tag = model_name.replace('/', '_')
    output_file_path = f"{stem}_frequency_tokens_tokenized_by_{model_tag}.csv"

    with open(output_file_path, mode='w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(['Token', 'Frequency'])
        # Each (token, count) pair becomes one two-column row.
        sheet.writerows(token_counts.items())

    print(f"CSV file with token frequencies saved to: {output_file_path}")

# URL of the word-frequency list and the local file name it is saved under
url = "https://norvig.com/ngrams/count_1w.txt"
local_file_path = "count_1w.txt"

# Download the file, then run the full pipeline: parse -> tokenize -> write CSVs.
# Any failure in any step is caught below and reported as a single "Error: ..." line.
try:
    # Download the file. requests applies no timeout by default, so without one
    # a stalled connection would block this script forever; a timeout turns
    # that into an exception that the handler below reports.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(local_file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print(f"File downloaded and saved as {local_file_path}")
    else:
        raise Exception(f"Error downloading the file. Status code: {response.status_code}")

    # Extract words and their frequencies from the TXT file
    words_with_freq = extract_words_and_frequencies(local_file_path)
    print(f"Words extracted: {len(words_with_freq)}")

    # Tokenize words and calculate cumulative token frequencies
    token_counts, tokenized_words = tokenize_and_count(words_with_freq, tokenizer)
    print(f"Total unique tokens: {len(token_counts)}")

    # Save tokenized words and frequencies to a CSV
    save_word_tokens_to_csv(tokenized_words, local_file_path, model_name)

    # Save tokens and their cumulative frequencies to another CSV
    save_token_counts_to_csv(token_counts, local_file_path, model_name)
except Exception as e:
    # Top-level boundary of the script: report the problem instead of a traceback.
    print(f"Error: {e}")



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\raque\.cache\huggingface\token
Login successful


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


File downloaded and saved as count_1w.txt
Words extracted: 333333
Total unique tokens: 37293
CSV file with tokenized words saved to: count_1w_tokenized_by_google_gemma-2-9b.csv
CSV file with token frequencies saved to: count_1w_frequency_tokens_tokenized_by_google_gemma-2-9b.csv
