In [2]:
import csv
import transformers
from transformers import AutoTokenizer
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load variables from the .env file and read the Hugging Face token from the
# HUGGINGFACE_TOKEN environment variable (the token is not hard-coded here).
load_dotenv()
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# Log in only when a token was found; otherwise print a notice and carry on.
# NOTE(review): presumably loading a gated model below fails without a login
# -- confirm whether the script should stop here instead.
if huggingface_token:
    login(huggingface_token)
    print("Logged in successfully!")
else:
    print("Hugging Face token is not set in the .env file.")

# Load the tokenizer for the model named in `model_name`; the trailing comment
# on that line lists the other model ids that can be swapped in.
model_name = "meta-llama/Llama-3.1-8B" #mistralai/Mistral-7B-v0.1, meta-llama/Llama-3.1-8B  , google/gemma-2-9b
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_word(word):
    """Tokenize *word* with the module-level ``tokenizer``.

    Returns a ``(count, pieces)`` tuple, where ``pieces`` is whatever
    ``tokenizer.tokenize(word)`` produced and ``count`` is its length.
    """
    pieces = tokenizer.tokenize(word)
    return len(pieces), pieces

def _tokenized_output_path(input_csv, model_id):
    """Return the output CSV path derived from *input_csv* and *model_id*."""
    # splitext removes only the final extension, so dots in directory names
    # ("./data/words.csv") or in the file stem ("a.b.csv") are preserved.
    stem, _ext = os.path.splitext(input_csv)
    return f"{stem}_tokenized_by_{model_id}.csv"


def process_csv(input_csv):
    """Tokenize the 'Word' column of a CSV and write an augmented copy.

    Every input row is copied to the output file with two extra columns:
    'Tokens_Number' (the count returned by ``tokenize_word``) and 'Tokens'
    (the tokens joined with single spaces). The output path is the input
    path with its extension replaced by ``_tokenized_by_<model>.csv``, where
    ``<model>`` is ``tokenizer.name_or_path`` with "/" replaced by "_".
    The output path is printed once processing is finished.

    Args:
        input_csv: Path to a UTF-8 CSV file whose header contains 'Word'.

    Raises:
        ValueError: If the file has no header row or no 'Word' column. The
            check runs before the output file is opened, so no partial
            output is left behind.
    """
    new_columns = ['Tokens_Number', 'Tokens']

    # newline='' is what the csv module expects for file objects; 'utf-8-sig'
    # reads plain UTF-8 unchanged but drops a leading BOM (as written by
    # Excel), which would otherwise end up glued to the first column name.
    with open(input_csv, 'r', newline='', encoding='utf-8-sig') as infile:
        reader = csv.DictReader(infile)
        header = reader.fieldnames  # None when the file is empty
        if not header or 'Word' not in header:
            raise ValueError(
                f"{input_csv!r} must have a header row with a 'Word' column "
                f"(found: {header})"
            )

        # Slashes in the model id would otherwise be read as directories.
        safe_model_name = tokenizer.name_or_path.replace("/", "_")
        output_csv = _tokenized_output_path(input_csv, safe_model_name)

        # Append the new columns only if absent, so re-processing an already
        # tokenized file does not produce duplicated header entries.
        fieldnames = list(header) + [
            column for column in new_columns if column not in header
        ]

        with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()

            for row in reader:
                num_tokens, tokens = tokenize_word(row['Word'])
                row['Tokens_Number'] = num_tokens
                row['Tokens'] = ' '.join(tokens)
                writer.writerow(row)

    print(f"Processed file saved to: {output_csv}")

# Path to the input CSV file; process_csv reads its 'Word' column.
input_csv = 'test_words.csv'  # Original file name

# Run the CSV processing function: it writes a new CSV with the added token
# columns and prints the path of the file it created.
process_csv(input_csv)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\raque\.cache\huggingface\token
Login successful
Logged in successfully!
Processed file saved to: test_words_tokenized_by_meta-llama_Llama-3.1-8B.csv
