In [2]:
!# Install SentencePiece
!pip install sentencepiece




In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("praveengovi/tamil-language-corpus-for-nlp")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/praveengovi/tamil-language-corpus-for-nlp?dataset_version_number=8...


100%|██████████| 2.27G/2.27G [00:39<00:00, 61.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/praveengovi/tamil-language-corpus-for-nlp/versions/8


# Using SentencePiece

In [4]:
import sentencepiece as spm
import re
import tempfile
import csv

def readcorpus(corpus_path:str):
    with open(corpus_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

def read_corpus_from_csv(corpus_path: str):
    """Reads text data from a CSV file, extracting the relevant column.

    Args:
        corpus_path (str): The path to the CSV file.

    Returns:
        list[str]: A list of sentences extracted from the CSV file.
    """
    with open(corpus_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Assuming the text data is in the second column (index 1)
        # and each cell contains one or more sentences separated by '\n'.
        text_data = []
        for row in reader:
            if row:
                text_data.extend(row[1].split('\n'))  # Splitting cell content into sentences
    return text_data

input_file_path = '/root/.cache/kagglehub/datasets/praveengovi/tamil-language-corpus-for-nlp/versions/8/Tamil_News_Corpus/Tamil_news_Dataset/Tamilmurasu_dataset_06_Jan_2011_06_Jan_2020.csv'
input_file = read_corpus_from_csv(input_file_path) # Pass the file path to the function
model_prefix = "tamil_bpe"
vocab_size = 5000

# Create a temporary file to store the sentences for training
with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) as temp_file:
    for sentence in input_file:
        temp_file.write(sentence + '\n')  # Writing each sentence on a new line
    temp_file_path = temp_file.name  # Get the path of the temporary file

# Train the BPE model, using the temporary file path as input
spm.SentencePieceTrainer.train(
    input=temp_file_path,  # Using the temporary file path
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=0.995,  # To ensure most Tamil characters are included
    model_type='bpe'
)

# Load the model
sp = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")

# Tokenize a Tamil sentence
text = "உலகத்தில் மாறாத ஒன்று மாறுதலே."
tokens = sp.encode(text, out_type=str)
print(f"Tokens: {tokens}")

# Decode back to text
decoded_text = sp.decode(tokens)
print(f"Decoded Text: {decoded_text}")


# Test compression ratio
def calculate_compression_ratio(text, sp_model):
    original_length = len(text)
    tokenized_length = len(sp_model.encode(text, out_type=str))
    return original_length / tokenized_length

test_text = "உலகில் மாறாத ஒன்று மாறுதல் மட்டுமே."  # Example Tamil text
compression_ratio = calculate_compression_ratio(test_text, sp)
print(f"Compression Ratio: {compression_ratio}")


Tokens: ['▁', 'உலகத்தில்', '▁', 'மாறாத', '▁', 'ஒன்று', '▁', 'மாறுதலே.']
Decoded Text: உலகத்தில் மாறாத ஒன்று மாறுதலே.
Compression Ratio: 3.5


# **Using** BPE Tokenizer merge

In [7]:
import pandas as pd
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers

# Load CSV
csv_file = "/root/.cache/kagglehub/datasets/praveengovi/tamil-language-corpus-for-nlp/versions/8/Tamil_News_Corpus/Tamil_news_Dataset/Tamilmurasu_dataset_06_Jan_2011_06_Jan_2020.csv"  # Update with your actual CSV file
df = pd.read_csv(csv_file)

# Check column names and select the Tamil text column
print(df.head())  # Inspect column names
print(df.columns)  # Print the available column names
tamil_texts = df["news_article"].dropna().tolist()  # Assuming 'text' column contains Tamil sentences

# Save as a raw text file (needed for tokenizer training)
with open("tamil_corpus.txt", "w", encoding="utf-8") as f:
    for line in tamil_texts:
        f.write(line + "\n")

# Initialize BPE Tokenizer
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Set Normalization (Handle Tamil text properly)
tokenizer.normalizer = normalizers.NFKC()

# Set Pre-tokenization (Whitespace-based)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Define Trainer for BPE with Vocabulary Limit
trainer = trainers.BpeTrainer(
    vocab_size=5000,  # Set vocabulary size
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)

# Train the Tokenizer on Extracted Tamil Corpus
tokenizer.train(files=["tamil_corpus.txt"], trainer=trainer)

# Save trained tokenizer
tokenizer.save("bpe_tamil_tokenizer.json")
print("Tokenizer training completed. Model saved.")


   news_id            news_date news_category  \
0        6  1/6/2011 2:45:49 PM        மர்மம்   
1        9  1/6/2011 2:56:51 PM        மர்மம்   
2       10  1/6/2011 3:02:00 PM       இந்தியா   
3       11  1/6/2011 3:08:15 PM        மர்மம்   
4       12  1/6/2011 3:09:20 PM        மர்மம்   

                                          news_title  \
0  தூக்கில் தொங்கும் சேவல்கள் திருடர்களை காவு வாங...   
1                பவுர்ணமி ஜாமத்தில் மாயமான கர்ப்பிணி   
2  காமன்வெல்த் ஊழல்: சுரேஷ் கல்மாடியிடம் 102 கேள்...   
3                            மச்சுபிச்சு மலை ரகசியம்   
4                      ரத்த பலி வாங்கும் விபரீத ஆவி!   

                                        news_article  
0  நாலு ஆள் உயரம், முறுக்கு மீசை, கையில் வீச்சரிவ...  
1  அமானுஷ்யமான சம்பவங்கள் நம்மை சுற்றி ஆங்காங்கே ...  
2  காமன்வெல்த் போட்டி ஏற்பாட்டில் நடைபெற்ற முறைகே...  
3  தென்அமெரிக்க நாடான பெருவில் காடுகள் மிகவும் பய...  
4  கடந்த 18ம் தேதி சாயங்காலம்... அடைமழையை கிழித்த...  
Index(['news_id', 'news_date', 'news_c

In [8]:
# Test sentence
text = "தமிழ் மொழி ஒரு அழகான மொழியாகும்."

# Tokenize
encoded = tokenizer.encode(text)

# Compute compression ratio
original_length = len(text)
tokenized_length = len(encoded.tokens)
compression_ratio = original_length / tokenized_length

print("Tokens:", encoded.tokens)
print("Compression Ratio:", compression_ratio)


Tokens: ['தமிழ்', 'மொழி', 'ஒரு', 'அழ', 'கான', 'மொ', 'ழ', 'ியாக', 'ும்', '.']
Compression Ratio: 3.2


In [None]:
!pip install tokenizers




In [10]:
!pip install tokenizers huggingface_hub pandas





In [11]:
from huggingface_hub import notebook_login

notebook_login()  # This will ask you to enter your Hugging Face token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
from huggingface_hub import HfApi

repo_id = "antonyrajesh/Tamil-BPE-Tokenizer"  # Replace with your HF username & repo name
api = HfApi()

# Upload Tokenizer File
api.upload_file(
    path_or_fileobj="bpe_tamil_tokenizer.json",
    path_in_repo="bpe_tamil_tokenizer.json",
    repo_id=repo_id,
)

print("Tokenizer uploaded successfully!")


No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer uploaded successfully!
