In [None]:
!pip install --upgrade pip
!pip install sentencepiece pandas regex tqdm



In [None]:
# Mount Google Drive into the Colab filesystem; later cells read the raw
# dataset from, and copy the trained model to, paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
import unicodedata

# Raw corpus (read from the mounted Drive) and the cleaned corpus written by this cell.
input_file = "/content/drive/MyDrive/Dataset/dataset_raw.txt"
output_file = "/content/data/preprocessed/corpus.txt"

# Make sure the destination folder exists before the output file is opened.
output_dir = os.path.dirname(output_file)
os.makedirs(output_dir, exist_ok=True)

def preprocess_text_line(line):
    """Clean a single raw corpus line.

    Steps, in order:
      1. NFC-normalize the text.
      2. Drop (return "") lines longer than 10000 characters after
         normalization, printing a notice for each one.
      3. Drop lines that consist solely of the header word "शीर्षक" or
         "विवरण" (surrounding whitespace allowed).
      4. Map curly quotes to their straight ASCII equivalents.
      5. Collapse runs of spaces/tabs to a single space and strip the ends.

    Returns the cleaned line, or "" when the line should be discarded.
    """
    normalized = unicodedata.normalize("NFC", line)

    # Length guard: oversized lines are reported and discarded.
    char_total = len(normalized)
    if char_total > 10000:
        print(f"Skipping very long line ({char_total} chars)")
        return ""

    # Header-only lines carry no content.
    if re.match(r"^\s*(शीर्षक|विवरण)\s*$", normalized) is not None:
        return ""

    # Curly quotes -> straight quotes (each is a one-character substitution).
    straight_quotes = str.maketrans({'“': '"', '”': '"', '‘': "'", '’': "'"})
    unquoted = normalized.translate(straight_quotes)

    # Squeeze horizontal whitespace, then trim leading/trailing whitespace.
    return re.sub(r"[ \t]+", " ", unquoted).strip()

# Stream the raw corpus through preprocess_text_line and write the cleaned
# corpus. Discarded/empty lines become blank separators, but consecutive
# blanks are squeezed into a single blank line.
with open(input_file, "r", encoding="utf-8") as infile, \
     open(output_file, "w", encoding="utf-8") as outfile:

    previous_was_blank = False
    line_number = 0

    for raw_line in infile:
        line_number += 1
        try:
            cleaned = preprocess_text_line(raw_line)
        except Exception as err:
            # Best effort: report the failing line and treat it as blank.
            print(f"Error processing line {line_number}: {err}")
            cleaned = ""

        if cleaned:
            outfile.write(cleaned + "\n")
            previous_was_blank = False
        elif not previous_was_blank:
            outfile.write("\n")
            previous_was_blank = True

print(f"Preprocessed corpus saved at: {output_file}")

Skipping very long line (22356 chars)
Skipping very long line (22356 chars)
Skipping very long line (22356 chars)
Skipping very long line (22483 chars)
Skipping very long line (22356 chars)
Skipping very long line (15729 chars)
Skipping very long line (15742 chars)
Skipping very long line (15825 chars)
Skipping very long line (15737 chars)
Skipping very long line (15737 chars)
Skipping very long line (15729 chars)
Skipping very long line (15729 chars)
Skipping very long line (15728 chars)
Skipping very long line (13998 chars)
Skipping very long line (13998 chars)
Skipping very long line (13998 chars)
Skipping very long line (13998 chars)
Skipping very long line (13998 chars)
Skipping very long line (11821 chars)
Skipping very long line (10252 chars)
Skipping very long line (10282 chars)
Skipping very long line (11829 chars)
Skipping very long line (11898 chars)
Skipping very long line (11818 chars)
Skipping very long line (11889 chars)
Skipping very long line (11860 chars)
Skipping ver

In [None]:
from pathlib import Path

# Locate the preprocessed corpus written by the preprocessing cell.
data_pre = Path("/content/data/preprocessed")
pf = data_pre / "corpus.txt"

if not pf.is_file():
    raise SystemExit("No preprocessed files found. Run upload + preprocess steps.")

# Stream the file instead of loading it whole: the corpus has millions of
# lines and only a line count plus a short preview are needed here.
# Iterating the file also counts lines the same way every later cell reads
# them, whereas str.splitlines() additionally splits on extra Unicode line
# boundaries and can therefore report a different number.
PREVIEW_COUNT = 10
preview_lines = []
num_lines = 0
with pf.open("r", encoding="utf-8") as f:
    for num_lines, line in enumerate(f, 1):
        if num_lines <= PREVIEW_COUNT:
            preview_lines.append(line.rstrip("\n"))

print("Preprocessed file:", pf)
print("Number of lines:", num_lines)
print("\nFirst 10 lines:")
for i, l in enumerate(preview_lines, 1):
    print(i, "-", l)

Preprocessed file: /content/data/preprocessed/corpus.txt
Number of lines: 5587515

First 10 lines:
1 - सुर्खेत (रासस ।
2 - विसं २०४६ मा तत्कालीन मसालको विद्यार्थी संगठनको सदस्यता लिँदा उनी राप्ती ज्ञानोदय मावि रुकुममा कक्षा ६ मा अध्ययनरत थिइन् ।
3 - विद्यार्थी संगठनको सदस्य हुँदा राजनीतिबारे खासै थाहा थिएन ।
4 - पछि संगठनका अग्रजले मुलुकको राजनीतिक आर्थिक र सामाजिक अवस्थाको यथार्थता प्रष्ट्याउँदै जाँदा त्यसप्रति उनको चासो बढ्यो ।
5 - यद्यपि त्यतिबेला भने नेपालमा सशस्त्र युद्धको थालनी भइसकेको थिएन ।
6 - किशोर अवस्थादेखि राजनीतिक यात्रा शुरु गरेकी बिमला केसी रुकुम पश्चिमको सानीभेरी गाउँपालिका ९ सिम्ली निवासी हुन् ।
7 - सानै उमेरमा बुबाको मृत्यु भएकाले केसीलाई विद्यालय शिक्षा हाँसिल गर्न निकै समस्या भयो ।
8 - बुबाको मृत्युपछि आमाले भोगेको पीडा व्याप्त गरिबी र महिला हिंसाले उनलाई संघर्षमा उत्रन उत्प्रेरित गरायो ।
9 - विद्यार्थी राजनीतिबाट अघि बढेकी केसीपछि २०५२ सालमा तत्कालीन नेकपा (माओवादी)ले सञ्चालन गरेको जनयुद्धमा सहभागी भइन् ।
10 - पूर्णकालीन सदस्य बनेर माओवादी राजनीतिको यात्राका दौडान

In [None]:
import sentencepiece as spm
from pathlib import Path

subset_file = "subset_corpus.txt"

# The preprocessed corpus path comes from `output_file`, set in the preprocessing cell.
pf = Path(output_file)

# Copy the first 3 million corpus lines into a smaller training file.
SUBSET_MAX_LINES = 3000000
with open(subset_file, "w", encoding="utf-8") as f_out, \
     open(str(pf), "r", encoding="utf-8") as f_in:
    for _, line in zip(range(SUBSET_MAX_LINES), f_in):
        f_out.write(line)

# Trainer settings.
model_prefix = "nepali_tokenizer"
VOCAB_SIZE = 64000
MODEL_TYPE = "unigram"
CHAR_COVERAGE = 1.0

# Assemble the flag string handed to the SentencePiece trainer.
spm_cmd = " ".join([
    f"--input={subset_file}",
    f"--model_prefix={model_prefix}",
    f"--vocab_size={VOCAB_SIZE}",
    f"--model_type={MODEL_TYPE}",
    f"--character_coverage={CHAR_COVERAGE}",
    "--shuffle_input_sentence=true",
    "--input_sentence_size=1000000",
])

print("SentencePiece command:")
print(spm_cmd)

spm.SentencePieceTrainer.Train(spm_cmd)

print("\nTraining finished. Model files created:")
for extension in (".model", ".vocab"):
    print(model_prefix + extension)

SentencePiece command:
--input=subset_corpus.txt --model_prefix=nepali_tokenizer --vocab_size=64000 --model_type=unigram --character_coverage=1.0 --shuffle_input_sentence=true --input_sentence_size=1000000

Training finished. Model files created:
nepali_tokenizer.model
nepali_tokenizer.vocab


In [None]:
import sentencepiece as spm
# Build the processor and load the trained model in a single step.
sp = spm.SentencePieceProcessor(model_file="nepali_tokenizer.model")

def sp_pieces(text):
    """Tokenize ``text`` with the loaded model ``sp`` and return its pieces as strings."""
    pieces = sp.encode(text, out_type=str)
    return pieces

def sp_decode(pieces):
    """Join a list of string pieces back into text using the loaded model ``sp``."""
    text = sp.decode_pieces(pieces)
    return text

# A few sample sentences to eyeball the segmentation and the round trip.
tests = [
    "नेपाल एक सुन्दर देश हो।",
    "काठमाडौं नेपालको राजधानी हो।",
    "घरहरूमा धेरै मान्छे बस्छन्।"
]

for sentence in tests:
    pieces = sp_pieces(sentence)
    recon = sp_decode(pieces)
    # Show the input, its pieces and the text rebuilt from those pieces.
    for label, value in (("\nSENT:", sentence), ("PIECES:", pieces), ("RECON:", recon)):
        print(label, value)


SENT: नेपाल एक सुन्दर देश हो।
PIECES: ['▁नेपाल', '▁एक', '▁सुन्दर', '▁देश', '▁हो', '।']
RECON: नेपाल एक सुन्दर देश हो।

SENT: काठमाडौं नेपालको राजधानी हो।
PIECES: ['▁काठमाडौं', '▁नेपालको', '▁राजधानी', '▁हो', '।']
RECON: काठमाडौं नेपालको राजधानी हो।

SENT: घरहरूमा धेरै मान्छे बस्छन्।
PIECES: ['▁घर', 'हरूमा', '▁धेरै', '▁मान्छे', '▁बस्छन्', '।']
RECON: घरहरूमा धेरै मान्छे बस्छन्।


In [None]:
import pandas as pd
from pathlib import Path

vocab_path = Path("nepali_tokenizer.vocab")
if not vocab_path.exists():
    raise SystemExit("Vocab file not found. Run training cell.")

# Each vocab line is "<token>\t<score>"; a missing or empty score becomes None.
vocab_lines = vocab_path.read_text(encoding='utf-8').splitlines()
tokens = []
scores = []
for entry in vocab_lines:
    fields = entry.split("\t")
    tokens.append(fields[0])
    has_score = len(fields) > 1 and fields[1] != ""
    scores.append(float(fields[1]) if has_score else None)

df = pd.DataFrame({"token": tokens, "score": scores})
df["length_chars"] = df["token"].str.len()

# Preview the top of the table.
print("\nFirst 5 vocab entries:")
print(df.head(5))

# Number of distinct token strings in the vocabulary.
unique_vocab_count = df["token"].nunique()
print(f"\nTotal unique tokens in vocab: {unique_vocab_count}")


First 5 vocab entries:
   token    score  length_chars
0  <unk>  0.00000             5
1    <s>  0.00000             3
2   </s>  0.00000             4
3      । -2.98211             1
4      ▁ -3.08217             1

Total unique tokens in vocab: 64000


In [None]:
from statistics import mean

# For every non-blank line of the preprocessed corpus (pf), record how many
# whitespace-separated words it has and how many SentencePiece pieces the
# same (single-space re-joined) text produces.
ws_counts = []
sp_counts = []
with open(pf, 'r', encoding='utf-8') as corpus:
    for raw in corpus:
        words = raw.strip().split()
        if not words:
            continue
        sentence = " ".join(words)
        ws_counts.append(len(words))
        sp_counts.append(len(sp.encode(sentence, out_type=str)))

# Per-line ratio of SentencePiece pieces to whitespace words.
ratios = []
for n_pieces, n_words in zip(sp_counts, ws_counts):
    if n_words > 0:
        ratios.append(n_pieces / n_words)
    else:
        ratios.append(0)

print("Processed lines:", len(ws_counts))
print("Average whitespace tokens per line:", mean(ws_counts))
print("Average SentencePiece tokens per line:", mean(sp_counts))
print("Average token inflation ratio (SP / whitespace):", mean(ratios))

Processed lines: 5586977
Average whitespace tokens per line: 16.17308107765613
Average SentencePiece tokens per line: 19.792281407279823
Average token inflation ratio (SP / whitespace): 1.2442565697033334


In [None]:
from collections import Counter

# Pass 1: count every whitespace-delimited word in the corpus.
# Tokenizing each *occurrence* would call sp.encode tens of millions of
# times; counting first lets each distinct word be encoded exactly once.
word_freq = Counter()
with open(pf, 'r', encoding='utf-8') as f:
    for ln in f:
        word_freq.update(ln.split())

# Pass 2: keep the words that SentencePiece breaks into more than one piece,
# carrying over their occurrence counts. Insertion order (first occurrence
# in the corpus) is preserved, so ties in most_common() rank as before.
# Note: a word is encoded in isolation here, so the leading "▁" marker can
# itself count as a piece (e.g. a lone "।" becomes two pieces).
split_words = Counter()
for word, freq in word_freq.items():
    if len(sp.encode(word, out_type=str)) > 1:
        split_words[word] = freq

print("\nTop 20 most split words:")
for w, c in split_words.most_common(20):
    print(w, "→ split", c, "times")


Top 20 most split words:
। → split 4025073 times
छ। → split 442643 times
छन्। → split 179319 times
को → split 126451 times
हो। → split 97198 times
थियो। → split 94543 times
बताए। → split 83565 times
का → split 69085 times
थिए। → split 69015 times
हुन्। → split 40887 times
भने, → split 40493 times
दिए। → split 34137 times
ले → split 31513 times
बैंकले → split 29411 times
छैन। → split 29224 times
गरे। → split 24039 times
छ, → split 18298 times
बाट → split 18211 times
उनीहरुले → split 18200 times
हुन्छ। → split 17331 times


In [None]:
# Count non-blank corpus lines whose encoding contains the unknown token.
# The check is done on token *ids*: when asked for string pieces,
# SentencePiece returns the original surface text for out-of-vocabulary
# spans rather than the literal "<unk>", so a string comparison against
# "<unk>" would never match.
unk_id = sp.unk_id()
unk_lines = 0
nonblank_lines = 0  # counted here so the percentage does not rely on another cell's state
with open(pf, 'r', encoding='utf-8') as f:
    for ln in f:
        ln = ln.strip()
        if not ln:
            continue
        nonblank_lines += 1
        if unk_id in sp.encode(ln, out_type=int):
            unk_lines += 1

unk_pct = unk_lines / nonblank_lines * 100 if nonblank_lines else 0.0
print(f"\nLines containing <unk>: {unk_lines} ({unk_pct:.2f}%)")


Lines containing <unk>: 0 (0.00%)


In [None]:
from shutil import copy2
# Destination folder on the mounted Drive (created if it does not exist yet).
drive_target = Path("/content/drive/MyDrive/nepali_tokenizer_models")
drive_target.mkdir(parents=True, exist_ok=True)

# Copy each trained artifact that is present; report the ones that are not.
for fn in ("nepali_tokenizer.model", "nepali_tokenizer.vocab"):
    src = Path(fn)
    if not src.exists():
        print("Not found (skipping):", src)
        continue
    dst = drive_target / fn
    copy2(src, dst)
    print("Copied", src, "->", dst)

Copied nepali_tokenizer.model -> /content/drive/MyDrive/nepali_tokenizer_models/nepali_tokenizer.model
Copied nepali_tokenizer.vocab -> /content/drive/MyDrive/nepali_tokenizer_models/nepali_tokenizer.vocab


In [None]:
from statistics import mean
import numpy as np

# Length statistics over the vocabulary entries themselves; relies on the
# `df` table (with its 'token' column) built in the vocab-loading cell.
token_lengths = df['token'].map(len).tolist()
average_length = np.mean(token_lengths)
median_length = np.median(token_lengths)
print(f"\nAverage token length: {average_length:.2f} chars")
print(f"Median token length: {median_length:.2f} chars")


Average token length: 7.22 chars
Median token length: 7.00 chars


In [None]:
# Collect the distinct corpus words whose encoding contains the unknown token.
# Detection uses token ids: string pieces carry the raw surface text for
# out-of-vocabulary spans, so looking for the literal "<unk>" among pieces
# would never find anything.
unk_id = sp.unk_id()
seen_words = set()      # every distinct word is encoded only once
missing_words = set()   # de-duplicated as we go instead of building a huge list
with open(pf, 'r', encoding='utf-8') as f:
    for ln in f:
        for w in ln.split():
            if w in seen_words:
                continue
            seen_words.add(w)
            if unk_id in sp.encode(w, out_type=int):
                missing_words.add(w)

# Sorted so the printed examples are deterministic between runs.
missing = sorted(missing_words)
print(f"\nUnique words not in vocab: {len(missing)}")
if missing:
    print("Example missing words:", missing[:20])


Unique words not in vocab: 0


In [None]:
# Coverage
import collections

# Character coverage and round-trip accuracy, gathered in one pass over the corpus.
char_counts = collections.Counter()
errors = 0
total = 0

with open(pf, "r", encoding="utf-8") as f:
    for line in f:
        char_counts.update(line)

        line = line.strip()
        if not line:
            continue
        # Round-trip through token *ids*. Going through string pieces cannot
        # reveal unknown characters, because pieces keep the raw surface text
        # of out-of-vocabulary spans and decode it back verbatim.
        recon = sp.decode(sp.encode(line, out_type=int))
        if recon != line:
            errors += 1
        total += 1

# Single-character pieces present in the vocabulary.
vocab_chars = {sp.id_to_piece(i) for i in range(sp.get_piece_size()) if len(sp.id_to_piece(i)) == 1}

# Whitespace (spaces, newlines) is excluded from the statistic: SentencePiece
# represents it with the "▁" meta symbol, so it never appears as its own
# single-character piece and would otherwise be counted as "uncovered".
# NOTE(review): characters are compared as they appear in the file; if the
# model's normalizer rewrites a character, it may be reported as uncovered.
total_chars = sum(count for ch, count in char_counts.items() if not ch.isspace())
covered_chars = sum(
    count for ch, count in char_counts.items()
    if not ch.isspace() and ch in vocab_chars
)
coverage_percent = (covered_chars / total_chars) * 100 if total_chars else 0.0
print(f"✅ Character Coverage: {coverage_percent:.2f}%")

reconstruction_accuracy = (1 - errors / total) * 100 if total else 0.0
print(f"✅ Reconstruction Accuracy: {reconstruction_accuracy:.2f}%")

✅ Character Coverage: 84.49%
✅ Reconstruction Accuracy: 100.00%


In [None]:
import sentencepiece as spm
from pathlib import Path
import os

# Re-create the processor from the trained model file.
sp = spm.SentencePieceProcessor(model_file="nepali_tokenizer.model")

# Report how many pieces the loaded model contains.
vocab_size = sp.get_piece_size()
print(f"Vocabulary size: {vocab_size}")

# Evaluation reads a sample of lines from the training subset file.
subset_path = Path("subset_corpus.txt")

def read_lines(file_path, max_lines=100000):
    """Lazily yield at most ``max_lines`` lines from a UTF-8 text file.

    Each yielded line has its leading and trailing whitespace (including the
    newline) stripped; blank lines are yielded as empty strings and count
    toward the limit.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        remaining = max_lines
        for raw in handle:
            if remaining <= 0:
                return
            yield raw.strip()
            remaining -= 1

lines = list(read_lines(subset_path, max_lines=100000))
print(f"Loaded {len(lines)} lines for evaluation")

# Tokenize each line, collecting piece lengths and counting unknown tokens.
unk_id = sp.unk_id()
token_lengths = []
total_tokens = 0
oov_tokens = 0

for line in lines:
    tokens = sp.EncodeAsPieces(line)
    token_lengths.extend(len(token) for token in tokens)
    total_tokens += len(tokens)

    # Unknown tokens are counted on the id sequence. In the piece sequence an
    # out-of-vocabulary span shows up as its original surface text, not as
    # the "<unk>" string, so counting "<unk>" among pieces always gives 0.
    oov_tokens += sp.EncodeAsIds(line).count(unk_id)

avg_token_length = sum(token_lengths) / len(token_lengths) if token_lengths else 0
oov_rate = oov_tokens / total_tokens if total_tokens > 0 else 0

print(f"Average token length: {avg_token_length:.3f} characters")
print(f"OOV tokens: {oov_tokens} out of {total_tokens} tokens")
print(f"OOV rate: {oov_rate:.4%}")

Vocabulary size: 64000
Loaded 100000 lines for evaluation
Average token length: 5.264 characters
OOV tokens: 0 out of 1890384 tokens
OOV rate: 0.0000%


In [None]:

# Location of the model copy stored on Drive.
model_file = "/content/drive/MyDrive/nepali_tokenizer_models/nepali_tokenizer.model"

# Build the processor directly from that file.
sp = spm.SentencePieceProcessor(model_file=model_file)

# Sentence used for the demonstration.
text = "यो एउटा उदाहरण वाक्य हो।"

# Text -> string pieces.
pieces = sp.encode(text, out_type=str)
print("Pieces:", pieces)

# Text -> integer ids.
ids = sp.encode(text, out_type=int)
print("IDs:", ids)

# String pieces -> text.
decoded_text_from_pieces = sp.decode_pieces(pieces)
print("Decoded from pieces:", decoded_text_from_pieces)

# Integer ids -> text.
decoded_text_from_ids = sp.decode_ids(ids)
print("Decoded from IDs:", decoded_text_from_ids)

Pieces: ['▁यो', '▁एउटा', '▁उदाहरण', '▁वाक्य', '▁हो', '।']
IDs: [26, 191, 1557, 9394, 17, 4]
Decoded from pieces: यो एउटा उदाहरण वाक्य हो।
Decoded from IDs: यो एउटा उदाहरण वाक्य हो।
