In [1]:
from gujarati_tokenizer import GujaratiTokenizer, process_dataset, save_tokenized_data, compute_corpus_statistics
from datasets import load_dataset
from collections import Counter
import os

In [2]:
print("Loading Gujarati dataset from IndicCorpV2...")
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="guj_Gujr", streaming=True)
print("Dataset loaded successfully!")

Loading Gujarati dataset from IndicCorpV2...
Dataset loaded successfully!


In [3]:
tokenizer = GujaratiTokenizer()

test_text = """આજે હું શાળાએ ગયો! મારું ઇમેઇલ student@gmail.com છે। 
આજની તારીખ 30/07/2025 છે અને સમય 10.30 વાગ્યે છે। 
વેબસાઇટ www.gujarati.com પર જાઓ। આ કિંમત ₹123.45 છે।"""

print("Original text:")
print(test_text)
print("\n" + "="*50)

sentences = tokenizer.sentence_tokenize(test_text)
print(f"Number of sentences: {len(sentences)}")
print("\nSentences:")
for i, sent in enumerate(sentences, 1):
    print(f"{i}. {sent.strip()}")

print("\n" + "="*50)

words = tokenizer.word_tokenize(test_text)
print(f"Number of tokens: {len(words)}")
print("\nTokens with classification:")
for word in words[:20]:
    word_type = tokenizer.classify_token(word)
    print(f"'{word}' -> {word_type}")

print("Processing the dataset...")
print("Note: Processing 1000 examples. Adjust max_examples as needed.")

processed_data = process_dataset(dataset, max_examples=1000)


Original text:
આજે હું શાળાએ ગયો! મારું ઇમેઇલ student@gmail.com છે। 
આજની તારીખ 30/07/2025 છે અને સમય 10.30 વાગ્યે છે। 
વેબસાઇટ www.gujarati.com પર જાઓ। આ કિંમત ₹123.45 છે।

Number of sentences: 10

Sentences:
1. આજે હું શાળાએ ગયો
2. મારું ઇમેઇલ student@gmail
3. com છે
4. આજની તારીખ 30/07/2025 છે અને સમય 10
5. 30 વાગ્યે છે
6. વેબસાઇટ www
7. gujarati
8. com પર જાઓ
9. આ કિંમત ₹123
10. 45 છે

Number of tokens: 34

Tokens with classification:
'આજે' -> gujarati_word
'હું' -> gujarati_word
'શાળાએ' -> gujarati_word
'ગયો' -> gujarati_word
'!' -> punctuation
'મારું' -> gujarati_word
'ઇમેઇલ' -> gujarati_word
'student@gmail.com' -> email
'છે' -> gujarati_word
'।' -> punctuation
'આજની' -> gujarati_word
'તારીખ' -> gujarati_word
'30' -> integer
'/' -> punctuation
'07' -> integer
'/' -> punctuation
'2025' -> integer
'છે' -> gujarati_word
'અને' -> gujarati_word
'સમય' -> gujarati_word
Processing the dataset...
Note: Processing 1000 examples. Adjust max_examples as needed.
Processing up to 1000 examples

In [4]:
print(f"Successfully processed {len(processed_data)} documents")

print("Saving tokenized data...")
save_tokenized_data(processed_data, 'gujarati_corpus_tokenized')
print("Data saved successfully!")

Successfully processed 500 documents
Saving tokenized data...
Data saved as:
- gujarati_corpus_tokenized.json (JSON format)
- gujarati_corpus_tokenized.pkl (Pickle format)
- gujarati_corpus_tokenized_stats.txt (Statistics)
Data saved successfully!


In [5]:
# Cell 6: Compute and display corpus statistics
print("Computing corpus statistics...")

stats = compute_corpus_statistics(processed_data)

print("\n" + "="*60)
print("CORPUS STATISTICS")
print("="*60)
for key, value in stats.items():
    print(f"{key:45}: {value}")
print("="*60)

Computing corpus statistics...

CORPUS STATISTICS
Total number of sentences                    : 1632
Total number of words                        : 23504
Total number of characters                   : 132007
Average sentence length (words per sentence) : 14.4
Average word length (characters per word)    : 4.63
Type-Token Ratio (TTR)                       : 0.3609
Total unique words                           : 8482


In [6]:
print("Detailed Analysis of Processed Data")
print("="*40)

if processed_data:
    sample_doc = processed_data[0]
    print(f"Sample document analysis:")
    print(f"Original text length: {len(sample_doc['original_text'])} characters")
    print(f"Number of sentences: {len(sample_doc['sentences'])}")
    print(f"Total words in document: {sample_doc['total_words']}")
    print(f"First 100 characters: {sample_doc['original_text'][:100]}...")
    
    print(f"\nFirst sentence details:")
    if sample_doc['sentences']:
        first_sentence = sample_doc['sentences'][0]
        print(f"Sentence text: {first_sentence['text']}")
        print(f"Number of words: {first_sentence['word_count']}")
        print(f"Words: {first_sentence['words'][:10]}...")
        
        print(f"Token classifications (first 10):")
        for word, word_type in first_sentence['classified_words'][:10]:
            print(f"  '{word}' -> {word_type}")


Detailed Analysis of Processed Data
Sample document analysis:
Original text length: 54 characters
Number of sentences: 1
Total words in document: 11
First 100 characters: આ વીડિયો જુઓ: ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધી બંધ...

First sentence details:
Sentence text: આ વીડિયો જુઓ: ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધી બંધ
Number of words: 11
Words: ['આ', 'વીડિયો', 'જુઓ', ':', 'ઊંઝા', 'માર્કેટયાર્ડ', 'આજથી', '25', 'જુલાઈ', 'સુધી']...
Token classifications (first 10):
  'આ' -> gujarati_word
  'વીડિયો' -> gujarati_word
  'જુઓ' -> gujarati_word
  ':' -> punctuation
  'ઊંઝા' -> gujarati_word
  'માર્કેટયાર્ડ' -> gujarati_word
  'આજથી' -> gujarati_word
  '25' -> integer
  'જુલાઈ' -> gujarati_word
  'સુધી' -> gujarati_word


In [7]:
print("\nToken Type Distribution Analysis")
print("="*40)

all_token_types = []
gujarati_words = []
english_words = []
numbers = []
emails = []
urls = []

for doc in processed_data:
    for sentence in doc['sentences']:
        for word, word_type in sentence['classified_words']:
            all_token_types.append(word_type)
            
            if word_type == 'gujarati_word':
                gujarati_words.append(word)
            elif word_type == 'english_word':
                english_words.append(word)
            elif word_type in ['integer', 'decimal_number']:
                numbers.append(word)
            elif word_type == 'email':
                emails.append(word)
            elif word_type == 'url':
                urls.append(word)

type_distribution = Counter(all_token_types)
print("Token type distribution:")
for token_type, count in type_distribution.most_common():
    percentage = (count / len(all_token_types)) * 100
    print(f"{token_type:20}: {count:8} ({percentage:.2f}%)")


Token Type Distribution Analysis
Token type distribution:
gujarati_word       :    21576 (91.80%)
punctuation         :     1381 (5.88%)
integer             :      434 (1.85%)
english_word        :      111 (0.47%)
url                 :        2 (0.01%)


In [8]:
print("\n" + "="*50)
print("SAMPLE TOKENS BY TYPE")
print("="*50)

print(f"\nSample Gujarati words ({len(set(gujarati_words))} unique):")
unique_gujarati = list(set(gujarati_words))[:10]
for word in unique_gujarati:
    print(f"  {word}")

if english_words:
    print(f"\nSample English words ({len(set(english_words))} unique):")
    unique_english = list(set(english_words))[:10]
    for word in unique_english:
        print(f"  {word}")

if numbers:
    print(f"\nSample numbers ({len(set(numbers))} unique):")
    unique_numbers = list(set(numbers))[:10]
    for word in unique_numbers:
        print(f"  {word}")

if emails:
    print(f"\nEmails found ({len(set(emails))} unique):")
    unique_emails = list(set(emails))[:5]
    for email in unique_emails:
        print(f"  {email}")

if urls:
    print(f"\nURLs found ({len(set(urls))} unique):")
    unique_urls = list(set(urls))[:5]
    for url in unique_urls:
        print(f"  {url}")




SAMPLE TOKENS BY TYPE

Sample Gujarati words (8185 unique):
  વહેતી
  ઉપરાતં
  અસાઇનમેન્ટની
  ઉદયસિંહ
  સ્મગલર
  ભગત
  સિંહ
  સીસ્ટમ
  લોડ
  ફાયદો

Sample English words (99 unique):
  Shraddha
  tablecloth
  Neeraj
  HalloApp
  BCCI
  Crorepati
  RTGS
  LED
  Sledkov
  GIDC

Sample numbers (175 unique):
  36
  2021
  ૧૫
  08
  ૦
  ૭
  ૪૦
  65
  ૬૦૦
  ૬૦

URLs found (2 unique):
  https://events
  http://tiny


In [9]:

print("\nFile verification:")
files_to_check = [
    'gujarati_corpus_tokenized.json',
    'gujarati_corpus_tokenized.pkl', 
    'gujarati_corpus_tokenized_stats.txt'
]

for filename in files_to_check:
    if os.path.exists(filename):
        size = os.path.getsize(filename)
        print(f"✓ {filename}: {size:,} bytes")
    else:
        print(f"✗ {filename}: Not found")


File verification:
✓ gujarati_corpus_tokenized.json: 3,678,244 bytes
✓ gujarati_corpus_tokenized.pkl: 1,321,873 bytes
✓ gujarati_corpus_tokenized_stats.txt: 258 bytes
