In [1]:
import re
from collections import Counter

In [2]:
def load_text(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents as one string.

    Raises:
        OSError: If the file cannot be opened (for example FileNotFoundError).
        UnicodeDecodeError: If the file's bytes are not valid UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents


In [None]:
def sentence_tokenizer(text):
    """Split text into sentences at whitespace that follows a terminator.

    A boundary is any run of whitespace immediately preceded by one of
    ``।`` (danda), ``.``, ``!`` or ``?``. The terminator stays attached to
    the sentence it ends; the whitespace itself is discarded.

    Args:
        text: The text to split.

    Returns:
        A list of sentences with surrounding whitespace removed. Pieces that
        are empty after stripping are dropped.
    """
    boundary = re.compile(r'(?<=[।.!?])\s+')
    # Strip each piece once, then filter on the stripped value.
    trimmed = (piece.strip() for piece in boundary.split(text))
    return [piece for piece in trimmed if piece]


In [4]:
def word_tokenizer(sentence):
    """Split a sentence into word, number, e-mail, URL, date and punctuation tokens.

    Alternatives are tried in the order listed in the pattern, so e-mail
    addresses, URLs, dates and decimals are kept whole before the generic
    number, word and punctuation rules apply. Whitespace is never a token.

    Python's ``re`` treats ``\\w`` as "alphanumeric or underscore", which
    excludes combining marks. Devanagari vowel signs, virama, anusvara and
    similar marks would therefore fall through to the punctuation rule and
    break every word into fragments. The word rule explicitly adds the
    Devanagari range U+0900-U+0963 plus ZWNJ/ZWJ so such words stay whole.
    The danda (U+0964) and double danda (U+0965) are left out on purpose so
    they are still emitted as punctuation.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of token strings in the order they occur in ``sentence``.
    """
    pattern = r'''
        [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+ |     # Email
        https?://\S+ |                                       # URL
        \d{1,2}/\d{1,2}/\d{2,4} |                            # Dates
        \d+\.\d+ |                                           # Decimal numbers
        \d+ |                                                # Whole numbers
        [\w\u0900-\u0963\u200C\u200D]+ |                     # Words, including Devanagari signs and joiners
        [^\w\s]                                              # Punctuation
    '''
    return re.findall(pattern, sentence, re.VERBOSE)


In [5]:
def save_list_to_file(data_list, filename):
    """Write every string in ``data_list`` to ``filename``, one per line.

    The file is created or truncated and encoded as UTF-8. A newline is
    appended to each item, so the file ends with a trailing newline whenever
    ``data_list`` is non-empty.

    Args:
        data_list: Iterable of strings to write.
        filename: Path of the output file.

    Raises:
        TypeError: If an item cannot be concatenated with a string.
        OSError: If the file cannot be opened for writing.
    """
    with open(filename, mode='w', encoding='utf-8') as sink:
        sink.writelines(entry + '\n' for entry in data_list)


In [6]:
def compute_statistics(sentences, words):
    """Summarise a tokenized corpus with simple count-based statistics.

    Args:
        sentences: Sequence of sentences; only its length is used.
        words: Sequence of token strings. Every element counts as one word,
            and only the characters inside tokens are counted.

    Returns:
        A dict with the keys "Total Sentences", "Total Words",
        "Total Characters", "Avg Sentence Length" (words per sentence),
        "Avg Word Length" (characters per word) and "Type/Token Ratio"
        (distinct tokens divided by total tokens). Each ratio is 0 when its
        denominator is 0.
    """
    def _safe_ratio(numerator, denominator):
        # Avoid ZeroDivisionError on empty input by reporting 0 instead.
        return numerator / denominator if denominator else 0

    sentence_count = len(sentences)
    token_count = len(words)
    character_count = sum(map(len, words))
    distinct_tokens = len(set(words))

    return {
        "Total Sentences": sentence_count,
        "Total Words": token_count,
        "Total Characters": character_count,
        "Avg Sentence Length": _safe_ratio(token_count, sentence_count),
        "Avg Word Length": _safe_ratio(character_count, token_count),
        "Type/Token Ratio": _safe_ratio(distinct_tokens, token_count),
    }


In [None]:
# Corpus file to analyse. The path is relative, so it is resolved against the
# kernel's current working directory; load_text raises FileNotFoundError if
# the file is not there.
file_name = "gom.txt"
# Read the whole corpus into memory as a single string.
text = load_text(file_name)


FileNotFoundError: [Errno 2] No such file or directory: 'gom.txt'

In [None]:
# Split the raw corpus text into a list of stripped sentence strings.
sentences = sentence_tokenizer(text)

In [None]:
# Tokenize every sentence once, then derive both outputs from the
# per-sentence token lists.
token_lists = [word_tokenizer(sentence) for sentence in sentences]

# Flat list of every token, in corpus order.
all_words = [token for tokens in token_lists for token in tokens]
# One line per sentence, with its tokens joined by single spaces.
tokenized_sentences = [' '.join(tokens) for tokens in token_lists]

In [None]:
# Persist the space-joined token form of each sentence, one sentence per line.
save_list_to_file(tokenized_sentences, "tokenized_sentences.txt")

In [None]:
# Persist every token from the corpus, one token per line.
save_list_to_file(all_words, "tokenized_words.txt")

In [None]:
# Corpus-level summary computed from the sentence list and the flat token list.
stats = compute_statistics(sentences, all_words)

# Header followed by one "name: value" line per statistic, in dict order.
print(
    "Statistics:",
    *(f"{label}: {value}" for label, value in stats.items()),
    sep="\n",
)