In [1]:
import numpy as np
import pandas as pd
from preprocess import (
    remove_punctuation,
    split_sentences,
    process_suffixes,
    get_suffixes,
)

In [2]:
# CONSTANTS
# Directory that the corpus-loading cell lists and reads documents from.
CORPUS_PATH = './corpus'
# Suffix data returned by preprocess.get_suffixes(); handed to
# process_suffixes() for every tokenised sentence.
SUFFIXES = get_suffixes()

In [3]:
# GET CORPUS
# Read every regular file directly under CORPUS_PATH into `all_content`
# (one string per document).
import os

# os.listdir() returns entries in arbitrary order; sorting makes the load
# order (and therefore anything derived from it) reproducible across runs.
content_files = sorted(os.listdir(CORPUS_PATH))

all_content = []

for filename in content_files:
    path = os.path.join(CORPUS_PATH, filename)
    # Skip sub-directories (e.g. .ipynb_checkpoints) -- open() would raise on them.
    if not os.path.isfile(path):
        continue
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        all_content.append(f.read())


In [4]:
def clean_and_extract_sentences(text):
    """Convert raw text into a list of sentences, each a list of tokens.

    The text is passed through ``remove_punctuation`` and then
    ``split_sentences``. Every resulting sentence is split with
    ``str.split()``, run through ``process_suffixes`` together with the
    notebook-level ``SUFFIXES``, and stripped of falsy entries.

    Returns:
        A list with one token list per sentence, in the order produced by
        ``split_sentences``. A sentence with no surviving tokens yields an
        empty list.
    """
    def _tokenise(sentence):
        # Keep only truthy pieces both before and after suffix processing.
        words = list(filter(None, sentence.split()))
        return list(filter(None, process_suffixes(SUFFIXES, words)))

    return [
        _tokenise(sentence)
        for sentence in split_sentences(remove_punctuation(text))
    ]

In [5]:
# Tally how often each processed token occurs across the whole corpus;
# the number of keys is the number of distinct tokens.
from collections import Counter

words_count = Counter()
for content in all_content:
    for sentence in clean_and_extract_sentences(content):
        # Each sentence is a list of tokens; update() adds one per occurrence.
        words_count.update(sentence)

print(len(words_count))

118872


In [7]:
# Rank words by corpus frequency (highest first), timing the sort,
# then show the 200 most frequent entries.
import time

sort_started = time.time()
# most_common() with no argument returns every (word, count) pair ordered
# by descending count.
sorted_words_list = words_count.most_common()
sort_finished = time.time()
print('Time taken to sort : ', sort_finished - sort_started)
print(sorted_words_list[:200])

def a(path='words_count.txt', counts=None):
    """Write word counts to a text file, most frequent word first.

    Each output line has the form ``<word> <count>``.

    Args:
        path: Destination file. Defaults to ``'words_count.txt'``, the
            location the original hard-coded.
        counts: Mapping of word -> count to save. When omitted, the
            notebook-level ``words_count`` is used.
    """
    if counts is None:
        counts = words_count
    # Sort by count, descending, so the saved list really is sorted;
    # previously the entries were written unsorted.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # The words are Devanagari; write UTF-8 explicitly rather than relying
    # on the platform default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        for word, count in ranked:
            f.write(f'{word} {count}\n')

Time taken to sort :  0.04373979568481445
[('N', 196257), ('छ', 118307), ('लाई', 97504), ('र', 97086), ('भए', 78608), ('गरे', 62055), ('पनि', 61618), ('बाट', 47160), ('छन्', 44599), ('भने', 43731), ('उन', 38068), ('पछि', 34132), ('जना', 32026), ('कोरोना', 30723), ('गर्ने', 29194), ('गर्न', 29012), ('नेपाल', 28924), ('सरकार', 28712), ('हो', 28696), ('लागि', 28022), ('थियो', 27986), ('बताए', 25958), ('सँग', 24939), ('रहे', 22676), ('तर', 21205), ('काठमाडौं', 20913), ('सम्म', 20786), ('तथा', 20768), ('यो', 20537), ('एक', 20387), ('प्रहरी', 19888), ('उनी', 19843), ('नै', 19329), ('दिए', 19163), ('यस', 18618), ('आए', 18368), ('अनुसार', 18278), ('हुने', 17439), ('गरिए', 17165), ('अस्पताल', 17031), ('संक्रमण', 16745), ('थिए', 16371), ('वर्ष', 15877), ('काम', 15818), ('प्रदेश', 15291), ('दिन', 15075), ('हजार', 14773), ('संक्रमित', 14456), ('मात्र', 14356), ('ओली', 14164), ('अहि', 14117), ('प्रधानमन्त्री', 13780), ('जिल्ला', 13638), ('भन्ने', 13377), ('पार्टी', 13238), ('देखि', 13203), ('सबै', 