# 🟢 Data gather from Git Repo

In [8]:
import requests
import json
from tqdm import tqdm

base_url = "https://github.com/language-ml/course-nlp-ir-1-text-exploring/raw/main/exploring-datasets/health/"  # Replace with your JSON URL
file_names = ['hidoctor-1.json', 'hidoctor-2.json', 'hidoctor-3.json', 'hidoctor-4.json', 'hidoctor-5.json',
              'namnak-1.json', 'namnak-2.json', 'namnak-3.json', 'namnak-4.json', 'namnak-5.json']
url_list = [base_url + file_name for file_name in file_names]
all_paragraphs = []

for url in tqdm(url_list):
    response = requests.get(url)
    data = json.loads(response.text)
    for dict_ in data:
        all_paragraphs += dict_['paragraphs']

100%|██████████| 10/10 [00:32<00:00,  3.29s/it]


# 🟢 No need for smoothing

# 🟢 pattern = r'\b' + str_ngram + r'\b' # it's so importatnt because of preventing sub simple words

سر همین ۱۴ ساعت برنامه الکی ران شد🟢 ..................

In [None]:
from hazm import sent_tokenize
from nltk import ngrams
import regex as re
from hazm import Normalizer
from hazm import word_tokenize
import pandas as pd


class Generator_ngram:
    
    def __init__(self, n):
        '''
        n for ngram
        '''
        self.n = n
    
    def __preprocess(self, all_paragraphs):
        # Text preprocessing
        
        # Extract sentences from paragraphs :
        sentences_list = []
        for paragraph in tqdm(all_paragraphs, desc = 'Sentences tokenization'):
            sentences_list += sent_tokenize(paragraph)
        
        # Normalize + remove extra denotations like :
        normalized_sentences_list = []
        normalizer = Normalizer()
        for sentence in tqdm(sentences_list, desc = 'Normalization'):
            normalized_sentence = re.sub('[:,،.<>/!@#$%~{}();»«…“”"؛؟◊♦–\*\+_\^]', ' ', sentence)
            normalized_sentence = normalizer.normalize(normalized_sentence)
            normalized_sentences_list.append(normalized_sentence)        
            
        # Extract tokens from normalized sentences
        self.all_tokens = [] # <s> and </s> tags are added
        for sentence in tqdm(normalized_sentences_list, desc = 'Tokenization'):
            self.all_tokens.append("<s>")
            temp_tokens = word_tokenize(sentence)
            # علامت های خیلی زیادی وجود داره و پیدا کردن همشون سخته فکر کنم تنها کلمه تک حرفی توی فارسی 'و' باشه 
            # پس به غیر از 'و' همه توکن های تک حرفی رو حذف کنیم
            for token in temp_tokens:
                if len(token) > 1 or token == 'و':
                    self.all_tokens.append(token)
            self.all_tokens.append("</s>")

        self.remaked_corpus = ' '.join(self.all_tokens) # use this for search and count
        
        
    def fit(self, input_corpus):
        '''
        input_corpus is a list(each item should be a string) of paragraphs.
        '''
        
        self.__preprocess(input_corpus)
        
        df_ngrams = pd.DataFrame(columns = ['ngram', 'count'])
        df_indx = -1
        for ngram in tqdm(set(ngrams(self.all_tokens, self.n)), desc = f'{self.n}_grams calculation'): # use set to ignore repeated ones
            if '<s>' not in ngram and '</s>' not in ngram:
                str_ngram = ' '.join(ngram)
                try :
                    pattern = r'\b' + str_ngram + r'\b' # it's so importatnt because of preventing sub simple words
                    count = len(re.findall(pattern, self.remaked_corpus))
                    if count > 0:
                        df_indx += 1
                        df_ngrams.loc[df_indx, 'ngram'] = str_ngram
                        df_ngrams.loc[df_indx, 'count'] = count
                except:
                    print("Error at :")
                    print(str_ngram)
        
        self.df_ngrams = df_ngrams   
        self.df_ngrams.sort_values(by = 'count', ascending = False, inplace = True)
            
    def set_pre_probs(self, ngram_file_name):
        '''
        set pre calculated ngrams from file.
        ngram_file is a .csv with "ngram", "count" as columns
        '''
        self.df_ngrams = pd.read_csv(ngram_file_name)
        self.df_ngrams.sort_values(by = 'count', ascending = False, inplace = True)
        
    
    def generate(self,input_text, top_n):
        '''
        input_text : user input text
        top_n : top n words to show
        '''
        input_text = re.sub('[:,،.<>/!@#$%~{}();»«…“”"؛؟◊♦–\*\+_\^]', ' ', input_text)
        normalizer = Normalizer(correct_spacing = False)
        input_text = normalizer.normalize(input_text)

        input_text_tokens = input_text.split()
        last_word = input_text_tokens[-1] # no worries with space --> split doesn't count last space
            
        last_incomplete = True
        if input_text[-1] == ' ': # So the last word is complete
            last_incomplete = False

        if not last_incomplete: #complete
            if (self.n -1 > len(input_text_tokens)):
                raise Exception(f"input text must be longer than {self.n} words")
        else: #incomplete
            if (self.n > len(input_text_tokens)):
                raise Exception(f"input text must be longer than {self.n} words")
                
                
        df_output = self.df_ngrams.copy()
        if not last_incomplete: # last complete
            if self.n == 1:
                df_output['ngram'] = df_output['ngram'].str.split().str[-1]
                return df_output.head(top_n).values
            else:
                input_ngram_list = input_text_tokens[-(self.n-1):]
                str_input_ngram = ' '.join(input_ngram_list)
                df_output = df_output[df_output['ngram'].str.startswith(str_input_ngram + " ")]
                df_output['ngram'] = df_output['ngram'].str.split().str[-1]
                return df_output.head(top_n).values
            
        else: # incomplete
            if self.n == 1:
                df_output = df_output[df_output['ngram'].str.startswith(last_word)]
                df_output['ngram'] = df_output['ngram'].str.split().str[-1]
                return df_output.head(top_n).values
            else:
                input_ngram_list = input_text_tokens[-self.n:] # no need to subtract by 1 because last incomplete
                str_input_ngram = ' '.join(input_ngram_list)
                df_output = df_output[df_output['ngram'].str.startswith(str_input_ngram)]
                df_output['ngram'] = df_output['ngram'].str.split().str[-1]
                return df_output.head(top_n).values

# 1 gram

In [None]:
model = Generator_ngram(1)
model.set_pre_probs('1_gram_probs.csv') # read from trained & saved ngrams
text =input()
model.generate(text, 5)

array([['مغز', 1395],
       ['مغزی', 565],
       ['مغزها', 35],
       ['مغزتان', 22],
       ['مغزهای', 11]], dtype=object)

# 2 gram

In [7]:
model = Generator_ngram(2)
model.set_pre_probs('2_gram_probs.csv') # read from trained & saved ngrams
text = input()
model.generate(text, 5)

array([['و', 94],
       ['بادام', 87],
       ['مادر', 77],
       ['گاو', 50],
       ['را', 46]], dtype=object)

In [None]:
model = Generator_ngram(2)
model.set_pre_probs('2_gram_probs.csv') # read from trained & saved ngrams
text = input()
model.generate(text, 5)

افزایش نرخ 


array([['مرگ', 4],
       ['بالایی', 3],
       ['سوخت', 3],
       ['بقای', 2],
       ['چاقی', 2]], dtype=object)

<h1>
اگه تونستی روی
<br>
n = 3 
<br>
هم ران بگیر
    </h1>

In [None]:
for n in [1,2,3]:
    model = Generator_ngram(n)
    model.fit(all_paragraphs)
    model.df_ngrams.to_csv(f'{n}_gram_probs.csv', index = False)

Sentences tokenization: 100%|██████████| 58995/58995 [00:00<00:00, 180921.81it/s]
Normalization: 100%|██████████| 95636/95636 [00:04<00:00, 20177.87it/s]
Tokenization: 100%|██████████| 95636/95636 [00:01<00:00, 65381.13it/s]
1_grams calculation: 100%|██████████| 39694/39694 [05:28<00:00, 120.80it/s]
Sentences tokenization: 100%|██████████| 58995/58995 [00:00<00:00, 183944.43it/s]
Normalization: 100%|██████████| 95636/95636 [00:04<00:00, 20053.69it/s]
Tokenization: 100%|██████████| 95636/95636 [00:01<00:00, 73008.99it/s]
2_grams calculation: 100%|██████████| 454110/454110 [1:56:23<00:00, 65.03it/s]
Sentences tokenization: 100%|██████████| 58995/58995 [00:00<00:00, 174801.19it/s]
Normalization: 100%|██████████| 95636/95636 [00:04<00:00, 19837.49it/s]
Tokenization: 100%|██████████| 95636/95636 [00:01<00:00, 72146.14it/s]
3_grams calculation: 100%|██████████| 1015452/1015452 [6:04:43<00:00, 46.40it/s] 
