In [2]:
# Install required packages
# !pip3 -q install datasets
# !pip3 -q install transformers
# !pip3 -q install sentencepiece
# !pip3 -q install hazm
# !pip3 -q install clean-text[gpl]
# !pip3 install faiss-cpu
# !pip3 install torch torchvision torchaudio

In [12]:
# libraries
import codecs
import os
import json
import tqdm
from collections import Counter
from nltk import FreqDist
import pandas as pd
import numpy as np
import itertools
from hazm import *
import torch
from transformers import BigBirdModel, AutoTokenizer

## load the pretrained model for the first time

In [10]:
# Hugging Face hub id of the checkpoint used for all embeddings in this notebook.
MODEL_NAME = "SajjadAyoubi/distil-bigbird-fa-zwnj"
# Load the encoder exactly once, with attention_type="original_full".
# The earlier version first loaded a copy with block_size=32 and then immediately
# overwrote it with this one, so the first load only cost time and memory.
model = BigBirdModel.from_pretrained(MODEL_NAME, attention_type="original_full")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(IntProgress(value=0, description='Downloading', max=314339179, style=ProgressStyle(description_…

Some weights of the model checkpoint at SajjadAyoubi/distil-bigbird-fa-zwnj were not used when initializing BigBirdModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdModel were not initialized from the model checkpoint at SajjadAyoubi/distil-bigbird-fa-zwnj and are newly initialized: ['bert.pooler.

HBox(children=(IntProgress(value=0, description='Downloading', max=365, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Downloading', max=426422, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='Downloading', max=112, style=ProgressStyle(description_width=…

In [11]:
# Write the loaded model to a local directory under ../models.
model.save_pretrained('../models/pretrained-transformer-model.model')

# Write the tokenizer files to their own directory; the returned tuple of
# written file paths is what this cell displays.
tokenizer.save_pretrained('../models/pretrained-transformer-tokenizer')

('../models/pretrained-transformer-tokenizer\\tokenizer_config.json',
 '../models/pretrained-transformer-tokenizer\\special_tokens_map.json',
 '../models/pretrained-transformer-tokenizer\\vocab.txt',
 '../models/pretrained-transformer-tokenizer\\added_tokens.json',
 '../models/pretrained-transformer-tokenizer\\tokenizer.json')

## load pretrained model and tokenizer

In [13]:
# Load the encoder and tokenizer from the local directories under ../models.
model = BigBirdModel.from_pretrained('../models/pretrained-transformer-model.model')
tokenizer = AutoTokenizer.from_pretrained('../models/pretrained-transformer-tokenizer')

# vectorize docs type1

## join paragraphs up to 300 words

In [15]:
import json

source_names = ['hidoctor', 'namnak']
threshold = 300


def _merge_paragraphs(paragraphs, threshold):
    """Greedily join consecutive paragraphs into chunks of fewer than `threshold` words.

    Args:
        paragraphs: list of paragraph strings of one document (may be empty).
        threshold: word budget; a paragraph is appended to the running chunk
            only while the combined word count stays below this value.

    Returns:
        List of chunk strings. An empty input yields an empty list.
    """
    merged = []
    if not paragraphs:
        # Nothing to merge. The running chunk is local to this call, so a
        # document without paragraphs can no longer inherit the previous
        # document's trailing chunk.
        return merged

    current = paragraphs[0]
    current_len = len(current.split())
    for chunk in paragraphs[1:]:
        chunk_words = chunk.split()
        chunk_len = len(chunk_words)
        if current_len + chunk_len < threshold:
            current += ' ' + chunk
            current_len += chunk_len
        elif len(current) > 10:
            # The running chunk has real content (more than 10 characters): flush it.
            merged.append(current)
            current = chunk
            current_len = chunk_len
        else:
            # The running chunk is only a stub: glue it to the first `threshold`
            # words of the oversized paragraph. A space separator is required here,
            # otherwise the stub and the first word are fused into one token.
            head = ' '.join(chunk_words[:threshold])
            merged.append((current + ' ' + head).strip())
            # NOTE(review): as before, at most the next 299 words are carried over;
            # anything beyond threshold + 299 words of this paragraph is dropped.
            current = ' '.join(chunk_words[threshold:threshold + 299])
            current_len = len(current.split())
    if len(current) > 3:
        merged.append(current)
    return merged


# The output directory is needed by every iteration, so create it once up front.
os.makedirs('data-edited', exist_ok=True)

for source in source_names:
    for i in range(1, 8):
        with open(f'../old-dataset/{source}-{i}.json', 'r', encoding="utf-8") as j:
            contents = json.load(j)

        data = []
        for cont in contents:
            out = []
            # The abstract, when present, is kept as its own leading paragraph.
            if 'abstract' in cont:
                out.append(cont['abstract'])
            out.extend(_merge_paragraphs(cont['paragraphs'], threshold))
            data.append({'title': cont['title'], 'paragraphs': out})

        with open(f'data-edited/{source}-p{i}.json', 'w', encoding="utf-8") as f:
            json.dump(data, f)

In [16]:
DIRNAME = 'data-edited'

# Concatenate the per-page files into one list: all namnak pages first,
# then all hidoctor pages (this order defines the document indices later on).
data = []
for site in ('namnak', 'hidoctor'):
    for page in range(1, 8):
        with open(f'{DIRNAME}/{site}-p{page}.json', 'r', encoding="utf-8") as page_file:
            data.extend(json.load(page_file))

with open('data-edited/data-all-new.json', 'w', encoding="utf-8") as merged_file:
    json.dump(data, merged_file)

## load modified data

In [17]:
# Read the merged corpus from data-edited/data-all-new.json.
data = []
with open('data-edited/data-all-new.json', 'r', encoding="utf-8") as corpus_file:
    data = json.load(corpus_file)

## normalization

In [18]:
# normalization
from hazm import *

normalizer = Normalizer()

# Run the hazm normalizer over every paragraph; titles are copied through unchanged.
normalized_data = []
for entry in tqdm.tqdm(data):
    cleaned_paragraphs = list(map(normalizer.normalize, entry['paragraphs']))
    normalized_data.append({"title": entry['title'], "paragraphs": cleaned_paragraphs})

100%|█████████████████████████████████████████████████████████████████████████████| 4322/4322 [00:06<00:00, 656.44it/s]


## stopwords

In [19]:
# Read both stop-word lists (one entry per line) and normalize each entry with the
# same normalizer that was applied to the paragraphs.
# The files are opened in `with` blocks so the handles are closed; the previous
# `codecs.open(...).readlines()` one-liners never closed them.
with codecs.open('stopwords.txt', 'r', 'utf-8') as stop_file:
    stopwords = [normalizer.normalize(line.strip()) for line in stop_file.readlines()]
with codecs.open('custom_stopwords.txt', 'r', 'utf-8') as custom_file:
    custom_stop_words = [normalizer.normalize(line.strip()) for line in custom_file.readlines()]
total_stop_words = custom_stop_words + stopwords

In [20]:
# Membership is tested once per token, so look tokens up in a set instead of
# scanning the stop-word list each time. The filtering result is unchanged.
stop_word_set = set(total_stop_words)

# Tokenize each paragraph, drop stop words, and re-join the remaining tokens with spaces.
data_without_stopwords = []
for item in normalized_data:
    filtered_paragraphs = [
        ' '.join(token for token in word_tokenize(paragraph) if token not in stop_word_set)
        for paragraph in item['paragraphs']
    ]
    data_without_stopwords.append({"title": item['title'], "paragraphs": filtered_paragraphs})

In [21]:
vectors = []

# Inference only: no_grad stops autograd from recording a graph for each forward
# pass, which saves memory and time and makes `.detach()` unnecessary.
# NOTE(review): inputs are not truncated; the recorded run warned about a
# 978-token input against the tokenizer's 512 limit. Confirm that the model's
# position limit covers the longest paragraph.
with torch.no_grad():
    for i, document in enumerate(data_without_stopwords):
        # Element [0] of the model output is indexed as (batch, tokens, hidden);
        # drop the batch axis and mean-pool over tokens.
        title_states = model(**tokenizer(document['title'], return_tensors='pt'))[0].squeeze(0)
        title_encoded = title_states.mean(dim=0)

        averaged_vectors = [
            model(**tokenizer(doc, return_tensors='pt'))[0].squeeze(0).mean(dim=0)
            for doc in document['paragraphs']
        ]
        if averaged_vectors:
            paragraphs_averaged = torch.stack(averaged_vectors).mean(dim=0)
        else:
            # torch.stack raises on an empty list. Fall back to the title vector so
            # the document keeps its slot and indices stay aligned across files.
            paragraphs_averaged = title_encoded

        vectors.append({'index': i, 'title': document['title'],
                        'vector': [title_encoded.numpy().tolist(), paragraphs_averaged.numpy().tolist()]})

Token indices sequence length is longer than the specified maximum sequence length for this model (978 > 512). Running this sequence through the model will result in indexing errors


In [22]:
# Store the type-1 title/paragraph vectors as JSON under ../models.
with open('../models/transformer_vectors-new.json', 'w', encoding="utf-8") as vectors_file:
    json.dump(vectors, vectors_file)

# vectorize docs type2

## join paragraphs up to 300 words

In [39]:
DIRNAME = '../dataset'
threshold = 300

# Load all pages: namnak first, then hidoctor (this order defines the document indices).
data = []
for i in range(1, 8):
    with open(f'{DIRNAME}/namnak-p{i}.json', 'r', encoding="utf-8") as f:
        data += json.load(f)
for i in range(1, 8):
    with open(f'{DIRNAME}/hidoctor-p{i}.json', 'r', encoding="utf-8") as f:
        data += json.load(f)

# Split each document's text on '. ' and greedily pack the sentences into chunks
# of fewer than `threshold` words. Also record index -> link for later lookup.
data_paragraphed = []
link_data = {}
for i, doc in enumerate(data):
    out = []
    current = ''
    current_len = 0
    for sent in doc['text'].split('. '):
        sent_len = len(sent.split())
        if current_len + sent_len < threshold:
            # Join with a single space, but never start a chunk with one.
            current = f'{current} {sent}' if current else sent
            current_len += sent_len
        else:
            # Flush only real content. Previously an empty string was emitted as a
            # paragraph whenever the very first sentence alone reached the threshold.
            if current.strip():
                out.append(current)
            current = sent
            current_len = sent_len
    # A trailing chunk of 10 characters or fewer is discarded.
    if len(current) > 10:
        out.append(current)
    item = {'title': doc['title'], 'paragraphs': out}
    data_paragraphed.append(item)
    link_data[i] = doc['link']

with open('data-edited/data-all2.json', 'w', encoding="utf-8") as f:
    json.dump(data_paragraphed, f)
with open('../models/transformers-link-docs-data.json', 'w', encoding="utf-8") as f:
    json.dump(link_data, f)

## load modified data

In [24]:
# Read the sentence-chunked corpus from data-edited/data-all2.json.
data = []
with open('data-edited/data-all2.json', 'r', encoding="utf-8") as corpus_file:
    data = json.load(corpus_file)

## normalization

In [25]:
# normalization
from hazm import *

normalizer = Normalizer()

# Normalize every paragraph with hazm; the title of each record is kept as-is.
normalized_data = []
for record in tqdm.tqdm(data):
    normalized_paragraphs = list(map(normalizer.normalize, record['paragraphs']))
    normalized_data.append({"title": record['title'], "paragraphs": normalized_paragraphs})

100%|█████████████████████████████████████████████████████████████████████████████| 4322/4322 [00:06<00:00, 627.84it/s]


## stopwords

In [26]:
# Read the general and the custom stop-word lists (one entry per line), normalizing
# each entry with the current normalizer.
# `with` closes the handles; the previous `codecs.open(...).readlines()` calls left them open.
with codecs.open('stopwords.txt', 'r', 'utf-8') as stop_file:
    stopwords = [normalizer.normalize(line.strip()) for line in stop_file.readlines()]
with codecs.open('custom_stopwords.txt', 'r', 'utf-8') as custom_file:
    custom_stop_words = [normalizer.normalize(line.strip()) for line in custom_file.readlines()]
total_stop_words = custom_stop_words + stopwords

In [27]:
# Use a set for the per-token membership test instead of scanning the stop-word
# list for every token. The filtered output is identical.
stop_word_set = set(total_stop_words)

# Tokenize each paragraph, discard stop words, and join the kept tokens with spaces.
data_without_stopwords = []
for item in normalized_data:
    filtered_paragraphs = [
        ' '.join(token for token in word_tokenize(paragraph) if token not in stop_word_set)
        for paragraph in item['paragraphs']
    ]
    data_without_stopwords.append({"title": item['title'], "paragraphs": filtered_paragraphs})

In [28]:
vectors1 = []

# Inference only: run under no_grad so no autograd graph is recorded for each
# forward pass (saves memory and time, and `.detach()` is no longer needed).
with torch.no_grad():
    for i, document in enumerate(data_without_stopwords):
        # Element [0] of the model output is indexed as (batch, tokens, hidden);
        # drop the batch axis and mean-pool over tokens.
        title_states = model(**tokenizer(document['title'], return_tensors='pt'))[0].squeeze(0)
        title_encoded = title_states.mean(dim=0)

        averaged_vectors = [
            model(**tokenizer(doc, return_tensors='pt'))[0].squeeze(0).mean(dim=0)
            for doc in document['paragraphs']
        ]
        if averaged_vectors:
            paragraphs_averaged = torch.stack(averaged_vectors).mean(dim=0)
        else:
            # torch.stack raises on an empty list. Fall back to the title vector so
            # every document keeps its slot and indices stay aligned across files.
            paragraphs_averaged = title_encoded

        vectors1.append({'index': i, 'title': document['title'],
                         'vector': [title_encoded.numpy().tolist(), paragraphs_averaged.numpy().tolist()]})

In [29]:
# Store the type-2 title/paragraph vectors as JSON under ../models.
with open('../models/transformer_vectors-new1.json', 'w', encoding="utf-8") as vectors_file:
    json.dump(vectors1, vectors_file)

## mix two vectors

In [30]:
DIRNAME = '../dataset'

# Reload the raw documents in the same order used for indexing:
# all namnak pages first, then all hidoctor pages.
data = []
for site in ('namnak', 'hidoctor'):
    for page in range(1, 8):
        with open(f'{DIRNAME}/{site}-p{page}.json', 'r', encoding="utf-8") as page_file:
            data.extend(json.load(page_file))

# Positions of documents whose text is shorter than 100 characters.
useless_docs = [position for position, doc in enumerate(data) if len(doc['text']) < 100]

In [31]:
# Read back both sets of stored vectors: type 1 into `emb`, type 2 into `emb1`.
with open('../models/transformer_vectors-new.json', 'r', encoding="utf-8") as type1_file:
    emb = json.load(type1_file)
with open('../models/transformer_vectors-new1.json', 'r', encoding="utf-8") as type2_file:
    emb1 = json.load(type2_file)

In [32]:
def _blend_title_and_body(pair):
    """Combine a [title_vector, paragraphs_vector] pair as (1*title + 9*paragraphs) / 10."""
    title_vec, body_vec = pair[0], pair[1]
    return (np.array(title_vec)*1 + np.array(body_vec)*9)/10


# Mix the two embedding sets per document: (3.5 * type-1 blend + 0.5 * type-2 blend) / 4.
# `emb` and `emb1` are matched by list position.
new_emb = []
for position, primary in enumerate(emb):
    secondary = emb1[position]
    mixed = (_blend_title_and_body(primary['vector'])*3.5 +
             _blend_title_and_body(secondary['vector'])*0.5)/4
    new_emb.append({'index': primary['index'], 'title': primary['title'], 'vector': mixed.tolist()})

In [33]:
# Write the mixed vectors to ../models/transformer_vectors-new-mixed.json.
with open('../models/transformer_vectors-new-mixed.json', 'w', encoding="utf-8") as mixed_file:
    json.dump(new_emb, mixed_file)

In [34]:
# Drop the embeddings of documents flagged in `useless_docs`.
# Build a new list instead of calling `new_emb.remove(...)` inside a loop over
# `new_emb`: removing during iteration shifts the remaining items left, so the
# element right after each removed one was skipped and consecutive useless
# documents survived. The set makes each index lookup constant-time.
useless_index_set = set(useless_docs)
new_emb = [doc for doc in new_emb if doc['index'] not in useless_index_set]

In [35]:
# Write the current contents of `new_emb` to ../models/transformer_vectors-new-mixed.json,
# replacing whatever that file held before.
with open('../models/transformer_vectors-new-mixed.json', 'w', encoding="utf-8") as mixed_file:
    json.dump(new_emb, mixed_file)

## Transformer model

In [64]:
from transformers import BigBirdModel, AutoTokenizer

class TransformerEmb:
    """Embedding-based document search over the precomputed mixed document vectors.

    Loads the locally saved BigBird encoder and tokenizer, the index -> link map and
    the stored document vectors, then ranks documents by cosine similarity between
    a mean-pooled query embedding and each document vector.
    """

    def __init__(self):
        """Load the model, tokenizer, link map, document vectors and stop-word lists from disk."""
        self.model = BigBirdModel.from_pretrained('../models/pretrained-transformer-model.model')
        self.tokenizer = AutoTokenizer.from_pretrained('../models/pretrained-transformer-tokenizer')
        with open('../models/transformers-link-docs-data.json', 'r', encoding="utf-8") as f:
            # Keys are strings after the JSON round trip; see nearest_neighbor.
            self.docs_links = json.load(f)
        with open('../models/transformer_vectors-new-mixed.json', 'r', encoding="utf-8") as f:
            self.docs_embs = json.load(f)
        self.normalizer = Normalizer()
        stopwords = self._read_stopwords('stopwords.txt')
        custom_stop_words = self._read_stopwords('custom_stopwords.txt')
        # NOTE(review): normalizer and total_stop_words are loaded but never applied
        # to queries in this class - confirm whether get_query should preprocess the
        # query the same way the document paragraphs were preprocessed.
        self.total_stop_words = custom_stop_words + stopwords

    def _read_stopwords(self, path):
        """Return the normalized, stripped lines of a UTF-8 stop-word file, closing the handle."""
        with codecs.open(path, 'r', 'utf-8') as handle:
            return [self.normalizer.normalize(line.strip()) for line in handle.readlines()]

    def print_similars(self, query, k=10):
        """Print the title and link of the `k` documents most similar to `query`."""
        for rank, (title, link) in enumerate(self.get_query(query, k), start=1):
            print(f'{rank}- title: {title}')
            print(f'{rank}- link: {link}')
            print('-------------------------')

    def get_query(self, query, k=10):
        """Embed `query` and return the top-`k` (title, link) pairs, best match first."""
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            # Element [0] of the model output is indexed as (batch, tokens, hidden).
            token_states = self.model(**self.tokenizer(query, return_tensors='pt'))[0].squeeze(0)
        encoded_query = token_states.mean(dim=0).numpy()
        return self.nearest_neighbor(encoded_query, self.docs_embs, k)

    def cosine_similarity(self, vector_1: np.ndarray, vector_2: np.ndarray) -> float:
        """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
        denominator = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
        if denominator == 0:
            # Avoid 0/0 -> nan, which would make the ranking order unreliable.
            return 0.0
        return float(np.dot(vector_1, vector_2) / denominator)

    def nearest_neighbor(self, v, doc_embs, k):
        """Rank `doc_embs` by cosine similarity to `v` and return the best `k`.

        Args:
            v: query vector.
            doc_embs: iterable of dicts with 'title', 'index' and 'vector' keys.
            k: maximum number of results.

        Returns:
            List of (title, link) tuples, most similar first. Documents sharing a
            title are kept as separate results.
        """
        # A list of tuples, not a dict keyed by title: with a dict, documents that
        # share a title overwrote one another and only the last one was ranked.
        scored = [
            (self.cosine_similarity(v, np.array(doc['vector'])),
             doc['title'],
             self.docs_links[str(doc['index'])])
            for doc in doc_embs
        ]
        scored.sort(key=lambda entry: entry[0], reverse=True)
        return [(title, link) for _, title, link in scored[:k]]

In [65]:
# Build the search object; its constructor reads the saved model, tokenizer,
# link map and document vectors from disk.
transformer = TransformerEmb()

In [66]:
# Example query: print the top matches (default k=10) for a Persian search phrase.
transformer.print_similars('ریزش مو')

1- title: روش جدید برای درمان ریزش مو و کچلی
1- link: https://namnak.com/درمان-کچلی.p31230
-------------------------
2- title: دلایل ریزش مو در زنان
2- link: https://namnak.com/ریزش-مو.p65282
-------------------------
3- title: شایعترین عوامل موثر در قد کودکان
3- link: https://namnak.com/قد-کودکان.p60301
-------------------------
4- title: چرا نگهداری دندان شیری مهم است و فواید ان چیست؟
4- link: https://namnak.com/نگهداری-دندان-شیری.p58074
-------------------------
5- title: 5 علامت نگران کننده در مورد سلامت بدن
5- link: https://namnak.com/health-problems-symptoms.p59240
-------------------------
6- title: وقتی موهای بدن هشدار می دهند !
6- link: https://namnak.com/body-hair-says-about-health.p77001
-------------------------
7- title: چهار تا از بهترین شامپو ها برای جلوگیری از ریزش مو
7- link: https://www.hidoctor.ir/354908_%da%86%d9%87%d8%a7%d8%b1-%d8%aa%d8%a7-%d8%a7%d8%b2-%d8%a8%d9%87%d8%aa%d8%b1%db%8c%d9%86-%d8%b4%d8%a7%d9%85%d9%be%d9%88-%d9%87%d8%a7-%d8%a8%d8%b1%d8%a7%db%8c-%d8%ac%d