In [1]:
import torch
from transformers import BertModel, BertTokenizerFast
import pandas as pd
import torch.nn.functional as F
from hazm import *

# Model Config

In [2]:
# Load the LaBSE tokenizer and encoder by checkpoint name.
tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
# eval() puts the module in inference mode and returns the module itself,
# so chaining it binds `model` to the same loaded encoder.
model = BertModel.from_pretrained("setu4993/LaBSE").eval()

In [3]:
def PersianSentenceEmbedding(sentence):
    """Encode ``sentence`` with the module-level ``tokenizer`` and ``model``.

    The text is tokenized into PyTorch tensors (padding enabled, truncated to
    at most 100 tokens, special tokens added), fed through ``model`` with
    gradient tracking disabled, and the output's ``pooler_output`` is returned.

    Args:
        sentence: Text handed directly to ``tokenizer``.

    Returns:
        The ``pooler_output`` attribute of the model output.
        NOTE(review): presumably one row per input sentence -- confirm.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=100,
        add_special_tokens=True,
    )
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        return model(**encoded).pooler_output

In [4]:
def similarity(embeddings_1, embeddings_2):
    """Return the pairwise cosine similarities between two embedding batches.

    Both inputs are L2-normalized along dimension 1, then the first is
    matrix-multiplied with the second after its dimensions 0 and 1 are swapped.
    For 2-D inputs, entry ``[i, j]`` of the result is the cosine similarity
    between row ``i`` of ``embeddings_1`` and row ``j`` of ``embeddings_2``.

    Args:
        embeddings_1: Tensor of embeddings, one per row.
        embeddings_2: Tensor of embeddings, one per row.

    Returns:
        Tensor holding the product of the normalized inputs.
    """
    unit_1 = F.normalize(embeddings_1, p=2, dim=1)
    unit_2 = F.normalize(embeddings_2, p=2, dim=1)
    return unit_1 @ unit_2.transpose(0, 1)

# Data Preprocessing

In [5]:
# Load the question/answer table. A forward slash is used as the separator:
# it works on every operating system (Windows included) and avoids the invalid
# backslash-Q escape sequence that the previous literal contained.
df = pd.read_csv("Q_A/Q_A.csv")

In [6]:
def PreProcessing_Embedding(pureDataFrame):
    """Return a copy of ``pureDataFrame`` with an added ``"Embeddings"`` column.

    Every value of the ``"question"`` column is passed to
    ``PersianSentenceEmbedding`` and the result is stored in the new column.
    All work happens on a copy, so the frame passed in is not modified.

    NOTE(review): ``BestAnswer`` normalizes the query with hazm's
    ``Normalizer`` before embedding, while the questions here are embedded
    as-is -- confirm that this asymmetry is intended.

    Args:
        pureDataFrame: Frame that contains a ``"question"`` column.

    Returns:
        The copied frame including the ``"Embeddings"`` column.
    """
    embedded_df = pureDataFrame.copy()
    embedded_df["Embeddings"] = embedded_df["question"].apply(PersianSentenceEmbedding)
    return embedded_df

In [12]:
def BestAnswer(question, QuestionsDataframe):
    """Return the column-1 value of the row most similar to ``question``.

    The incoming text is normalized with hazm's ``Normalizer`` and embedded via
    ``PersianSentenceEmbedding``. Each row's ``"Embeddings"`` value is scored
    against it with ``similarity``; the first row holding the highest score
    wins, and the value at column position 1 of that row is returned.

    Args:
        question: Raw question text.
        QuestionsDataframe: Frame with an ``"Embeddings"`` column.
            NOTE(review): presumably column position 1 holds the answer
            text -- verify against the CSV layout.

    Returns:
        ``QuestionsDataframe.iloc[best_row, 1]`` for the best-scoring row.
    """
    query_embedding = PersianSentenceEmbedding(Normalizer().normalize(question))
    # One similarity score per stored row, kept in row order.
    scores = [
        similarity(record["Embeddings"], query_embedding)
        for _, record in QuestionsDataframe.iterrows()
    ]
    # index() returns the first position of the maximum, so ties resolve to
    # the earliest row.
    best_row = scores.index(max(scores))
    return QuestionsDataframe.iloc[best_row, 1]

In [7]:
# Build the working frame: a copy of `df` with an "Embeddings" column added
# (one embedding per value of the "question" column).
CleanDF = PreProcessing_Embedding(df)

In [11]:
# Read the user's question from standard input.
# The Persian prompt text means "Enter the new question:".
UserInput = input("سوال جدید را وارد کنید:")

In [13]:
# Look up the stored entry that best matches the user's question; as the last
# expression of the cell, the returned value is displayed as the cell output.
BestAnswer(UserInput,CleanDF)

'با سلام، کاربر گرامی ارتقا نوع کاربری تا 24 ساعت بررسی و تایید خواهد شد. لطفا صبور باشید.'