## Реализация данной статьи: https://habr.com/ru/articles/769124/

In [None]:
! pip install peft

In [1]:
! pip install langchain

Collecting langchain
  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/23/9f/a78357793c96ae5b53b5a31a891ed2fe3b02dc1a11a705dd14da67932c42/langchain-0.1.4-py3-none-any.whl.metadata
  Downloading langchain-0.1.4-py3-none-any.whl.metadata (13 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Obtaining dependency information for jsonpatch<2.0,>=1.33 from https://files.pythonhosted.org/packages/73/07/02e16ed01e04a374e644b575638ec7987ae846d25ad97bcc9945a3ee4b0e/jsonpatch-1.33-py2.py3-none-any.whl.metadata
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-community<0.1,>=0.0.14 (from langchain)
  Obtaining dependency information for langchain-community<0.1,>=0.0.14 from https://files.pythonhosted.org/packages/57/00/a798f8124db57eb9e20fe31dc7561e15e9c4607281cddaa4db49f93d7111/langchain_community-0.0.16-py3-none-any.whl.metadata
  Downloading langchain_community-0.0.16-py3-none-any.whl.metadata (7.8 

In [2]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from langchain.prompts import PromptTemplate

# Hugging Face Hub ids: the checkpoint loaded through peft below and the
# checkpoint whose tokenizer is used.
adapt_model_name = "IlyaGusev/saiga_mistral_7b_lora"
base_model_name = "Open-Orca/Mistral-7B-OpenOrca"

# NOTE(review): trust_remote_code=True allows code shipped in the Hub repository
# to run during loading — confirm it is actually needed for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
              base_model_name,
              trust_remote_code=True)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): presumably places the whole model on device index 0 — confirm.
device_map = {"": 0}
# Load the model with bfloat16 weights using the device map above.
model = AutoPeftModelForCausalLM.from_pretrained(
              adapt_model_name,
              device_map=device_map,
              torch_dtype=torch.bfloat16)

# NOTE(review): looks redundant given device_map above — confirm before removing.
model = model.to("cuda")

ModuleNotFoundError: No module named 'peft'

In [None]:
# Tokenizer and encoder used for sentence embeddings, both fetched from the
# HuggingFace Hub under the same model id.
sent_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
sent_tokenizer = AutoTokenizer.from_pretrained(sent_model_name)
sent_model = AutoModel.from_pretrained(sent_model_name)

In [None]:
def get_embedding(sentence):
    """Embed one sentence with ``sent_tokenizer`` / ``sent_model``.

    The token embeddings are mean-pooled with the attention mask as weights
    (so padding positions do not contribute) and the pooled vector is then
    L2-normalised.

    Args:
        sentence: Text to embed; it is wrapped in a one-element batch.

    Returns:
        A 2-D tensor with one L2-normalised row per batch element.
    """
    batch = sent_tokenizer([sentence], padding=True, truncation=True, return_tensors='pt')

    # Forward pass only; no gradients are needed.
    with torch.no_grad():
        # The first element of the model output holds the per-token embeddings.
        token_vectors = sent_model(**batch)[0]

    # Broadcast the mask over the embedding axis so it can weight every component.
    weights = batch['attention_mask'].unsqueeze(-1).expand(token_vectors.size()).float()
    weighted_sum = (token_vectors * weights).sum(dim=1)
    # Clamp the token count so an all-zero mask cannot divide by zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)

    return F.normalize(weighted_sum / token_count, p=2, dim=1)

In [None]:
# Answer cache: answers[i] is the stored answer for row i of emb_database.
answers = list()
# Starts with zero rows; 384 columns presumably match the sentence model's
# embedding width — TODO confirm.
emb_database = torch.empty(0, 384, dtype=torch.float32)

In [None]:
# Read the prompt file as a list of lines (line endings are kept).
# The encoding is given explicitly so decoding does not depend on the locale.
with open("/kaggle/input/zulip-data/prompts.txt", 'r', encoding="utf-8") as f:
    text = f.readlines()

In [None]:
# Each line already ends with its own newline, so join without a separator;
# print(*text) would insert a space at the start of every line after the first.
print("".join(text))

In [None]:
import pandas as pd

In [None]:
# Questions and answers are stored on two sheets of the same workbook; load each
# as a single named column and place the columns side by side.
messages_path = "/kaggle/input/zulip-xlsx/messages.xlsx"
text_q = pd.read_excel(messages_path, sheet_name="wekan_q", names=['questions'])
text_ans = pd.read_excel(messages_path, sheet_name="wekan_ans", names=['answer'])
text_q_ans = pd.concat([text_q, text_ans], axis=1)
text_q_ans.head()

In [None]:
# Append each answer directly to its question (no separator is inserted).
text_q_ans['q_ans'] = text_q_ans['questions'].add(text_q_ans['answer'])
text_q_ans['q_ans']

In [None]:
def f(x):
    """Join an iterable of strings into one string separated by single spaces."""
    return ' '.join(x)


# The joined examples become part of a format-style template, so literal braces
# in the data must be doubled; otherwise they are parsed as placeholders and
# formatting the template with only `question` fails.
few_shot_examples = f(text_q_ans['q_ans']).replace("{", "{{").replace("}", "}}")
text = few_shot_examples + "{question}\nbot: Вот ответ на ваш вопрос длиной не более 10 слов"

In [None]:
# Wrap the assembled prompt text in a template; it is later formatted with `question`.
info_prompt_less10 = PromptTemplate.from_template(template=text)

In [None]:
# Phrase that ends the prompt template; the model's reply follows it.
_ANSWER_MARKER = "Вот ответ на ваш вопрос длиной не более 10 слов"


def _extract_answer(decoded, prompt):
    """Cut the model's reply out of the decoded prompt + completion text."""
    _, found, reply = decoded.partition(_ANSWER_MARKER)
    if not found:
        # Template without the marker: drop the echoed prompt when possible.
        reply = decoded[len(prompt):] if decoded.startswith(prompt) else decoded

    reply = reply.strip()
    # The model may or may not open its reply with the colon that follows the marker.
    if reply.startswith(":"):
        reply = reply[1:].strip()

    # Keep only the first reply if the model goes on to invent further turns.
    reply = reply.split(_ANSWER_MARKER)[0]
    if "bot:" in reply:
        reply = reply.split("bot:")[0]
    return reply.strip()


def get_answer(info_prompt, question):
    """Generate a short answer to ``question`` with the global ``model``.

    Args:
        info_prompt: Template object whose ``format(question=...)`` returns the
            full prompt text.
        question: User question inserted into the template.

    Returns:
        The reply text with the echoed prompt, an optional leading colon and any
        text from a following "bot:" turn removed.

    Sampling is enabled (``do_sample=True``), so repeated calls may differ.
    Inputs are moved to "cuda", so a CUDA device is required.
    """
    prompt = info_prompt.format(question=question)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(input_ids=inputs["input_ids"],
                             attention_mask=inputs["attention_mask"],
                             top_p=0.5,
                             temperature=0.3,
                             max_new_tokens=50,
                             pad_token_id=tokenizer.eos_token_id,
                             do_sample=True)

    output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return _extract_answer(output, prompt)

In [None]:
# Sample user question and its embedding; both globals are used by later cells.
question = "Не зачтены часы в карточке в wekan" 
emb = get_embedding(question)

In [None]:
def get_cos_sim(question):
    """Cosine similarity between ``question`` and every cached embedding.

    The question is embedded here rather than read from a global, so the result
    always corresponds to the argument that was passed in.

    Args:
        question: Text to compare against the rows of the global ``emb_database``.

    Returns:
        A 1-D tensor with one similarity value per row of ``emb_database``.
    """
    question_emb = get_embedding(question)
    return F.cosine_similarity(emb_database, question_emb, dim=1, eps=1e-8)
  
# Similarities of the sample question to the cached embeddings. When the cells
# are run top to bottom the cache is still empty here, so the result has no entries.
get_cos_sim(question)

In [None]:
# Ask the LLM about the sample question, then store its embedding and answer
# in the cache. Re-running this cell appends the same pair again.
answer = get_answer(info_prompt_less10, question)
emb_database = torch.cat([emb_database, emb], dim=0)
answers.append(answer)
print(f'Answer from model: {answer}')

In [None]:
# Test questions for the cache-or-generate loop below.
questions = [
    "Не зачтены часы в карточке в wekan",
    "Не  отображается доска в wekan",
    "Не зачисляются часы в карточке",
    "Руководитель не может проставить часы",
    "Не найдена доска в wekan",
    "Можно ли удалить неверно зачтенную карточку?",
    "Не получается авторизоваться в wekan",
]

In [None]:
# A cached answer is reused when the best cosine similarity exceeds this value.
SIMILARITY_THRESHOLD = 0.9

# For every question: reuse the cached answer of the most similar earlier
# question, or ask the LLM and add the new (embedding, answer) pair to the cache.
for q in questions:
    print(q)
    emb = get_embedding(q)

    cached_index = None
    # torch.max over a zero-length dimension raises, so skip the lookup
    # while the cache is still empty.
    if emb_database.shape[0] > 0:
        cos_sim = get_cos_sim(q)
        max_value, max_index = torch.max(cos_sim, dim=0)
        if max_value.item() > SIMILARITY_THRESHOLD:
            # Convert the 0-d index tensor to a plain int before indexing the list.
            cached_index = int(max_index)

    if cached_index is not None:
        answer = answers[cached_index]
        print(f'DATABASE: {answer}')
    else:
        answer = get_answer(info_prompt_less10, q)
        emb_database = torch.cat((emb_database, emb), 0)
        answers.append(answer)
        print(f'MODEL: {answer}')
    print()