In [None]:
!pip install -U -q langchain-community
!pip install -U -q langchain_core
!pip install -U -q gigachain-community
!pip install -U -q pypdf
!pip install -U -q chromadb
!pip install -U -q tiktoken
!pip install -U -q langchain_experimental
!pip install -U -q rank_bm25
!pip install -U -q rouge
!pip install -U -q transformers


In [None]:
# Fetch the cleaned source documents (requirements, risk1, risk2), the question
# sets (queries1-3, end_to_end) and the validation_generation.py helper module
# into the current working directory.
# NOTE(review): a later cell also imports `retrieval_modules` and
# `retriever_validation`, but neither file is downloaded here — confirm they are
# provided some other way, otherwise those imports fail on a fresh runtime.
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/main/dataset/clean/requirements.csv
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/main/dataset/clean/risk1.csv
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/main/dataset/clean/risk2.csv
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/luzanin/dataset/data_for_pipeline/queries1.csv
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/luzanin/dataset/data_for_pipeline/queries2.csv
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/luzanin/dataset/data_for_pipeline/queries3.csv
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/luzanin/dataset/data_for_pipeline/end_to_end.csv
!wget -q https://raw.githubusercontent.com/artyomrabosh/rag_for_sber/refs/heads/aandreev/validation_generation.py

In [None]:
import pandas as pd
import re

import torch

from tqdm import tqdm

import numpy as np

from IPython.display import clear_output

from langchain.document_loaders import PyPDFLoader
import langchain_core
from langchain_core.documents.base import Document
from langchain.vectorstores import Chroma
import langchain
from langchain.chat_models import gigachat
from langchain.schema import HumanMessage, SystemMessage
from langchain.chat_models.gigachat import GigaChat

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from sentence_transformers import SentenceTransformer

from rank_bm25 import BM25Okapi

import nltk
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords

# Download the NLTK resources in the same order as before, then load the
# Russian stop-word list.
for nltk_resource in ('punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)
russian_stopwords = stopwords.words("russian")

import chromadb
# Stand-alone Chroma client built with default settings.
# NOTE(review): it is not referenced again in the visible cells — confirm it is still needed.
chroma_client = chromadb.Client()

from google.colab import userdata
# The GigaChat credentials are read from the Colab secret named 'GIGACHAT'
# instead of being hard-coded in the notebook.
API_TOKEN = userdata.get('GIGACHAT')


from retrieval_modules import *
from retriever_validation import *
from validation_generation import *

from langchain.text_splitter import (RecursiveCharacterTextSplitter,
                                    SentenceTransformersTokenTextSplitter,
                                    TokenTextSplitter,
                                    NLTKTextSplitter,
                                    SpacyTextSplitter
                                    )


# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# from sentence_transformers import CrossEncoder

# reranker_model = CrossEncoder('DiTy/cross-encoder-russian-msmarco', max_length=512, device='cuda')

# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
# embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)


In [None]:
class Embedder_wrapper:
    """Adapter that exposes a model's ``encode`` method under embedding-style names.

    Every method encodes one text per ``model.encode`` call and returns the
    result(s) exactly as the model produced them, without any conversion.
    """

    def __init__(self, model):
        """Store ``model``; it must provide an ``encode(text)`` method."""
        self.model = model

    def embed_query(self, query):
        """Encode a single query and return the model's output unchanged."""
        return self.model.encode(query)

    def embed_documents(self, texts):
        """Encode each item of ``texts`` in order and return the results as a list."""
        encoded = []
        for text in texts:
            encoded.append(self.model.encode(text))
        return encoded

    def embed_docs_pc(self, docs):
        """Encode the ``page_content`` of each item of ``docs`` and return a list."""
        contents = (doc.page_content for doc in docs)
        return [self.model.encode(content) for content in contents]

In [None]:
# Load the multilingual E5 instruct model, move it to the selected device and
# wrap it so it can be passed as the embedding object for the vector store.
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct")
model = model.to(device)
embedder = Embedder_wrapper(model)

# Remove the model download/progress output from the cell.
clear_output(wait=True)


##### Скачиваем обработанные документы

In [None]:
# Read the three cleaned source documents (two risk files and the requirements
# file) in a fixed order.
document_1, document_2, document_3 = (
    pd.read_csv(csv_name)
    for csv_name in ('risk1.csv', 'risk2.csv', 'requirements.csv')
)

In [None]:
def _frame_to_documents(frame):
    """Turn every row of ``frame`` into a ``Document``.

    The row's ``content`` cell becomes the document text, and the
    ``Header_1`` / ``Header_2`` / ``Header_3`` cells are stored in the
    document metadata under the lowercase keys ``header_1`` .. ``header_3``.

    Args:
        frame: DataFrame with ``content``, ``Header_1``, ``Header_2`` and
            ``Header_3`` columns.

    Returns:
        A list of documents in row order.
    """
    documents = []
    for _, row in frame.iterrows():
        document = Document(row['content'])
        document.metadata = {
            'header_1': row['Header_1'],
            'header_2': row['Header_2'],
            'header_3': row['Header_3'],
        }
        documents.append(document)
    return documents


# One list per source file; the same conversion applies to all three.
docs_1 = _frame_to_documents(document_1)
docs_2 = _frame_to_documents(document_2)
docs_3 = _frame_to_documents(document_3)


In [None]:
# Single corpus: all documents from the three sources, in source order.
whole_doc = [*docs_1, *docs_2, *docs_3]

In [None]:
# This splitter and these parameters were chosen after validating the retriever.
# NOTE(review): SpacyTextSplitter is created with its default spaCy pipeline —
# confirm that pipeline is the intended one for the Russian source documents.
splitter = SpacyTextSplitter(chunk_overlap=100, chunk_size=1024)
splitted_docs = splitter.split_documents(whole_doc)

# The store is persisted under 'docs/', so without explicit ids every re-run of
# this cell (or of the whole notebook in the same runtime) would add all chunks
# again under fresh random ids and retrieval would start returning duplicates.
# Position-based ids make a re-run write to the same entries instead.
# Chunks persisted by earlier runs under other ids are not removed; delete the
# 'docs/' directory once if it already holds such data.
chunk_ids = [f'chunk-{position}' for position in range(len(splitted_docs))]

vectordb = Chroma.from_documents(
    documents=splitted_docs,
    embedding=embedder,
    ids=chunk_ids,
    persist_directory='docs/'
)

# Remove the indexing/progress output from the cell.
clear_output(wait=True)

##### Загружаем датасет с вопросами

In [None]:
# Question/answer sets: three query files plus the end-to-end set.
queries_1 = pd.read_csv('queries1.csv')
queries_2 = pd.read_csv('queries2.csv')
queries_3 = pd.read_csv('queries3.csv')
end_to_end = pd.read_csv('end_to_end.csv')

# Only the question and reference-answer columns are kept.
qa_columns = ['Вопрос', 'Ответ']

# ignore_index=True: every file is read with its own 0..n-1 index, so a plain
# concat would leave repeated index labels, which makes label-based access and
# index-aligned assignment on the combined frame ambiguous.
all_queries = pd.concat(
    [frame[qa_columns] for frame in (queries_1, queries_2, queries_3, end_to_end)],
    ignore_index=True,
)

### End-to-end Валидация

In [None]:
# Retriever configuration: the vector store plus the strategy name and fusion
# weight. NOTE(review): the meaning of 'ss' and fusion_alpha is defined by
# Retriever, which is not defined in this notebook — confirm against its source.
param = dict(db=vectordb, strategy='ss', fusion_alpha=0.6)
retriever = Retriever(**param)

In [None]:
# Generator LLM. NOTE: verify_ssl_certs=False turns off TLS certificate
# verification for the GigaChat connection.
giga_chat = GigaChat(credentials=API_TOKEN, verify_ssl_certs=False)

rag = RAG(retriever, giga_chat)

# Prompt template with {information} and {query} placeholders. The backslash at
# the end of the first line removes the line break, so the fenced {information}
# block follows the instruction directly.
# Fix: the instruction previously said "контест" (contest) instead of
# "контекст" (context).
tp = """Используй данный контекст чтобы ответить на вопрос в конце. Для ответа используй не более двух предложений.\
```{information}```
Вопрос: {query}
Ответ:"""

# One RAG call per question; the first element of get_answer's result is stored
# (presumably the answer text — confirm against RAG.get_answer).
# The column is called 'gpt4o' although the answers come from the GigaChat-backed
# RAG; the name is kept because a later cell renames it to 'rag_answer'.
all_queries['gpt4o'] = all_queries['Вопрос'].apply(lambda x: rag.get_answer(x, tp)[0])


In [None]:
# Map the dataset's column names to the English names used from here on.
columns = dict(zip(
    ('Вопрос', 'Ответ', 'gpt4o'),
    ('question', 'golden_answer', 'rag_answer'),
))
# TODO: also map 'context_chunks' -> 'chunks'
all_queries = all_queries.rename(columns=columns)

In [None]:
# GigaChat instance handed to the metric calculation below. Note that this
# rebinds `model`, which earlier in the notebook held the embedding model.
model = GigaChat(
    # Disables the check for the Russian NUC Mintsifry certificates
    verify_ssl_certs=False,
    # Index 0 selects the first model name of the pair.
    model=("GigaChat", "GigaChat-Pro")[0],
    scope="GIGACHAT_API_PERS",
    credentials=API_TOKEN,
)

# Smoke test: one request to confirm the credentials and connection work.
model.invoke('say something').content


In [None]:
# Compute the generation metrics for every question/answer row with the
# GigaChat instance created above, then preview the first rows.
df_metrics = calculate_generation_metrics_v1(all_queries, model)
df_metrics.head(5)


In [None]:
# Column-wise mean of everything that is left after dropping the listed columns.
dropped_columns = ['question', 'golden_answer', 'rag_answer', 'llm_score1_desc']
df_metrics.drop(columns=dropped_columns).mean(axis=0)
