In [32]:
!pip install chromadb tiktoken sentence-transformers groq openpyxl rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting nltk (from rouge-score)
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.1 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24987 sha256=bf8fe991a5e682bf8f449300f82106398e5d7a277bca6f4cfdd04ea1129129a8
  Stored in directory: /Users/mac/Library/Cac

In [None]:
!unzip chroma_store.zip

Archive:  chroma_store.zip
   creating: chroma_store/
   creating: chroma_store/36f71b6c-5104-459b-a417-cb69a7f92d08/
 extracting: chroma_store/36f71b6c-5104-459b-a417-cb69a7f92d08/link_lists.bin  
  inflating: chroma_store/36f71b6c-5104-459b-a417-cb69a7f92d08/header.bin  
  inflating: chroma_store/36f71b6c-5104-459b-a417-cb69a7f92d08/data_level0.bin  
  inflating: chroma_store/36f71b6c-5104-459b-a417-cb69a7f92d08/length.bin  
  inflating: chroma_store/chroma.sqlite3  


In [33]:
import os
import re
import ast
import chromadb
import tiktoken
from sentence_transformers import SentenceTransformer, util
from groq import Groq
import tiktoken
import torch
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
from rouge_score import rouge_scorer

tqdm.pandas()

In [34]:
CHUNK_SIZE = 400
CHUNK_OVERLAP = 50
TOP_K = 5

# Модель для эмбеддингов
MODEL_NAME = "all-MiniLM-L6-v2"
enc = tiktoken.get_encoding("cl100k_base")
embedder = SentenceTransformer(MODEL_NAME)

# оценщик, который будем использовать при подсчёте метрик на генерации
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# <center style="color: #000000ff;"> **Аугментация данных** </center>

In [3]:
def read_script(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

def count_tokens(text: str) -> int:
    return len(enc.encode(text))

def chunk_text(text: str, chunk_size=400, overlap=50):
    tokens = enc.encode(text)
    chunks = []
    i = 0
    while i < len(tokens):
        j = min(i + chunk_size, len(tokens))
        chunk = enc.decode(tokens[i:j])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks


def extract_role(line: str):
    m = re.match(r"^([A-Z][A-Z\s'\-]+):", line.strip())
    return m.group(1).strip() if m else None

## Исходные данные

In [4]:
SCRIPT_PATH = "script.txt"
CHROMA_PATH = "./chroma_store1"
COLLECTION_NAME = "bojack_script"

client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

text = read_script(SCRIPT_PATH)

paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

documents = []
metadatas = []
ids = []

chunk_id = 0

for i, para in enumerate(paragraphs):
    role = extract_role(para)
    chunks = chunk_text(para, CHUNK_SIZE, CHUNK_OVERLAP)
    for j, ch in enumerate(chunks):
        tok = count_tokens(ch)
        documents.append(ch)
        metadatas.append({
            "role": role or "UNKNOWN",
            "source_para": i,
            "chunk": j,
            "tokens": tok
        })
        ids.append(f"chunk-{chunk_id}")
        chunk_id += 1

embeddings = embedder.encode(documents, show_progress_bar=True).tolist()

collection.add(documents=documents, embeddings=embeddings, metadatas=metadatas, ids=ids)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [5]:
len(paragraphs)

471

## Аугментированные данные

Данные были аугментированы с помощью модели Gemini 3 Pro Preview (через интерфейс от Google). Промпт:

"""Мне нужно аугментировать данные для RAG-системы. Возьми скрипт «Коня БоДжека» и нагенерируй еще реплик, чтобы у меня было больше контента."""

In [4]:
SCRIPT_PATH = 'script_augmented.txt'
CHROMA_PATH = "./chroma_augmented_store"
COLLECTION_NAME = "bojack_script"

client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

text = read_script(SCRIPT_PATH)

paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

documents = []
metadatas = []
ids = []

chunk_id = 0

for i, para in enumerate(paragraphs):
    role = extract_role(para)
    chunks = chunk_text(para, CHUNK_SIZE, CHUNK_OVERLAP)
    for j, ch in enumerate(chunks):
        tok = count_tokens(ch)
        documents.append(ch)
        metadatas.append({
            "role": role or "UNKNOWN",
            "source_para": i,
            "chunk": j,
            "tokens": tok
        })
        ids.append(f"chunk-{chunk_id}")
        chunk_id += 1

embeddings = embedder.encode(documents, show_progress_bar=True).tolist()

collection.add(documents=documents, embeddings=embeddings, metadatas=metadatas, ids=ids)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

In [5]:
len(paragraphs)

553

In [6]:
query = "Bojack is a total jerk"
res = collection.query(query_texts=[query], n_results=3)
for doc, meta in zip(res["documents"][0], res["metadatas"][0]):
    print(meta["role"], "→", doc[:150], "...\n")

BOJACK → BOJACK: What? ...

BOJACK → BOJACK: What? ...

UNKNOWN → BOJACK He's so stupid he doesn't realize how miserable he should be. I envy that. ...



# <center style="color: #000000ff;"> **Baseline-решение** </center>

In [None]:
# Модель для генерации ответа
api_key = ''
model = Groq(api_key=api_key)

In [None]:
def retrieve_chunks(query: str, n_results=3):
    res = collection.query(query_texts=[query], n_results=n_results)
    docs = res["documents"][0]
    metas = res["metadatas"][0]
    return docs, metas

def build_context(chunks, metas):
    context = ""
    for doc, meta in zip(chunks, metas):
        role = meta.get("role", "UNKNOWN")
        context += f"{role}: {doc}\n"
    return context

# CHARACTERS = ["BoJack", "Diane", "Todd"]

# def get_character_prompt(character: str):
#     """
#     Возвращает промпт для выбранного персонажа
#     """
#     if character == "BoJack":
#         return "You are BoJack Horseman. Answer sarcastically, cynically, and a bit depressively, staying true to BoJack's style and personality.\n"
#     elif character == "Diane":
#         return "You are Diane Nguyen. Answer thoughtfully, analytically, and with a touch of sarcasm, reflecting Diane's speech style.\n"
#     elif character == "Todd":
#         return "You are Todd Chavez. Answer naively, absurdly, and cheerfully, reflecting Todd's childlike and carefree personality.\n"
#     else:
#         return "You are hero from Bojack Horseman. Answer in this cartoon style."

def generate_answer(query: str, chunks_context: str, character: str, max_new_tokens=300):
    # char_prompt = get_character_prompt(character)
    prompt = f"""
        Answer the question below based on the context.
        Do not hallucinate; only use information from the context.

        Context:
        {chunks_context}

        Question:
        {query}

        Answer:
    """
    output = model.chat.completions.create(
                    model="llama-3.1-8b-instant",
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=600,
                    temperature=0.7
                )
    return output.choices[0].message.content.strip()

# <center style="color: #000000ff;"> **Валидация RAGа** </center>

## Сбор ответов

In [9]:
def validate_rag(questions_df: pd.DataFrame, output_path: str, use_character: bool = False):
    """
    Прогоняет вопросы через RAG и собирает ответы и чанки.
    Результаты динамически сохраняются в CSV.
    
    questions_df должно содержать столбцы: question, right_answer, right_chunk, type
    output_path — путь к CSV для сохранения результатов
    """
    # Проверка, есть ли уже файл, чтобы добавлять строки
    file_exists = os.path.exists(output_path)
    
    for idx, row in tqdm(questions_df.iterrows()):
        query = row["question"]
        character = row["character"] if use_character else None

        chunks, metas = retrieve_chunks(query, n_results=TOP_K)
        context = build_context(chunks, metas)
        answer = generate_answer(query, context, character)

        result = {
            "question": query,
            "right_answer": row["right_answer"],
            "right_chunk": row["right_chunk"],
            "rag_answer": answer,
            "rag_chunks": chunks,
            "rag_metas": metas,
            "type": row["type"],
            # "character": character
        }

        df_result = pd.DataFrame([result])
        if file_exists:
            df_result.to_csv(output_path, mode='a', index=False, header=False)
        else:
            df_result.to_csv(output_path, mode='w', index=False, header=True)
            file_exists = True

In [47]:
questions_df = pd.read_excel("valid_dataset.xlsx")
questions_df.head(3)

Unnamed: 0,question,right_answer,right_chunk,rag_answer,rag_chunk,type
0,"How many seasons did the show ""Horsin' Around""...",Nine seasons.,the family comedy struck a chord with America ...,,,Character Identity & Core Facts
1,What specific ingredients are in BoJack's brea...,"Carrots, vitamins, and vodka.","Bojack prepares himself a smoothie of carrots,...",,,Character Identity & Core Facts
2,What TV show did BoJack Horseman star in and w...,"Horsin' Around, which premiered in 1987.","CHARLIE ROSE: In 1987, the situation comedy Ho...",,,Character Identity & Core Facts


In [31]:
questions_df.shape

(222, 6)

In [32]:
questions_df['type'].value_counts()

type
Plot & Motivation                     69
Character Identity & Core Facts       61
Relationship Dynamics                 47
Internal Context & Emotional State    45
Name: count, dtype: int64

In [46]:
questions_df[questions_df['question'].notna() & questions_df['right_answer'].notna()].shape

(222, 6)

In [50]:
validate_rag(questions_df, output_path="rag_validation_results.csv", use_character=False)

222it [08:46,  2.37s/it]


In [11]:
results = pd.read_csv('rag_validation_results.csv')
results.head(3)

Unnamed: 0,question,right_answer,right_chunk,rag_answer,rag_chunks,rag_metas,type
0,"How many seasons did the show ""Horsin' Around""...",Nine seasons.,the family comedy struck a chord with America ...,"The show ""Horsin' Around"" went on to air for n...","[""CHARLIE ROSE: In 1987, the situation comedy...","[{'chunk': 0, 'source_para': 12, 'tokens': 24,...",Character Identity & Core Facts
1,What specific ingredients are in BoJack's brea...,"Carrots, vitamins, and vodka.","Bojack prepares himself a smoothie of carrots,...","Based on the context, the specific ingredient ...","['Bojack prepares himself a smoothie of', ""BOJ...","[{'source_para': 51, 'chunk': 0, 'role': 'UNKN...",Character Identity & Core Facts
2,What TV show did BoJack Horseman star in and w...,"Horsin' Around, which premiered in 1987.","CHARLIE ROSE: In 1987, the situation comedy Ho...","The TV show BoJack Horseman starred in was ""Ho...","[""BOJACK ON TV: Now, that's a horse of a diffe...","[{'source_para': 128, 'chunk': 0, 'tokens': 16...",Character Identity & Core Facts


In [52]:
results.shape

(222, 7)

## Подсчёт метрик

In [None]:
def normalize_text(t: str):
    if not isinstance(t, str):
        return ""
    t = t.lower().strip()
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"^[a-z]+:\s*", "", t)  # убираем роли типа 'BOJACK:' или 'CHARLIE:'
    return t

def f1_score_answer(pred, ref):
    pred_tokens = normalize_text(pred).split()
    ref_tokens = normalize_text(ref).split()
    if len(pred_tokens) == 0 or len(ref_tokens) == 0:
        return 0.0

    common = set(pred_tokens) & set(ref_tokens)
    if len(common) == 0:
        return 0.0
    
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)

def semantic_similarity(right_chunk, rag_chunks):
    right_norm = normalize_text(right_chunk)
    rag_norms = [normalize_text(c) for c in rag_chunks]

    emb_right = embedder.encode(right_norm, convert_to_tensor=True)
    emb_rag = embedder.encode(rag_norms, convert_to_tensor=True)

    cos = util.cos_sim(emb_right, emb_rag)[0]
    return float(cos.max().item())   # лучший из K


def semantic_similarity_multi(right_chunks, rag_chunks):
    """
    right_chunks: list[str]
    rag_chunks: list[str]
    
    Возвращает максимальную близость между двумя списками реплик.
    """
    if not right_chunks or not rag_chunks:
        return 0.0

    right_norm = [normalize_text(c) for c in right_chunks]
    rag_norm = [normalize_text(c) for c in rag_chunks]

    emb_right = embedder.encode(right_norm, convert_to_tensor=True)
    emb_rag = embedder.encode(rag_norm, convert_to_tensor=True)

    cos = util.cos_sim(emb_right, emb_rag)  # матрица NxK

    return float(cos.max().item())

def compute_metrics(results_df):
    top1_list = []
    topk_list = []
    retriv_sim_list = []

    f1_list = []
    gen_sim_list = []
    rougeL_list = []

    row_metrics = []

    for _, row in results_df.iterrows():
        right_chunks = row.get("normalized_right_chunk", [])
        if isinstance(right_chunks, str):
            right_chunks = ast.literal_eval(right_chunks)
        right_norms = [normalize_text(rc) for rc in right_chunks]

        rag_chunks = ast.literal_eval(row["rag_chunks"])
        rag_norms = [normalize_text(c) for c in rag_chunks]

        # метрики для retrieval
        rag_top1 = rag_norms[0] if rag_norms else ""
        top1 = int(rag_top1 in right_norms)
        topk = int(any(rn in right_norms for rn in rag_norms))
        sim = semantic_similarity_multi(right_chunks, rag_chunks)

        # метрики для generation
        rag_answer = row["rag_answer"]
        right_answer = row["right_answer"]

        f1 = f1_score_answer(rag_answer, right_answer)
        gen_sim = semantic_similarity_multi([right_answer], [rag_answer])
        rougeL_score = scorer.score(normalize_text(right_answer), normalize_text(rag_answer))['rougeL'].fmeasure

        row_metrics.append({
            "top1": top1,
            "topk": topk,
            "retriv_semantic_sim": sim,
            "f1": f1,
            "gen_sim": gen_sim,
            "rougeL": rougeL_score,
        })

        # накопление для глобальных метрик
        top1_list.append(top1)
        topk_list.append(topk)
        retriv_sim_list.append(sim)
        f1_list.append(f1)
        gen_sim_list.append(gen_sim)
        rougeL_list.append(rougeL_score)

    metrics_df = pd.DataFrame(row_metrics)
    results_with_metrics = pd.concat([results_df.reset_index(drop=True), metrics_df], axis=1)

    global_metrics = {
        "retrieval_top1": sum(top1_list) / len(top1_list),
        "retrieval_topk": sum(topk_list) / len(topk_list),
        "retrieval_semantic_similarity": sum(retriv_sim_list) / len(retriv_sim_list),
        "generation_f1": sum(f1_list) / len(f1_list),
        "generation_semantic_similarity": sum(gen_sim_list) / len(gen_sim_list),
        "generation_rougeL": sum(rougeL_list) / len(rougeL_list),
    }

    return results_with_metrics, global_metrics

In [36]:
results_metrics, global_metrics = compute_metrics(results)

In [37]:
global_metrics

{'retrieval_top1': 0.21171171171171171,
 'retrieval_topk': 0.36486486486486486,
 'retrieval_semantic_similarity': 0.559862256855578,
 'generation_f1': 0.12784031206227517,
 'generation_semantic_similarity': 0.3118595419397057,
 'generation_rougeL': 0.15674869521317797}

In [38]:
results_metrics.head(3)

Unnamed: 0,question,right_answer,right_chunk,rag_answer,rag_chunks,rag_metas,type,normalized_right_chunk,multiple_chunks,top1,topk,retriv_semantic_sim,f1,gen_sim,rougeL
0,"How many seasons did the show ""Horsin' Around""...",Nine seasons.,the family comedy struck a chord with America ...,"The show ""Horsin' Around"" went on to air for n...","[""CHARLIE ROSE: In 1987, the situation comedy...","[{'chunk': 0, 'source_para': 12, 'tokens': 24,...",Character Identity & Core Facts,"[CHARLIE ROSE: The show, in which a young, bac...",False,0,1,1.0,0.307692,0.557173,0.307692
1,What specific ingredients are in BoJack's brea...,"Carrots, vitamins, and vodka.","Bojack prepares himself a smoothie of carrots,...","Based on the context, the specific ingredient ...","['Bojack prepares himself a smoothie of', ""BOJ...","[{'source_para': 51, 'chunk': 0, 'role': 'UNKN...",Character Identity & Core Facts,[Bojack prepares himself a smoothie of],False,1,1,1.0,0.0,0.231055,0.0
2,What TV show did BoJack Horseman star in and w...,"Horsin' Around, which premiered in 1987.","CHARLIE ROSE: In 1987, the situation comedy Ho...","The TV show BoJack Horseman starred in was ""Ho...","[""BOJACK ON TV: Now, that's a horse of a diffe...","[{'source_para': 128, 'chunk': 0, 'tokens': 16...",Character Identity & Core Facts,"[CHARLIE ROSE: In 1987, the situation comedy ...",False,0,0,0.648064,0.055556,0.606624,0.222222


Обратим внимание, что референсные чанки представлены не целыми фразами героев (то есть "I commit to things all the time" вместо "BOJACK: I'm not afraid of commitment. I commit to things all the time"), а выдержками из текста. Вероятнее всего, мы сравниваем чанки разной длины (референсные более точные, обычно это фразы), поэтому метрики извлечения малы. Проведем обработку right_chunk - найдём реплики, из которых извлечены фразы, и запишем их в список.

In [None]:
def find_all_matching_lines(target_chunk: str, documents: list):
    """
    Находит одну или максимум две последовательные реплики, 
    которые полностью содержат target_chunk.
    """
    if not isinstance(target_chunk, str):
        return None

    target = target_chunk.strip().lower()
    docs_lower = [doc.lower() for doc in documents]

    # сначала ищем прямое включение в одну реплику
    for doc in documents:
        if target in doc.lower():
            return [doc]

    # ищем комбинацию двух соседних реплик
    for i in range(len(documents) - 1):
        combined = documents[i] + " " + documents[i + 1]
        if target in combined.lower():
            return [documents[i], documents[i + 1]]

    # если прямых совпадений нет, ищем наиболее похожую реплику через SequenceMatcher
    from difflib import SequenceMatcher
    best_score = 0
    best_doc = None
    for doc in documents:
        score = SequenceMatcher(None, target, doc.lower()).ratio()
        if score > best_score:
            best_score = score
            best_doc = doc

    if best_score > 0.3:
        return [best_doc]

    return None


def normalize_right_chunks(df, documents):
    normalized = []

    for _, row in df.iterrows():
        rc = row["right_chunk"]

        # если это список строк, берём самую длинную
        if isinstance(rc, str) and rc.startswith("["):
            try:
                import ast
                rc_eval = ast.literal_eval(rc)
                if isinstance(rc_eval, list):
                    rc = max(rc_eval, key=len)
            except:
                pass

        full_lines = find_all_matching_lines(rc, documents)

        # если ничего не нашли, то None
        normalized.append(full_lines if full_lines else None)

    df["normalized_right_chunk"] = normalized
    return df

In [19]:
all_docs = collection.get(include=["documents"])
documents = all_docs["documents"]

results_norm = normalize_right_chunks(results, documents)
results_norm.head(3)

Unnamed: 0,question,right_answer,right_chunk,rag_answer,rag_chunks,rag_metas,type,normalized_right_chunk
0,"How many seasons did the show ""Horsin' Around""...",Nine seasons.,the family comedy struck a chord with America ...,"The show ""Horsin' Around"" went on to air for n...","[""CHARLIE ROSE: In 1987, the situation comedy...","[{'chunk': 0, 'source_para': 12, 'tokens': 24,...",Character Identity & Core Facts,"[CHARLIE ROSE: The show, in which a young, bac..."
1,What specific ingredients are in BoJack's brea...,"Carrots, vitamins, and vodka.","Bojack prepares himself a smoothie of carrots,...","Based on the context, the specific ingredient ...","['Bojack prepares himself a smoothie of', ""BOJ...","[{'source_para': 51, 'chunk': 0, 'role': 'UNKN...",Character Identity & Core Facts,[Bojack prepares himself a smoothie of]
2,What TV show did BoJack Horseman star in and w...,"Horsin' Around, which premiered in 1987.","CHARLIE ROSE: In 1987, the situation comedy Ho...","The TV show BoJack Horseman starred in was ""Ho...","[""BOJACK ON TV: Now, that's a horse of a diffe...","[{'source_para': 128, 'chunk': 0, 'tokens': 16...",Character Identity & Core Facts,"[CHARLIE ROSE: In 1987, the situation comedy ..."


In [None]:
def has_multiple_chunks(x):
    if isinstance(x, str):
        try:
            lst = ast.literal_eval(x)
        except:
            return False
    elif isinstance(x, list):
        lst = x
    else:
        return False
    return len(lst) >= 2

# считаем, cколько строк содержат 2 и более чанков
results_norm["multiple_chunks"] = results_norm["normalized_right_chunk"].apply(has_multiple_chunks)
num_multiple = results_norm["multiple_chunks"].sum()
print(f"Строк с 2 и более чанками: {num_multiple}")

results_with_multiple = results_norm[results_norm["multiple_chunks"]]
results_with_multiple.head()

Строк с 2 и более чанками: 6


Unnamed: 0,question,right_answer,right_chunk,rag_answer,rag_chunks,rag_metas,type,normalized_right_chunk,multiple_chunks
61,What is BoJack's complaint about the bread at ...,"He isn't crazy about it, yet he keeps eating it.","BOJACK: You know, I am not crazy about the bre...",BoJack is complaining that the bread is not fr...,"['BOJACK: You know, I am not crazy about the b...","[{'tokens': 16, 'source_para': 118, 'chunk': 0...",Character Identity & Core Facts,"[BOJACK: You know, I am not crazy about the br...",True
77,Who told Todd that BoJack and Princess Carolyn...,Princess Carolyn did.,BOJACK: Who told you Princess Carolyn and I br...,The answer is not explicitly stated in the con...,['BOJACK: Who told you Princess Carolyn and I ...,"[{'role': 'BOJACK', 'chunk': 0, 'tokens': 14, ...",Relationship Dynamics,[BOJACK: Who told you Princess Carolyn and I b...,True
82,"Which ""pile of crap"" does BoJack look like, ac...",The third one (the worst one).,"BOJACK: Wait, wait, so which pile of crap do I...","Based on the context, I couldn't find any info...","['BOJACK: Wait, wait, so which pile of crap do...","[{'chunk': 0, 'source_para': 242, 'tokens': 18...",Relationship Dynamics,"[BOJACK: Wait, wait, so which pile of crap do ...",True
102,Who does BoJack compare himself to regarding t...,Linus (with his blanket).,BOJACK: Linus walked around with a blanket. No...,The context does not mention BoJack comparing ...,"[""BOJACK: 'Cause I I think the show's actually...","[{'tokens': 20, 'source_para': 29, 'role': 'BO...",Character Identity & Core Facts,"[BOJACK:, Linus walked around with a blanket. ...",True
122,What mood is Sarah in when she sits at the tab...,A bad mood.,Bojack is in a loud orange sweatshirt doing di...,The context doesn't explicitly state Sarah's i...,"[""SARAH: What, you don't think it's cute?"", 'S...","[{'tokens': 14, 'role': 'SARAH', 'source_para'...",Internal Context & Emotional State,[Bojack is in a loud orange sweatshirt doing d...,True


Некоторые ответы на вопросы включают несколько реплик, тк относятся к ситуации в целом (учитывают реакцию нескольких персонажей):

In [23]:
results_with_multiple.iloc[1]['normalized_right_chunk']

['BOJACK: Who told you Princess Carolyn and I broke up?', 'TODD: She did.']

In [39]:
results_metrics, global_metrics = compute_metrics(results_norm)

In [41]:
global_metrics

{'retrieval_top1': 0.21171171171171171,
 'retrieval_topk': 0.36486486486486486,
 'retrieval_semantic_similarity': 0.559862256855578,
 'generation_f1': 0.12784031206227517,
 'generation_semantic_similarity': 0.3118595419397057,
 'generation_rougeL': 0.15674869521317797}

In [43]:
results_metrics.head(3)

Unnamed: 0,question,right_answer,right_chunk,rag_answer,rag_chunks,rag_metas,type,normalized_right_chunk,multiple_chunks,top1,topk,retriv_semantic_sim,f1,gen_sim,rougeL
0,"How many seasons did the show ""Horsin' Around""...",Nine seasons.,the family comedy struck a chord with America ...,"The show ""Horsin' Around"" went on to air for n...","[""CHARLIE ROSE: In 1987, the situation comedy...","[{'chunk': 0, 'source_para': 12, 'tokens': 24,...",Character Identity & Core Facts,"[CHARLIE ROSE: The show, in which a young, bac...",False,0,1,1.0,0.307692,0.557173,0.307692
1,What specific ingredients are in BoJack's brea...,"Carrots, vitamins, and vodka.","Bojack prepares himself a smoothie of carrots,...","Based on the context, the specific ingredient ...","['Bojack prepares himself a smoothie of', ""BOJ...","[{'source_para': 51, 'chunk': 0, 'role': 'UNKN...",Character Identity & Core Facts,[Bojack prepares himself a smoothie of],False,1,1,1.0,0.0,0.231055,0.0
2,What TV show did BoJack Horseman star in and w...,"Horsin' Around, which premiered in 1987.","CHARLIE ROSE: In 1987, the situation comedy Ho...","The TV show BoJack Horseman starred in was ""Ho...","[""BOJACK ON TV: Now, that's a horse of a diffe...","[{'source_para': 128, 'chunk': 0, 'tokens': 16...",Character Identity & Core Facts,"[CHARLIE ROSE: In 1987, the situation comedy ...",False,0,0,0.648064,0.055556,0.606624,0.222222


Как видно из примера ниже, встречаются кейсы, где референсный ответ совпадает с ответом нашей rag-системы, однако метрики генерации не очень высокие.

In [44]:
print(results_metrics.iloc[0].right_answer)
print(results_metrics.iloc[0].rag_answer)
print(results_metrics.iloc[0].f1)
print(results_metrics.iloc[0].gen_sim)
print(results_metrics.iloc[0].rougeL)

Nine seasons.
The show "Horsin' Around" went on to air for nine seasons.
0.3076923076923077
0.5571725368499756
0.3076923076923077


In [27]:
results_metrics.to_csv("metrics.csv")