In [None]:
# Report the attached GPU / driver state and the installed CUDA compiler version.
!nvidia-smi
!nvcc --version

Sat Nov 16 06:19:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0              26W /  70W |   7037MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Install third-party packages into the notebook environment.
# `-q` keeps pip quiet; the xformers install is run with normal verbosity.
!pip install -q einops
!pip install -q FlagEmbedding
!pip install -q peft
!pip install -q faiss-gpu
!pip install xformers



In [None]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from numpy.linalg import norm
from FlagEmbedding import FlagModel
from FlagEmbedding import FlagReranker
from sentence_transformers import SentenceTransformer


import os
import torch
import faiss
import numpy as np

In [None]:
# Course catalogue: the i-th number in `courses_numbers` belongs to the i-th
# title in `course_names`.
courses_numbers = ['544', '566', '585', '596', '599', '626', '677', '699']
course_names = [
    'Applied Natural Language Processing',
    'Deep Learning and its Applications',
    'Database Systems',
    'Scientific Computing and Visualization',
    'Distributed Systems',
    'Text as Data',
    'Advanced Computer Vision',
    'Robotic Perception',
]

# Course title -> course number.
name_to_num = {title: number for title, number in zip(course_names, courses_numbers)}

# 'csci<number>' -> path of the text file associated with that course.
course_ranges = {f'csci{number}': f'./CSCI{number}.txt' for number in courses_numbers}

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def load_models(embed_model_name=None, rerank_model_name=None):
    """Load the embedding model and the reranker, each with its tokenizer.

    Args:
        embed_model_name: Name passed to ``from_pretrained`` for the embedding
            model. A falsy value (``None`` or ``''``) selects
            ``'BAAI/bge-base-en-v1.5'``.
        rerank_model_name: Name passed to ``from_pretrained`` for the
            reranker. A falsy value selects ``'BAAI/bge-reranker-v2-m3'``.

    Returns:
        The 4-tuple ``(embed_model, embed_tokenizer, rerank_model,
        rerank_tokenizer)``. Both models have been moved to the module-level
        ``device`` and switched to eval mode.
    """
    # Substitute the default checkpoints for any name that was not supplied.
    if not embed_model_name:
        embed_model_name = 'BAAI/bge-base-en-v1.5'
    if not rerank_model_name:
        rerank_model_name = 'BAAI/bge-reranker-v2-m3'

    # Embedding side: weights are always requested in float32.
    embedder = AutoModel.from_pretrained(
        embed_model_name,
        trust_remote_code=True,
        torch_dtype=torch.float32,
    )
    embedder_tokenizer = AutoTokenizer.from_pretrained(
        embed_model_name,
        trust_remote_code=True,
    )
    embedder.to(device)
    embedder.eval()

    # Reranker side: the dtype is left to the checkpoint ('auto').
    reranker = AutoModelForSequenceClassification.from_pretrained(
        rerank_model_name,
        torch_dtype='auto',
        trust_remote_code=True,
    )
    reranker_tokenizer = AutoTokenizer.from_pretrained(rerank_model_name)
    reranker.to(device)
    reranker.eval()

    return embedder, embedder_tokenizer, reranker, reranker_tokenizer

# Embedding checkpoint for each menu option.
available_embed_models = {
    '1': 'BAAI/bge-base-en-v1.5',                      # BGE
    '2': 'jinaai/jina-embeddings-v2-base-en',          # Jina
    '3': 'dunzhang/stella_en_400M_v5',                 # Stella (paired with the BGE reranker)
    '4': 'dunzhang/stella_en_400M_v5',                 # Stella (paired with the Jina reranker)
    '5': 'sentence-transformers/all-roberta-large-v1'  # Roberta
}

# Reranker checkpoint paired with each menu option.
available_rerank_models = {
    '1': 'BAAI/bge-reranker-v2-m3',                             # BGE reranker
    '2': 'jinaai/jina-reranker-v2-base-multilingual',           # Jina reranker
    '3': 'BAAI/bge-reranker-v2-m3',                             # Stella + BGE reranker
    '4': 'jinaai/jina-reranker-v2-base-multilingual',           # Stella + Jina reranker
    '5': 'BAAI/bge-reranker-v2-m3'                              # Roberta + BGE reranker
}

# Show the menu, one option per line.
print(
    "Please select the embedding model to load:",
    "1: BGE + BGE Reranker",
    "2: Jina + Jina Reranker",
    "3: Stella + BGE Reranker",
    "4: Stella + Jina Reranker",
    "5: Roberta + BGE Reranker",
    sep="\n",
)

user_choice = input("Enter a number (1-5): ").strip()

# Look up the pair for the chosen option. An unknown option yields None for
# both names, in which case load_models falls back to its own defaults.
selected_embed_model = available_embed_models.get(user_choice)
selected_rerank_model = available_rerank_models.get(user_choice)

# Load the selected models and tokenizers into module-level names.
embed_model, embed_tokenizer, rerank_model, rerank_tokenizer = load_models(
    embed_model_name=selected_embed_model,
    rerank_model_name=selected_rerank_model,
)

Please select the embedding model to load:
1: BGE + BGE Reranker
2: Jina + Jina Reranker
3: Stella + BGE Reranker
4: Stella + Jina Reranker
5: Roberta + BGE Reranker
Enter a number (1-5): 1


In [None]:

def encode(texts, batch_size=32):
    """Embed texts with the module-level embedding model using masked mean pooling.

    The token states of ``embed_model`` are averaged over real tokens only:
    positions the tokenizer marks as padding (``attention_mask == 0``) are
    excluded. Without the mask, the embedding of a short text changes with the
    length of the longest text it happens to be batched with.

    Args:
        texts: A list (or other iterable) of strings; a single string is
            treated as a one-element list.
        batch_size: Maximum number of texts sent through the model at once.
            Must be a positive integer.

    Returns:
        A float32 tensor of shape ``(len(texts), hidden_dim)`` located on
        ``device``.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    # A bare string must not be sliced character by character below.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("encode() requires at least one text")

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = embed_tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
        with torch.no_grad():
            hidden = embed_model(**inputs).last_hidden_state.to(torch.float32)
            # (batch, seq, 1) weights: 1 for real tokens, 0 for padding.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp guards against a division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(token_sum / token_count)

    return torch.cat(pooled_batches, dim=0)

def read_and_split_file(file_path):
    """Read a UTF-8 text file and split it into segments.

    The delimiter is two consecutive newlines immediately followed by ``##``.
    The delimiter itself is removed, so only the first segment can still begin
    with ``##``. Adjust the delimiter here if the course files use a
    different layout.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The list of segments, in file order.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read().split('\n\n##')

def compute_similarity(query, segments):
    """Rank segments by cosine similarity to the query.

    Both the query and the segments are embedded with ``encode``. The
    similarity is computed in one vectorised call; a zero-norm embedding
    yields a score of 0 instead of NaN, so the ordering stays well defined.

    Args:
        query: The query string.
        segments: A list of segment strings.

    Returns:
        A list of ``(score, segment)`` pairs sorted by descending score, where
        ``score`` is a 0-dim float32 tensor. Segments with equal scores keep
        their input order. An empty ``segments`` list returns ``[]`` without
        calling the model.
    """
    segments = list(segments)
    if not segments:
        return []

    query_embedding = encode([query])[0].cpu()
    segment_embeddings = encode(segments).cpu()

    # (num_segments,) cosine scores; the query row broadcasts over all segments.
    scores = torch.nn.functional.cosine_similarity(
        segment_embeddings, query_embedding.unsqueeze(0), dim=1
    )

    return sorted(zip(scores, segments), key=lambda pair: pair[0], reverse=True)

def generate_embeddings(texts):
    """Embed texts and return the result as a float32 NumPy array.

    This delegates to ``encode`` so that the segment-similarity path and the
    FAISS path share a single tokenise / pool implementation instead of two
    copies that can drift apart.

    Args:
        texts: A list of strings to embed.

    Returns:
        A ``numpy.ndarray`` of dtype float32 with one row per input text.
    """
    return encode(texts).detach().cpu().numpy().astype('float32')

def build_vector_database(course_names, user_choice):
    """Build a FAISS inner-product index over the course-name embeddings.

    The embeddings are L2-normalised before being indexed, so the inner
    product between two indexed vectors equals their cosine similarity.
    The index is moved to GPU 0 when FAISS reports a usable GPU; otherwise the
    CPU index is filled and returned, so the notebook also runs on CPU-only
    machines.

    Args:
        course_names: List of course titles to index.
        user_choice: Menu option as a string. ``'2'`` uses
            ``embed_model.encode``; any other value uses
            ``generate_embeddings``.

    Returns:
        The populated FAISS index (a GPU index when one is available, else the
        CPU ``IndexFlatIP``).
    """
    print("Starting to build vector database...")

    if user_choice == '2':  # Jina model: use its own encode method
        print("Using Jina model to encode course names...")
        course_embeddings = embed_model.encode(course_names)
    else:  # every other option goes through the shared embedding helper
        print("Using generate_embeddings to encode course names...")
        course_embeddings = generate_embeddings(course_names)

    # Copy to float32, then normalise in place (normalize_L2 mutates its argument).
    np_course_embeddings = np.array(course_embeddings).astype('float32')
    faiss.normalize_L2(np_course_embeddings)

    # Exact inner-product (dot-product) search index.
    index_innerproduct = faiss.IndexFlatIP(np_course_embeddings.shape[1])
    print("FAISS index created on CPU.")

    # CPU-only FAISS builds may not expose the GPU helpers at all.
    gpu_count = faiss.get_num_gpus() if hasattr(faiss, 'get_num_gpus') else 0
    if gpu_count > 0 and hasattr(faiss, 'StandardGpuResources'):
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index_innerproduct)
        gpu_index.add(np_course_embeddings)
        print("Embeddings added to GPU index.")
        return gpu_index

    index_innerproduct.add(np_course_embeddings)
    print("Embeddings added to CPU index.")
    return index_innerproduct

# Index the course titles once; `database` is reused by the search helpers below.
database = build_vector_database(course_names=course_names, user_choice=user_choice)

Starting to build vector database...
Using generate_embeddings to encode course names...
FAISS index created on CPU.
Embeddings added to GPU index.


In [None]:
import csv

def load_queries_from_csv(file_path):
    """Load evaluation rows from a CSV file with a header line.

    Rows with fewer than six columns are reported and skipped. Rows in which
    any cell equals ``n/a`` or ``nah`` (case-insensitive, ignoring surrounding
    whitespace) are skipped silently. An empty file yields an empty list.

    Args:
        file_path: Path of the UTF-8 encoded CSV file.

    Returns:
        A list of 6-tuples ``(query, original_query, course_class,
        most_relevant_segment, respond, function_to_call)`` taken from the
        first six columns, each stripped of surrounding whitespace.
    """
    invalid_markers = ("n/a", "nah")
    queries = []

    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for quoted fields that contain line breaks.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) < 6:
                print(f"Skipping incomplete row: {row}")
                continue

            if any(cell.strip().lower() in invalid_markers for cell in row):
                continue

            # Keep the first six columns, in order.
            queries.append(tuple(cell.strip() for cell in row[:6]))

    return queries

def extract_first_line(segment):
    """Return the title line of a segment.

    The segment is stripped first. If its first line starts with ``##``, the
    line after it is returned (or ``""`` when there is no such line);
    otherwise the first line itself is returned. The result is stripped of
    surrounding whitespace.
    """
    first, _, remainder = segment.strip().partition('\n')
    if not first.startswith("##"):
        # No marker: the first line is the title.
        return first.strip()
    # Marker line: the title is the next line. `remainder` is empty when the
    # segment consists of the marker line only.
    return remainder.split('\n', 1)[0].strip()


def RAG(queries, segments, k=3, min_similarity=-5):
    """Retrieve the top-k segment titles for every query.

    For each query, segments whose embedding similarity exceeds
    ``min_similarity`` are rescored one by one with the reranker, and the
    titles (``extract_first_line``) of the ``k`` best are kept.

    Args:
        queries: Rows as produced by ``load_queries_from_csv``. Index 0 is the
            short query used for retrieval, index 1 the original query (only
            printed), index 3 the expected segment (only printed).
        segments: List of candidate segment strings.
        k: Number of results kept per query.
        min_similarity: Embedding-similarity cut-off applied before reranking.
            The default of -5 lies below every cosine similarity, so every
            segment is reranked; raise it to prefilter.

    Returns:
        One list of up to ``k`` titles per query, in query order. A query with
        no candidate left yields ``['!!!Failed to find any segment!!!']``.
    """
    retrieved_segments = []

    for idx, query_tuple in enumerate(queries, 1):  # 1-based index for the printed report
        query = query_tuple[0]
        original_query = query_tuple[1]
        most_relevant_segment = query_tuple[3]

        # First stage: embedding similarity prefilter.
        candidates = [
            segment
            for score, segment in compute_similarity(query, segments)
            if score > min_similarity
        ]

        if candidates:
            # Second stage: score each (query, segment) pair with the reranker.
            reranked = []
            for doc in candidates:
                inputs = rerank_tokenizer(query, doc, return_tensors='pt', truncation=True, max_length=1024).to(device)
                with torch.no_grad():
                    outputs = rerank_model(**inputs)
                rerank_score = outputs.logits.cpu().float().numpy().flatten()[0]
                reranked.append((rerank_score, doc))

            reranked.sort(key=lambda pair: pair[0], reverse=True)
            top_k_results = [extract_first_line(doc) for _, doc in reranked[:k]]
        else:
            top_k_results = ['!!!Failed to find any segment!!!']

        print(f"Query {idx} ({original_query}): Top {k} Segments: {top_k_results}")
        print(f"Expected Segment from CSV: {most_relevant_segment}")

        retrieved_segments.append(top_k_results)

    return retrieved_segments

def search(query, database, embed_dim, topk=1):
    """Return the course names closest to the query in the vector database.

    Args:
        query: The query string.
        database: Index object exposing ``search(vectors, topk)`` and returning
            ``(scores, indices)``, as built by ``build_vector_database``.
        embed_dim: Embedding width; the query vector is reshaped to
            ``(1, embed_dim)``.
        topk: Number of neighbours requested.

    Returns:
        Up to ``topk`` entries of ``course_names``, best match first. Fewer
        are returned when the index holds fewer than ``topk`` vectors.
    """
    # Pick the embedding path that matches the loaded model.
    if user_choice == '2':  # Jina model
        query_embed = embed_model.encode([query])[0]
    else:  # other models such as BGE or Stella
        query_embed = generate_embeddings([query])[0]

    # The query is not L2-normalised: scaling it by a positive constant does
    # not change the inner-product ranking, and the scores are discarded.
    query_embed = np.array(query_embed).astype('float32')

    _, idx = database.search(query_embed.reshape((1, embed_dim)), topk)

    # FAISS pads missing neighbours with -1, which would otherwise index the
    # last course name.
    return [course_names[i] for i in idx.reshape(-1) if i >= 0]

def get_top_3_answers(query, course_segments, threshold=0.25, top_n=3):
    """Retrieve, rerank and print the best segments for a query.

    Segments whose embedding similarity to the query exceeds ``threshold`` are
    rescored one by one with the reranker; the ``top_n`` highest-scoring ones
    are printed and returned.

    Args:
        query: The query string.
        course_segments: List of candidate segment strings.
        threshold: Embedding-similarity cut-off applied before reranking.
        top_n: Maximum number of answers returned.

    Returns:
        A list of ``(rerank_score, segment)`` pairs, best first. When no
        segment passes the threshold the list is
        ``[(0.0, '!!!Failed to find any segment!!!')]``.
    """
    # First stage: embedding similarity prefilter.
    candidates = [
        segment
        for score, segment in compute_similarity(query, course_segments)
        if score > threshold
    ]

    if candidates:
        # Second stage: score each (query, segment) pair with the reranker.
        final_results = []
        for doc in candidates:
            inputs = rerank_tokenizer(query, doc, return_tensors='pt', truncation=True, max_length=1024).to(device)
            with torch.no_grad():
                outputs = rerank_model(**inputs)
            rerank_score = outputs.logits.cpu().float().numpy().flatten()[0]
            final_results.append((rerank_score, doc))

        final_results.sort(key=lambda pair: pair[0], reverse=True)
        final_results = final_results[:top_n]
    else:
        final_results = [(0.0, '!!!Failed to find any segment!!!')]

    # Report the kept answers with their reranker scores.
    for i, (score, answer) in enumerate(final_results, 1):
        print(f"Top {i} Answer: {answer}\nScore: {score:.4f}\n")

    return final_results

In [None]:
# Load both evaluation files.
queries1 = load_queries_from_csv('./combined_data.csv')
queries2 = load_queries_from_csv('./combined_fake_data.csv')

# Course-routing inputs use every row from both files:
# column 0 is the short query, column 2 the course label.
queries = [row[0] for row in queries1 + queries2]
courses = [row[2] for row in queries1 + queries2]

# Segment-level labels come from the first file only:
# column 3 is the expected segment, column 4 the reference response.
most_relevant_segment = [row[3] for row in queries1]
ground_truth = [row[4] for row in queries1]

In [None]:
from typing import List

def evaluate_find_class(query: List[str], ground_truth: List[str], database) -> float:
    """Measure how often vector search routes a query to the right course.

    Queries that already mention a course number or a course name are left
    out, because they do not need the vector search to be routed.

    Parameters:
    query: a list of all the user's queries -> [query_1, query_2, ..., query_n]
    ground_truth: a list of ground truth (classes) -> [class_1, class_2, ..., class_n]
    database: vector database (must expose the embedding width as ``.d``)

    Return:
    accuracy in percent over the evaluated queries; 0.0 when no query is
    eligible. The number of evaluated queries is printed.
    """
    def course_in_query(text):
        # A bare course number also covers the "csci<number>" spelling.
        if any(num in text for num in courses_numbers):
            return True
        lowered = text.lower()
        return any(name.lower() in lowered for name in course_names)

    correct = 0
    total = 0

    for q, gt in zip(query, ground_truth):
        if course_in_query(q):
            continue
        total += 1

        matches = search(q, database, embed_dim=database.d, topk=1)
        if not matches:
            # Nothing retrieved: counted as a miss.
            continue
        predicted = f"csci{name_to_num[matches[0]]}"

        # Case-insensitive substring match, so "544" and "CSCI544" both count.
        # An empty label is a substring of everything and must not count as a hit.
        expected = gt.strip().lower()
        if expected and expected in predicted:
            correct += 1

    print(f"Total(除去包含课号课名的query数量): {total}")

    # Guard against a division by zero when every query was filtered out.
    if total == 0:
        return 0.0

    return (correct / total) * 100

# Score course routing over every loaded query and report the percentage.
accuracy = evaluate_find_class(query=queries, ground_truth=courses, database=database)
print(f"Accuracy: {accuracy:.2f}%")


Total(除去包含课号课名的query数量): 75
Accuracy: 98.67%


In [None]:
from typing import List, Tuple

def evaluate_rag(retrieved_segments: List[List[str]], ground_truth: List[str], k: int = 3,
                 threshold: float = 0.70) -> Tuple[float, float]:
    """Score retrieved segments against the ground truth by embedding similarity.

    A retrieved segment counts as a hit when its similarity to the ground
    truth exceeds ``threshold``.

    Parameters:
    retrieved_segments: a list where each element is a list of the top-k segments retrieved by RAG -> [[seg_1, ..., seg_k], [seg_1, ..., seg_k], ..., [seg_1, ..., seg_k]]
        (an element that is a single string is treated as a one-item list)
    ground_truth: a list of the most relevant ground truth segment for each query -> [seg_gt_1, seg_gt_2, ..., seg_gt_n]
    k: the number of top segments considered for each query (default is 3)
    threshold: minimum similarity for a segment to count as a hit (default is 0.70)

    Return:
    top_1 accuracy and top_k accuracy, both in percent of
    ``len(retrieved_segments)``; (0.0, 0.0) when there is nothing to evaluate.
    """
    top_1_correct = 0
    top_k_correct = 0
    total = len(retrieved_segments)

    for segments, gt in zip(retrieved_segments, ground_truth):
        # Accept a bare string as well as a list of strings.
        if isinstance(segments, str):
            segments = [segments]
        candidates = list(segments)[:k]
        if not candidates:
            continue

        # compute_similarity returns its pairs sorted by score, so the rank-1
        # segment has to be scored on its own to obtain a true top-1 figure.
        top_1_score = compute_similarity(gt, candidates[:1])[0][0]
        if top_1_score > threshold:
            top_1_correct += 1
            top_k_correct += 1
            continue

        remaining = candidates[1:]
        if remaining and any(score > threshold for score, _ in compute_similarity(gt, remaining)):
            top_k_correct += 1

    # Guard against a division by zero on empty input.
    if total == 0:
        return 0.0, 0.0

    top_1_accuracy = (top_1_correct / total) * 100
    top_k_accuracy = (top_k_correct / total) * 100

    return top_1_accuracy, top_k_accuracy

# Number of retrieved segments considered per query; also used in the label
# below so that the printed "Top-k" always matches the value actually passed.
eval_k = 1
top_1_acc, top_k_acc = evaluate_rag(most_relevant_segment, ground_truth, k=eval_k)
print(f"Top-1 Accuracy: {top_1_acc:.2f}%")
print(f"Top-{eval_k} Accuracy: {top_k_acc:.2f}%")


Top-1 Accuracy: 95.35%
Top-3 Accuracy: 95.35%


In [None]:
# Install the OpenAI client package used by the answer-grading cell below.
!pip install openai

In [None]:
from openai import OpenAI
# Shared API client. No key is passed explicitly; presumably the SDK reads it
# from the environment (e.g. OPENAI_API_KEY) — confirm before running.
# Never hard-code a key in the notebook.
client = OpenAI()

def evaluate_rag_answer(rag_answer: str, reference_answer: str, question: str) -> "Optional[int]":
    """
    Grades a RAG answer against a reference answer by prompting the
    ``gpt-4o-mini`` chat model through the module-level ``client``.

    The raw reply message is printed. A reply that is a bare integer is used
    directly; otherwise the first standalone integer in the reply is taken
    (e.g. ``"Score: 8"`` or ``"8/10"``).

    Parameters:
    rag_answer (str): The answer generated by the RAG model.
    reference_answer (str): The expected or reference answer for comparison.
    question (str): The question that was asked to generate the answer.

    Returns:
    int or None: A relevance and correctness score between 1 and 10, or None
    when the reply is empty, holds no integer, or the integer is outside 1-10.
    """
    import re  # local import: only needed for the fallback parse below

    # Compose the grading prompt.
    prompt = (
        f"Evaluate the relevance and correctness of the following answer generated by a RAG model.\n\n"
        f"Question: {question}\n\n"
        f"RAG Model Answer: {rag_answer}\n\n"
        f"Reference Answer: {reference_answer}\n\n"
        "Please provide a single relevance and correctness score between 1 and 10, where 10 indicates the highest relevance and correctness, "
        "and 1 indicates no relevance or correctness. Return only a number"
    )

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50
    )

    message = response.choices[0].message
    print(message)

    # The message content can be None, e.g. when the reply carries no text.
    content = (message.content or "").strip()

    try:
        relevance_score = int(content)
    except ValueError:
        # Fall back to the first integer that is not part of a decimal number.
        found = re.search(r"(?<![\d.])(\d+)(?!\d|\.\d)", content)
        relevance_score = int(found.group(1)) if found else None

    # Reject anything outside the scale requested in the prompt.
    if relevance_score is not None and not 1 <= relevance_score <= 10:
        relevance_score = None

    return relevance_score

# Smoke test: grade one hand-written answer and show the returned score.
question = "Where is the Eiffel Tower located?"
reference_answer = "The Eiffel Tower is in Paris."
rag_answer = "The Eiffel Tower is located in Paris, France."

score = evaluate_rag_answer(
    rag_answer=rag_answer,
    reference_answer=reference_answer,
    question=question,
)
print("Relevance and Correctness Score:", score)


ChatCompletionMessage(content='10', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)
Relevance and Correctness Score: 10
