In [1]:
import dotenv
import os
dotenv.load_dotenv()

True

In [2]:
CHAT_MODEL = 'gpt-4.1-mini'
EMBEDDINGS_MODEL = 'text-embedding-3-large'
NEURO_URL = 'https://neuroapi.host/v1'
NEURO_KEY = os.environ.get("NEURO_KEY")
MISTRAL_KEY = os.environ.get("MISTRAL_KEY")
GOOGLE_KEY = os.environ.get("GOOGLE_KEY")

# Creating knowledge bases

In [3]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_mistralai import MistralAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_ollama import OllamaEmbeddings, OllamaLLM, ChatOllama

# embeddings = OpenAIEmbeddings(
#     base_url=NEURO_URL,
#     api_key=NEURO_KEY,
#     model=EMBEDDINGS_MODEL)
# embeddings = MistralAIEmbeddings(
#     api_key=MISTRAL_KEY
# )
# doc_embeddings = GoogleGenerativeAIEmbeddings(
#     api_key=GOOGLE_KEY,
#     model="gemini-embedding-001",
#     task_type="RETRIEVAL_DOCUMENT",
#     request_options={
#         'timeout': 60
#     }
# )
doc_embeddings = OllamaEmbeddings(model='bge-m3')

# llm = ChatOpenAI(
#     model=CHAT_MODEL,
#     base_url=NEURO_URL,
#     api_key=NEURO_KEY)
llm = ChatOllama(model="llama3.1:8b")

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


## Vector database

In [4]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

# client = QdrantClient(url="http://localhost:6333")
# client.delete_collection('python_docs')
# client.create_collection('python_docs', VectorParams(size=1024, distance=Distance.COSINE))

qdrant = QdrantVectorStore.from_existing_collection(
    embedding=doc_embeddings,
    collection_name="python_docs",
    url="http://localhost:6333",
)

In [None]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=800, chunk_overlap=120
# )

# from tqdm import tqdm
# from pathlib import Path
# from langchain_community.document_loaders import TextLoader

# pydocs = Path('./knowledge_data/python_docs/')

# for doc in tqdm(list(pydocs.glob('**/*.txt'))):
#     ld = TextLoader(doc).load_and_split(text_splitter=text_splitter)
#     ids = qdrant.add_documents(ld)

## Graph database

In [None]:
from langchain_neo4j import Neo4jGraph

graph = Neo4jGraph(
    url='bolt://localhost:7687',
    username='neo4j',
    password='complexpassword')

In [None]:
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
llm_transformer = LLMGraphTransformer(llm=llm)

In [None]:
for doc in tqdm(list(pydocs.glob('**/*.txt'))):
    ld = TextLoader(doc).load_and_split(text_splitter=text_splitter)
    graph_documents = llm_transformer.convert_to_graph_documents(ld)
    graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

## Creating tools and agents

In [10]:
from langchain.tools import tool

# For qdrant
@tool(response_format="content_and_artifact")
def retrieve_context_from_vector(query: str):
    """Retrieve information to help answer a query."""
    retrieved_docs = qdrant.similarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\nContent: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

# For neo4j
@tool(response_format="content_and_artifact")
def retrieve_context_from_graph(query: str):
    """Retrieve information to help answer a query."""
    return []
    # retrieved_docs = qdrant.similarity_search(query, k=2)
    # serialized = "\n\n".join(
    #     (f"Source: {doc.metadata}\nContent: {doc.page_content}")
    #     for doc in retrieved_docs
    # )
    # return serialized, retrieved_docs


qdrant_tools = [retrieve_context_from_vector]
neo4j_tools = [retrieve_context_from_graph]

In [11]:
from langchain.agents import create_agent
from langchain_core.prompts import ChatPromptTemplate

prompt = (
    "You have access to a tool that retrieves context from python documentations. "
    "Use this tool to make answers more accurate. "
)

qdrant_agent = create_agent(llm, qdrant_tools, system_prompt=prompt)
neo4j_agent = create_agent(llm, neo4j_tools, system_prompt=prompt)

# Usual agent - simple LLM with prompt, no tools
usual_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that answers questions about Python programming. Provide clear and accurate answers."),
    ("human", "{messages}")
])
usual_agent = usual_prompt | llm

# Loading questions

In [12]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset("Myashka/SO-Python_QA-filtered-2023-tanh_score")
df = ds['train'].to_pandas()

# Simple filtering
col_filter = (df['is_accepted'] == True) \
    & (df['Tags'].str.contains("python"))
df = df[col_filter]
print("Размер всей выборки", len(df))

# Create two selections for manual testing and usual one
manual_check_qas = df.sample(10)
df.drop(manual_check_qas.index)
test_check_qas = df.sample(20)

print("Размер выборки для ручной проверки", len(manual_check_qas))
print("Размер выборки обычной проверки", len(test_check_qas))

Размер всей выборки 17660
Размер выборки для ручной проверки 10
Размер выборки обычной проверки 20


In [13]:
from langchain.messages import HumanMessage

# Function to get answer from an agent
def get_agent_answer(agent, question: str):
    """Get answer from an agent given a question."""
    try:
        response = agent.invoke(
            {"messages": [HumanMessage(question)]}
        )
        
        # Handle different response formats
        if isinstance(response, dict):
            # Agent-based response (qdrant_agent, neo4j_agent)
            if 'output' in response:
                return response['output']
            elif 'messages' in response:
                # Extract from messages
                messages = response['messages']
                if messages and hasattr(messages[-1], 'content'):
                    return messages[-1].content
                return str(messages[-1]) if messages else str(response)
            return str(response)
        elif hasattr(response, 'content'):
            # Chain-based response (usual_agent) - AIMessage
            return response.content
        else:
            # Fallback
            return str(response)
    except Exception as e:
        print(f"Error getting answer for question '{question[:50]}...': {e}")
        return f"Error: {str(e)}"


In [14]:
# Get answers from all agents on selected questions
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Use test_check_qas for evaluation
selected_qas = test_check_qas.copy()

# Initialize results storage
results = []

print("Getting answers from all agents...")
for idx, row in tqdm(selected_qas.iterrows(), total=len(selected_qas), desc="Processing questions"):
    question = f"Title: {row['Title']}\nTags: {row['Tags']}\nQuestion: {row['Question']}"
    ground_truth = row['Answer']
    
    # Get answers from all agents
    qdrant_answer = get_agent_answer(qdrant_agent, question)
    neo4j_answer = ''
    #neo4j_answer = get_agent_answer(neo4j_agent, question)
    usual_answer = get_agent_answer(usual_agent, question)
    
    results.append({
        'question_id': idx,
        'question': question,
        'ground_truth': ground_truth,
        'qdrant_answer': qdrant_answer,
        'neo4j_answer': neo4j_answer,
        'usual_answer': usual_answer
    })

results_df = pd.DataFrame(results)
print(f"\nCompleted processing {len(results_df)} questions")

Getting answers from all agents...


Processing questions: 100%|██████████| 20/20 [1:21:27<00:00, 244.40s/it]


Completed processing 20 questions





In [15]:
results_df.to_csv('cp.csv')

In [21]:
# Calculate cosine distances between agent answers and ground truth
print("Calculating embeddings and cosine distances...")

def safe_embed(embed, q):
    try:
        return embed.embed_query(q)
    except:
        return None

# Get embeddings for all answers
ground_truth_embeddings = [safe_embed(doc_embeddings, i) for i in tqdm(results_df['ground_truth'].tolist(), desc="GT")]
qdrant_embeddings = [safe_embed(doc_embeddings, i) for i in tqdm(results_df['qdrant_answer'].tolist(), desc="QA")]
neo4j_embeddings = [safe_embed(doc_embeddings, i) for i in tqdm(results_df['neo4j_answer'].tolist(), desc="NA")]
usual_embeddings = [safe_embed(doc_embeddings, i) for i in tqdm(results_df['usual_answer'].tolist(), desc="UA")]

idxs = []
for i in range(len(usual_embeddings)):
    if ground_truth_embeddings[i] == None or usual_embeddings[i] == None or neo4j_embeddings[i] == None or qdrant_embeddings[i] == None:
        idxs.append(i)

for i in idxs[::-1]:
    print(i)
    ground_truth_embeddings.pop(i)
    usual_embeddings.pop(i)
    neo4j_embeddings.pop(i)
    qdrant_embeddings.pop(i)

Calculating embeddings and cosine distances...


GT: 100%|██████████| 20/20 [00:03<00:00,  5.72it/s]
QA: 100%|██████████| 20/20 [00:06<00:00,  2.96it/s]
NA: 100%|██████████| 20/20 [00:02<00:00,  9.33it/s]
UA: 100%|██████████| 20/20 [00:09<00:00,  2.15it/s]


In [22]:

# Convert to numpy arrays
ground_truth_embeddings = np.array(ground_truth_embeddings)
qdrant_embeddings = np.array(qdrant_embeddings)
neo4j_embeddings = np.array(neo4j_embeddings)
usual_embeddings = np.array(usual_embeddings)

# Calculate cosine similarity (1 - cosine_distance)
qdrant_similarities = [cosine_similarity([gt], [qd])[0][0] 
                       for gt, qd in zip(ground_truth_embeddings, qdrant_embeddings)]
neo4j_similarities = [cosine_similarity([gt], [ne])[0][0] 
                      for gt, ne in zip(ground_truth_embeddings, neo4j_embeddings)]
usual_similarities = [cosine_similarity([gt], [us])[0][0] 
                      for gt, us in zip(ground_truth_embeddings, usual_embeddings)]

sim_results_df = pd.DataFrame()

# Calculate cosine distances (1 - similarity)
sim_results_df['qdrant_cosine_distance'] = [1 - sim for sim in qdrant_similarities]
sim_results_df['neo4j_cosine_distance'] = [1 - sim for sim in neo4j_similarities]
sim_results_df['usual_cosine_distance'] = [1 - sim for sim in usual_similarities]

# Also store similarities for reference
sim_results_df['qdrant_cosine_similarity'] = qdrant_similarities
sim_results_df['neo4j_cosine_similarity'] = neo4j_similarities
sim_results_df['usual_cosine_similarity'] = usual_similarities

print("Cosine distances calculated!")


Cosine distances calculated!


In [23]:
sim_results_df

Unnamed: 0,qdrant_cosine_distance,neo4j_cosine_distance,usual_cosine_distance,qdrant_cosine_similarity,neo4j_cosine_similarity,usual_cosine_similarity
0,0.286308,0.517885,0.609676,0.713692,0.482115,0.390324
1,0.661174,0.533499,0.650804,0.338826,0.466501,0.349196
2,0.581805,0.567655,0.732911,0.418195,0.432345,0.267089
3,0.292859,0.548787,0.557285,0.707141,0.451213,0.442715
4,0.525476,0.582131,0.501382,0.474524,0.417869,0.498618
5,0.279517,0.490104,0.670988,0.720483,0.509896,0.329012
6,0.560865,0.577275,0.666225,0.439135,0.422725,0.333775
7,0.270329,0.5444,0.297613,0.729671,0.4556,0.702387
8,0.248359,0.561657,0.53298,0.751641,0.438343,0.46702
9,0.330385,0.579501,0.393557,0.669615,0.420499,0.606443


In [None]:
# Display comparison results
print("\n=== Comparison Results ===")

print(f"\nAverage Cosine Similarities:")
print(f"Qdrant Agent: {sim_results_df['qdrant_cosine_similarity'].mean():.4f}")
print(f"Usual Agent: {sim_results_df['usual_cosine_similarity'].mean():.4f}")

print(f"\nMedian Cosine Distances:")
print(f"Qdrant Agent: {sim_results_df['qdrant_cosine_distance'].median():.4f}")
print(f"Usual Agent: {sim_results_df['usual_cosine_distance'].median():.4f}")



=== Comparison Results ===

Average Cosine Similarities (higher is better):
Qdrant Agent: 0.5935
Usual Agent: 0.4520

Median Cosine Distances:
Qdrant Agent: 0.3337
Usual Agent: 0.5644


In [28]:
# Optional: Save results to CSV
sim_results_df.to_csv('agent_comparison_results.csv', index=False)
print("Results saved to 'agent_comparison_results.csv'")


Results saved to 'agent_comparison_results.csv'
