# Imports

# 1. Tools Section

In [351]:
# 1. Search and Store Paper

import arxiv
import os
import datetime

# Second batch of imports
import arxiv
import os
import datetime
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Neo4jVector
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter
from neo4j import GraphDatabase
from dotenv import load_dotenv

# Third batch of imports
import os
# Setting up google api
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
# from crewai_tools import tool
# from langchain.tools import tool
from crewai.tools import tool

# Setting up calendar
import datetime
import pytz

# Mail format handling
import base64
import json
# Setting up the Gmail API

In [352]:
def download_arxiv_papers(topic: str) -> list:
    """
    Download recent arXiv papers on a topic and return their metadata.

    Queries arXiv for up to 10 results ranked by relevance, keeps the ones
    published within the last 5 * 365 days, saves each PDF under
    ``research_papers/`` and collects its metadata.

    Args:
        topic (str): The research topic to search for.

    Returns:
        List[dict]: One dictionary per downloaded paper with the keys
        ``title``, ``authors`` (list of author names), ``summary``,
        ``publish_date`` (naive datetime) and ``pdf_path``.
    """
    import re  # local import: only needed to sanitise file names

    max_results = 10
    download_folder = "research_papers"
    os.makedirs(download_folder, exist_ok=True)

    # arXiv timestamps are UTC, so build the cut-off from the current UTC time
    # (not local time) and drop tzinfo so both sides of the comparison are naive.
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    start_date = utc_now - datetime.timedelta(days=5 * 365)

    # Construct the client and search query
    client = arxiv.Client()
    search = arxiv.Search(
        query=topic,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
        sort_order=arxiv.SortOrder.Descending
    )

    papers_data = []
    for result in client.results(search):
        # Convert result.published to a naive datetime for comparison
        published_date = result.published.replace(tzinfo=None)

        # Skip anything older than the five-year window
        if published_date < start_date:
            continue

        # Titles routinely contain characters (':', '?', '"', '\\', ...) that
        # are illegal in file names on some platforms; replace them, cap the
        # length, and fall back to a fixed name if nothing usable is left.
        safe_title = re.sub(r'[\\/:*?"<>|\s]', "_", result.title)[:150] or "untitled"
        pdf_path = os.path.join(download_folder, f"{safe_title}.pdf")

        # Download the PDF
        result.download_pdf(filename=pdf_path)

        # Collect metadata
        papers_data.append({
            "title": result.title,
            "authors": [author.name for author in result.authors],
            "summary": result.summary,
            "publish_date": published_date,
            "pdf_path": pdf_path
        })

    return papers_data

In [353]:

from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

# Embedding client for the "nomic-embed-text" model, shared by the helpers below
ollama_emb = OllamaEmbeddings(model="nomic-embed-text")

def generate_embedding(text):
    """Return the embedding vector that ``ollama_emb`` computes for ``text``."""
    return ollama_emb.embed_query(text)

In [354]:
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

# This cell repeats the previous cell: it rebinds the same embedding client
# and redefines the same helper.
ollama_emb = OllamaEmbeddings(model="nomic-embed-text")

def generate_embedding(text):
    """Embed a single piece of text and hand back the resulting vector."""
    return ollama_emb.embed_query(text)

In [355]:
# Read the Neo4j connection settings from the local .env file; override=True
# lets values in that file replace variables already set in the process.
load_dotenv('.env', override=True)
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE')

In [356]:
# Module-wide Neo4j driver used by the database helpers in this notebook;
# it connects to NEO4J_URI with the username/password read from the environment.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

# papers_data = download_arxiv_papers("Superconductors")

In [357]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [358]:
def get_text_chunks_from_pdf(pdf_path):
    """
    Load a PDF and break its text into overlapping chunks.

    The text of all pages is joined with single spaces and then split by a
    RecursiveCharacterTextSplitter configured with chunk_size=2000 and
    chunk_overlap=100.

    Args:
        pdf_path (str): Location of the PDF file.

    Returns:
        List[str]: The text chunks produced by the splitter.
    """
    pages = PyMuPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    return splitter.split_text(" ".join(page.page_content for page in pages))

In [359]:
# # Example usage
# pdf_path = "research_papers\Higher-order_topological_superconductor_phases_in_a_multilayer_system.pdf"
# text_chunks = get_text_chunks_from_pdf(pdf_path)

# # Output the text chunks
# for i, chunk in enumerate(text_chunks):
#     print(f"Chunk {i+1}:")
#     print(chunk)
#     print("-" * 80)

In [360]:
from crewai.tools import tool
# @tool("Name of my tool")
# def my_tool1(question: str) -> str:
#     """Clear description for what this tool is useful for, your agent will need this information to use it."""
#     # Function logic here
#     return "Result from your custom tool"

In [361]:
@tool("get papers")
def add_paper_to_neo4j(topic: str)->str:
    """
    Downloads research papers related to the given topic, processes each paper,
    and stores relevant information in a Neo4j graph database.

    Args:
        topic (str): The topic to search for research papers on arXiv.

    Returns:
        str: A confirmation message once the papers have been stored.
    """
    papers_data = download_arxiv_papers(topic)
    with driver.session(database=NEO4J_DATABASE) as session:
        for paper in papers_data:
            title = paper['title']
            summary_text = paper['summary']
            authors = paper['authors']
            publish_date = paper['publish_date']
            pdf_path = paper['pdf_path']
            summary_emb = generate_embedding(summary_text)

            # Key the paper node on its title only and SET the remaining
            # properties. Merging on the whole property map (embedding vector
            # included) creates a second node whenever any value differs, so
            # running the tool again for the same topic duplicated papers.
            session.run(
                """
                MERGE (n:SummaryNode {title: $title})
                SET n.summary = $summary_text,
                    n.embedding = $embedding,
                    n.publish_date = $publish_date,
                    n.pdf_path = $pdf_path
                """,
                summary_text=summary_text,
                embedding=summary_emb,
                title=title,
                publish_date=publish_date,
                pdf_path=pdf_path,
            )

            # Link every author to the paper in a single round-trip
            session.run(
                """
                MATCH (n:SummaryNode {title: $title})
                UNWIND $authors AS author_name
                MERGE (a:Author {name: author_name})
                MERGE (a)-[:WROTE]->(n)
                """,
                title=title,
                authors=authors,
            )

            # Read, split, and add text chunks as TextChunk nodes related to
            # the SummaryNode. MERGE (instead of CREATE) keeps a re-run from
            # adding a second copy of every chunk.
            for text in get_text_chunks_from_pdf(pdf_path):
                text_emb = generate_embedding(text)
                session.run(
                    """
                    MATCH (n:SummaryNode {title: $title})
                    MERGE (t:TextChunk {title: $title, text: $text})
                    SET t.embedding = $text_emb,
                        t.publish_date = $publish_date
                    MERGE (t)-[:BELONGS_TO]->(n)
                    """,
                    text=text,
                    text_emb=text_emb,
                    title=title,
                    publish_date=publish_date
                )

            print("Paper, authors, and text chunks added successfully!")
    return "Data added to neo4j graph database"


In [362]:
@tool('get paper by title')
def get_text_chunk_nodes_by_title_and_year(title:str, year:int)->str:
    """
    Retrieve the stored text of a paper for a given title and publication year from Neo4j.

    Args:
        title (str): The exact title of the paper.
        year (int): The publication year.

    Returns:
        str: The text of every chunk belonging to the matching paper, each
        chunk preceded by a newline. Empty when no paper matches.
    """
    query = """
    MATCH (s:SummaryNode {title: $title})
    WHERE date(s.publish_date).year = $year
    MATCH (t:TextChunk)-[:BELONGS_TO]->(s)
    RETURN t
    """

    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query, title=title, year=year)
        chunk_texts = [record["t"]["text"] for record in result]

    # Single join instead of repeated string concatenation; same output format
    return "".join("\n" + text for text in chunk_texts)

In [363]:
# print(chunk_nodes[0].keys())

In [364]:
# print(chunk_nodes[0]['text'])

In [365]:
# # Example usage
# chunk_nodes = get_text_chunk_nodes_by_title_and_year("Performance Analysis of Superconductor-constriction-Superconductor Transmon Qubits", 2023)
# for node in chunk_nodes:
#     print(node)

In [366]:
# chunk_nodes = get_text_chunk_nodes_by_title_and_year("Performance Analysis of Superconductor-constriction-Superconductor Transmon Qubits", 2023)
# print(chunk_nodes)

In [367]:
from neo4j import GraphDatabase

def get_summary_titles(driver):
    """
    Return the title of every SummaryNode stored in Neo4j.

    Args:
        driver: Neo4j driver used to open the session.

    Returns:
        list: The ``title`` property of each SummaryNode, one entry per node.
    """
    query = "MATCH (s:SummaryNode) RETURN s.title AS title"

    # Target the configured database like the other helpers do; when
    # NEO4J_DATABASE is unset (None) the server's default database is used,
    # which is what the bare session() call did before.
    with driver.session(database=NEO4J_DATABASE) as session:
        return [record["title"] for record in session.run(query)]

In [368]:
@tool('get relevant context')
def get_relevant_context(query_text:str)->str:
    """
    Retrieve the text chunks most relevant to a query, ranked by vector similarity.

    Args:
        query_text (str): The input query text.

    Returns:
        str: The text of the 5 most similar TextChunk nodes, most similar
        first, separated by blank lines. Empty when no chunks are stored.
    """
    top_k = 5
    # Step 1: Generate an embedding for the query text
    query_embedding = generate_embedding(query_text)

    # Step 2: Execute the similarity search in Neo4j
    # (gds.similarity.cosine is provided by the Graph Data Science library)
    query = """
    MATCH (t:TextChunk)
    WITH t, gds.similarity.cosine(t.embedding, $query_embedding) AS similarity
    RETURN t AS node, similarity
    ORDER BY similarity DESC
    LIMIT $top_k
    """

    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query, query_embedding=query_embedding, top_k=top_k)
        chunk_texts = [record["node"]["text"] for record in result]

    # Step 3: Flatten the hits into one block of text for the agent
    return "".join("\n" + text + "\n\n" for text in chunk_texts)



In [369]:
# # Example usage
# query_text = "Explain the working of superconducting qubits"
# relevant_context = get_relevant_context.invoke(query_text)

In [370]:
# print(relevant_context)

In [371]:
from neo4j import GraphDatabase

@tool('get relevant summaries')
def vector_search_summaries(query_text:str) -> str:
    """
    Retrieve the paper summaries most relevant to a query, ranked by vector similarity.

    Args:
        query_text (str): The input query text.

    Returns:
        str: The summaries of the 5 most similar SummaryNode entries, most
        similar first, separated by blank lines. Empty when nothing is stored.
    """
    top_k = 5
    # Step 1: Generate an embedding for the query text
    query_embedding = generate_embedding(query_text)

    # Step 2: Execute the similarity search in Neo4j
    # (gds.similarity.cosine is provided by the Graph Data Science library)
    query = """
    MATCH (s:SummaryNode)  // Assuming 'SummaryNode' is the label for summary nodes
    WITH s, gds.similarity.cosine(s.embedding, $query_embedding) AS similarity
    RETURN s AS node, similarity
    ORDER BY similarity DESC
    LIMIT $top_k
    """

    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query, query_embedding=query_embedding, top_k=top_k)
        summaries = [record["node"]["summary"] for record in result]

    # Step 3: Flatten the hits into one block of text for the agent
    return "".join("\n" + summary + "\n\n" for summary in summaries)

In [372]:
# # Usage example
# relevant_summaries = vector_search_summaries("Example query text")
# for summary in relevant_summaries:
#     print("Node:", summary["node"], "Similarity:", summary["similarity"])

In [373]:
# relevant_summaries = vector_search_summaries("Superconducting Qubits")


In [374]:
# print(relevant_summaries)

In [375]:
# print(relevant_summaries[0]['node'].keys())

# Agents

In [376]:
# Crewai imports
from crewai import Agent
# Imports for LLM
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms.ollama import Ollama

In [377]:
from langchain_groq import ChatGroq
from crewai import LLM

from crewai import LLM

# Read the Groq credential from the environment (for example from the .env
# file loaded earlier in this notebook) instead of embedding it in source.
# A key that has been committed to a file must be treated as leaked: revoke
# it and issue a new one.
# NOTE(review): confirm this model id is still served by Groq.
llm = LLM(
    model="groq/llama-3.1-70b-versatile",
    api_key=os.getenv("GROQ_API_KEY"),
)


In [378]:
# from crewai import LLM, Agent
# llm = LLM(model="ollama/llama3.1", base_url="http://localhost:11434")

In [None]:
# search_agent = Agent(
#     role="You are an expert research paper finder for a given topic",
#     goal="To find the most relevant papers from the user query: {user_query}",
#     backstory="You understand the user query as a pro researcher and find the most relevant research paper for them.",
#     tools=[add_paper_to_neo4j],
#     llm=llm,
#     allow_delegation=False,
#     verbose=True,
# )

In [395]:
from crewai import Agent, Task

# Agent that searches for papers on the user's topic and stores them in Neo4j
# through the `add_paper_to_neo4j` tool. The {user_query} placeholder in `goal`
# is filled from the `inputs` passed to Crew.kickoff().
search_agent = Agent(
    role="You are an expert research paper finder for a given topic. You understand the user's query deeply and locate the most relevant research papers.",
    goal="To find the most relevant papers based on the user's query: {user_query}, retrieve them, and store the information in a Neo4j database.",
    backstory="You are well-versed in academic research, familiar with various research domains, and capable of identifying key papers that would best answer the user's query. You have access to tools for downloading, processing, and storing research papers.",
    tools=[add_paper_to_neo4j],  # Downloads the papers and writes them to Neo4j
    llm=llm,  # The LLM instance created earlier in the notebook
    allow_delegation=False,  # This agent does not hand work to other agents
    verbose=True,  # Log the agent's steps while it runs
)


In [None]:
# database_agent = Agent(
#     role="Your goal is to find the relevant paper for the given topic and year from the database.",
#     goal="""To retrieve the relevant paper for given year and topic from the user query: {user_query}
#     and available topic list: {topics}""",
#     backstory="You are an expert tools user who can find research paper for a given topic and year",
#     tools=[get_text_chunk_nodes_by_title_and_year],
#     llm=llm,
#     allow_delegation=True,
#     verbose=True,  # Added the missing comma here
# )


In [399]:
# Agent that looks up stored papers by title and year. The {topics}
# placeholder in `goal` is filled from the `inputs` passed to Crew.kickoff().
# NOTE(review): query_database_task, which runs this agent, passes its own
# tools list (vector_search_summaries) — confirm which list takes effect.
database_agent = Agent(
    role="You are an expert researcher capable of retrieving relevant papers for a given topic and year from the database.",
    goal="""To find and retrieve the most relevant paper based on the user's query for a specific topic and year, from the available topics list: {topics}. Your task is to match the user query with the correct context and provide accurate paper details from the database.""",
    backstory="You specialize in identifying and retrieving academic research papers based on specific criteria such as topic and year from a structured database. You understand the nuances of academic queries and are adept at handling various search tasks.",
    tools=[get_text_chunk_nodes_by_title_and_year],  # Returns the stored text of a paper matched by title and year
    llm=llm,
    allow_delegation=True,  # This agent may delegate work to other agents
    verbose=True,  # Log the agent's steps while it runs
)


In [None]:
# question_answer_agent = Agent(
#     role="To answer user query for given context",
#     goal="""To answer the user query like an expert researcher from the given context: {context}.
#     and user query: {user_query}""",
#     backstory="You have a long experience of doing scientific research and you are very expert in explaining the research papers",
#     tools=[get_relevant_context],
#     llm=llm,
#     verbose=True,
# )

In [400]:
# Agent that answers questions about the stored papers, using the
# `get_relevant_context` tool to pull matching text chunks from Neo4j.
question_answer_agent = Agent(
    role="You are an expert researcher, skilled in answering queries related to scientific research papers. You can extract insights and explain complex concepts in simple terms.",
    goal="""To answer user queries about research papers in an informed, expert manner. You will leverage the provided context from research papers and the user's query to provide accurate and helpful responses.""",
    backstory="You have extensive experience in scientific research and are well-versed in explaining research papers, methodologies, results, and implications. You excel at making complex research accessible and understandable.",
    tools=[get_relevant_context],  # Vector-similarity lookup over stored text chunks
    llm=llm,
    verbose=True,  # Log the agent's steps while it runs
)


In [None]:
# future_works_agent = Agent(
#     role="To find interesting patterns in given research papers and potential future improvements", 
#     goal="To think of future works that could be possible from the given research papers",
#     backstory="You are an expert researcher who specializes in finding potential future works",
#     tools=[get_relevant_context],
#     llm=llm,
#     verbose=True,
# )

In [401]:
# Agent that proposes follow-up research directions; it has the same
# `get_relevant_context` lookup tool as the question-answering agent.
future_works_agent = Agent(
    role="You are an expert researcher specializing in identifying promising future work directions from existing research papers.",
    goal="""To analyze the given research papers and identify potential future works or improvements. You will highlight unexplored areas and suggest new research directions that could extend the findings of the current papers.""",
    backstory="With deep expertise in research and trends in various academic fields, you excel at identifying gaps in current research and proposing areas where further investigation could lead to groundbreaking advancements.",
    tools=[get_relevant_context],  # Vector-similarity lookup over stored text chunks
    llm=llm,
    verbose=True,  # Log the agent's steps while it runs
)


In [402]:
from crewai import Task, Crew, Process

In [403]:
# search_and_store_task = Task (
#     description="""
#     Search for the most relevant topic for the given user query and store it.
#     """,
#     expected_output="Nothing.",
#     tools = [add_paper_to_neo4j],
#     agent = search_agent,
# )

In [404]:
# First task of the crew: have `search_agent` download papers for the query and
# store them in Neo4j.
# NOTE(review): `verbose`, `max_retries` and `retry_delay` are passed to Task
# here — confirm that the installed CrewAI version actually honours them.
search_and_store_task = Task(
    description="""Search for the most relevant research papers based on the given user query. Process these papers (download, generate embeddings, and store the paper details and chunks in Neo4j).""",
    expected_output="The task will download and store research papers in Neo4j. A message indicating the completion of the task will be logged.",
    tools=[add_paper_to_neo4j],  # Downloads, embeds and stores the papers
    agent=search_agent,  # Agent that carries out this task
    verbose=True,  # Intended to enable progress logging
    max_retries=3,  # Intended number of retries on failure
    retry_delay=10  # Intended delay in seconds between retries
)

In [None]:
# q_and_a_task = Task(
#     description="""Given the user query do retrieve the relevant information and responde like a pro researcher""",
#     tools=[get_relevant_context],
#     expected_output="Concise response of user query",
#     agent=question_answer_agent,
# )

In [405]:
# Question-answering task. The description refers to "the user's query", so it
# has to contain the {user_query} placeholder: CrewAI fills placeholders from
# the kickoff inputs, and without one the query text never reaches this task.
# (search_agent's goal already uses the same placeholder, so the crew's
# kickoff inputs must supply `user_query` anyway.)
q_and_a_task = Task(
    description="""Given the user's query, retrieve relevant information from the context and respond in a clear, concise, and expert-level manner. Your response should be well-informed and precise, reflecting the expertise of a researcher.

User query: {user_query}""",
    tools=[get_relevant_context],  # Tool used to fetch the relevant context for answering the query
    expected_output="A well-reasoned and concise response to the user's query, demonstrating expertise in the subject matter.",
    agent=question_answer_agent,  # Agent that carries out this task
    verbose=True,  # Verbose output for task tracking
)


In [None]:
# query_database_task = Task(
#     description="""Find the relevant information and answer the user with the best response possible from the retrieved context""",
#     expected_output="A response like an expert researcher",
#     tools=[vector_search_summaries],
#     agent=database_agent,
# )

In [406]:
# Database lookup task. The description refers to "the user's query", so it
# has to contain the {user_query} placeholder: CrewAI fills placeholders from
# the kickoff inputs, and without one the query text never reaches this task.
# (search_agent's goal already uses the same placeholder, so the crew's
# kickoff inputs must supply `user_query` anyway.)
query_database_task = Task(
    description="""Search the database for the most relevant information based on the user's query. Retrieve the best matching papers and respond with an expert-level answer, ensuring that the response is rooted in solid, relevant research.

User query: {user_query}""",
    expected_output="An expert-level response, based on the most relevant research found in the database, addressing the user query accurately.",
    tools=[vector_search_summaries],  # Tool used for searching the summaries in the database
    agent=database_agent,  # Agent that carries out this task
    verbose=True,  # Detailed output for tracking the task's progress
)


In [None]:
# future_works_task = Task(
#     description="""Given the provided research papers and their context, analyze the findings and propose potential future research directions. Your goal is to identify gaps, unexplored areas, and opportunities for further work. You should suggest areas that could build upon or improve the existing research and may lead to significant advancements in the field.""",
#     expected_output="""A list of insightful and well-defined future research directions. These should include areas of improvement, gaps in current research, and potential new studies that could be valuable to the academic community.""",
#     tools=[get_relevant_context],  # Tool used to extract context from the research papers
#     agent=future_works_agent,  # Assign the agent responsible for identifying future works
#     verbose=True,  # Detailed logging to track the agent's reasoning process
# )


In [407]:
# Last task of the crew: have `future_works_agent` suggest follow-up research
# directions, using `get_relevant_context` to look up stored paper text.
future_works_task = Task(
    description="""Given the provided research papers and their context, analyze the findings and propose potential future research directions. Your goal is to identify gaps, unexplored areas, and opportunities for further work. You should suggest areas that could build upon or improve the existing research and may lead to significant advancements in the field.""",
    expected_output="""A list of insightful and well-defined future research directions. These should include areas of improvement, gaps in current research, and potential new studies that could be valuable to the academic community.""",
    tools=[get_relevant_context],  # Vector-similarity lookup over stored text chunks
    agent=future_works_agent,  # Agent that carries out this task
    verbose=True,  # Intended to enable detailed logging
)


In [408]:
from crewai import Crew, Process

## 1. Storage Crew

In [None]:

# storage_crew = Crew(
#     agents=[search_agent],
#     tasks=[search_and_store_task],
#     process=Process.sequential,
# )



In [None]:
# result = storage_crew.kickoff(inputs={'user_query':'Superconductors'})

[1m[95m# Agent:[00m [1m[92mYou are an expert research paper finder for a given topic. You understand the user's query deeply and locate the most relevant research papers.[00m
[95m## Task:[00m [92mSearch for the most relevant research papers based on the given user query. Process these papers (download, generate embeddings, and store the paper details and chunks in Neo4j).[00m
Paper, authors, and text chunks added successfully!
Paper, authors, and text chunks added successfully!


[1m[95m# Agent:[00m [1m[92mYou are an expert research paper finder for a given topic. You understand the user's query deeply and locate the most relevant research papers.[00m
[95m## Thought:[00m [92mTo successfully complete the task and find the most relevant research papers based on the given user query, "Superconductors," we need to download research papers, process each paper, and store the relevant information in a Neo4j graph database.[00m
[95m## Using tool:[00m [92mget papers[00m


In [None]:
# my_crew = Crew(
#     agents=[search_agent, database_agent, question_answer_agent, future_works_agent],
#     tasks=[search_and_store_task, query_database_task, q_and_a_task, future_works_task],
#     process=Process.sequential,
# )

In [409]:
# Full pipeline: store papers, query the database, answer the question, then
# propose future work. Process.sequential runs the tasks in the listed order.
my_crew = Crew(
    agents=[search_agent, database_agent, question_answer_agent, future_works_agent],
    tasks=[search_and_store_task, query_database_task, q_and_a_task, future_works_task],
    process=Process.sequential,
)



In [None]:
# Run the crew. `user_query` and `topics` fill the {user_query} and {topics}
# placeholders used in the agent prompts (search_agent and database_agent).
result = my_crew.kickoff(inputs={'user_query':'What is currently going on in the field of superconductors?','topics':'superconductors'})

[1m[95m# Agent:[00m [1m[92mYou are an expert research paper finder for a given topic. You understand the user's query deeply and locate the most relevant research papers.[00m
[95m## Task:[00m [92mSearch for the most relevant research papers based on the given user query. Process these papers (download, generate embeddings, and store the paper details and chunks in Neo4j).[00m
Paper, authors, and text chunks added successfully!
Paper, authors, and text chunks added successfully!


[1m[95m# Agent:[00m [1m[92mYou are an expert research paper finder for a given topic. You understand the user's query deeply and locate the most relevant research papers.[00m
[95m## Thought:[00m [92mThought: In order to find the most relevant research papers based on the user's query, we first need to identify the topic of interest and then use the 'get papers' tool to retrieve the relevant papers.[00m
[95m## Using tool:[00m [92mget papers[00m
[95m## Tool Input:[00m [92m
"{\"topic\":