In [34]:
import getpass
import os
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain.chains import RetrievalQA
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter




In [35]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# The model and embedding objects in the following cells are constructed without an
# explicit key, so the key has to be present in the environment. If it is missing,
# ask for it interactively instead of letting a later cell fail with an unrelated error.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
if not OPENAI_API_KEY:
    raise ValueError(
        "OPENAI_API_KEY is empty; add it to your .env file or enter it when prompted."
    )

# NOTE: os.getenv() only *reads* the environment. The "true" fallback applies to this
# Python variable alone; it does not export LANGCHAIN_TRACING_V2 to the environment.
LANGCHAIN_TRACING_V2 = os.getenv("LANGCHAIN_TRACING_V2", "true")


In [36]:
# Chat model used both for rewriting follow-up questions and for generating answers.
LLM_MODEL_NAME = "gpt-4o-mini"
llm = ChatOpenAI(model=LLM_MODEL_NAME)


In [37]:
# Use the same split packages as the import cell at the top of the notebook;
# the `langchain.document_loaders` / `langchain.text_splitter` paths are deprecated aliases.
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Load the markdown files from the directory (markdown_folder) matching the glob.
loader = DirectoryLoader("markdown_folder", glob="**/*.md")
docs = loader.load()

# Surface an empty load right away; otherwise the only symptom is poor retrieval later.
if not docs:
    print("Warning: no markdown files were loaded from 'markdown_folder'.")

# Split the documents into overlapping chunks with the RecursiveCharacterTextSplitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Path where the ChromaDB collection is persisted.
persist_directory = './db'





In [38]:
# Open (or create) the persisted Chroma collection, embedding with OpenAI.
vectorstore = Chroma(
    collection_name="langchain",
    persist_directory=persist_directory,
    embedding_function=OpenAIEmbeddings(),
)

# Opening the collection does not index anything by itself. When the collection is
# empty (e.g. a fresh checkout with no ./db yet), add the chunks prepared above so
# the notebook works from a clean state. An already-populated collection is reused
# unchanged, which avoids duplicate entries and repeated embedding calls on re-runs.
# NOTE(review): to rebuild the index after the markdown files change, remove the
# persist directory first; confirm this matches the intended workflow.
if splits and not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(splits)


In [39]:
# Expose the vector store through the retriever interface (plain similarity search,
# spelled out here; it is the library default).
retriever = vectorstore.as_retriever(search_type="similarity")

In [40]:
# 2. Incorporate the retriever into a question-answering chain.
system_prompt = (
    "You are an assistant for question-answering tasks for the materials project platform. "
    "Use the following pieces of retrieved context and other external resources to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use four sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)



In [41]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# Instruction for the rewriting step: turn a follow-up question into one that can be
# understood without the earlier conversation, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might reference "
    "context in the chat history, formulate a standalone question which can be "
    "understood without the chat history. Do NOT answer the question, just "
    "reformulate it if needed and otherwise return it as is."
)

# Message order: instruction, prior conversation, then the newest user question.
contextualize_q_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(contextualize_q_messages)

# Retriever that runs the rewriting prompt through the LLM before searching.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [42]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Answering prompt: system instructions (with the {context} slot), the conversation
# so far, and the current user question.
qa_messages = [
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

# Chain that passes the retrieved documents and the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
from langchain_core.messages import AIMessage, HumanMessage

# Two-turn smoke test: the second question is asked with the first exchange as history.
chat_history = []

# Turn 1: ask, then record both sides of the exchange in the history.
question = "Give me  a step on how to download data here?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history += [
    HumanMessage(content=question),
    AIMessage(content=ai_msg_1["answer"]),
]

# Turn 2: follow-up question; this exchange is not appended to the history.
second_question = "How do i get the api key?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

# Show both answers in the order they were asked.
for reply in (ai_msg_1, ai_msg_2):
    print(reply["answer"])

TypeError: 'generator' object is not subscriptable

In [43]:
from langchain_core.messages import AIMessage, HumanMessage

# Conversation state carried across all questions below; reset on every run of this cell.
chat_history = []
questions = [
    "What types of materials data are available on the Materials Project, and how can I access them?",
    "Can you explain the different formats in which the data is provided, and how I can convert them for use in machine learning models?",
    "Are there any recommended machine learning frameworks or libraries that work well with the datasets provided?",
    "How frequently is the materials data updated, and what is the process for submitting new materials data to the project?",
    "Can you provide examples of successful machine learning projects that have utilized data from the Materials Project?",
    "What specific properties of materials are most commonly predicted using machine learning techniques?",
    "Is there any guidance available on feature engineering for the datasets provided by the Materials Project?",
    "Are there any limitations or considerations I should be aware of when using this data for machine learning applications?",
    "Does the Materials Project offer any tutorials or resources for beginners looking to apply AI and ML in materials science?",
    "How can I get involved with community discussions or collaborations related to AI applications in materials science through the Materials Project?"
]

# Ask each question in order and write the Q/A pairs to a text file.
# The encoding is explicit because model answers can contain non-ASCII characters,
# which would fail to encode on platforms whose default encoding is not UTF-8.
with open("questions_answers.txt", "w", encoding="utf-8") as f:
    for question in questions:
        ai_msg = rag_chain.invoke({"input": question, "chat_history": chat_history})
        answer = ai_msg["answer"]

        # Record the exchange so later questions are answered with this context.
        chat_history.extend(
            [
                HumanMessage(content=question),
                AIMessage(content=answer),
            ]
        )

        # Write the question and answer to the file
        f.write(f"Q: {question}\nA: {answer}\n\n")

print("Questions and answers have been saved to questions_answers.txt")


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


Questions and answers have been saved to questions_answers.txt


### Checking the web one