# In [32]: (Jupyter notebook cell marker, kept as a comment so the file remains valid Python)
from dotenv import load_dotenv
import os
import requests
import tempfile
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Pull settings from the local .env file into the process environment
load_dotenv()

# GitHub repository that holds the markdown documentation
repo_owner = "ashwinknan"
repo_name = "testterra"
branch_name = "main"  # or the branch you're using

# Credentials read from the environment (None when a variable is not set)
github_token = os.environ.get("GITHUB_TOKEN")  # GitHub token
# NOTE: openai_api_key is read here but not passed explicitly to ChatOpenAI below.
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Chat model used to answer questions
llm = ChatOpenAI(model="gpt-3.5-turbo")

# Function to fetch markdown files from GitHub
def fetch_markdown_files_from_github(repo_owner, repo_name, branch_name, token):
    """Download the text of every ``.md`` file in the root of a GitHub repository.

    The repository root is listed through the GitHub "contents" API and each
    entry whose name ends in ``.md`` is downloaded via its ``download_url``.
    The function is best-effort: failures are printed and skipped rather than
    raised.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        branch_name: Branch (or other ref) whose root directory is listed.
        token: GitHub access token. When falsy, no Authorization header is
            sent, so the request is made unauthenticated instead of with the
            literal credential "token None".

    Returns:
        A list of strings, one per markdown file that was fetched
        successfully. An empty list is returned when the listing request
        fails or does not produce a JSON list.
    """
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents?ref={branch_name}"
    headers = {'Authorization': f'token {token}'} if token else {}
    # Without a timeout, requests can block forever on a stalled connection.
    timeout_seconds = 30

    try:
        response = requests.get(api_url, headers=headers, timeout=timeout_seconds)
        response.raise_for_status()
        # Parsed inside the try so a malformed body is reported, not raised.
        file_list = response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return []
    except Exception as err:
        print(f"Other error occurred: {err}")
        return []

    # A directory listing is a JSON list; anything else cannot be iterated
    # as a sequence of file entries.
    if not isinstance(file_list, list):
        print(f"Other error occurred: unexpected response shape from {api_url}")
        return []

    markdown_files = []
    for file in file_list:
        if not file.get('name', '').endswith('.md'):
            continue

        file_url = file.get('download_url')
        if not file_url:
            # Entries without a download URL (e.g. directories) have no content to fetch.
            continue

        try:
            file_response = requests.get(file_url, headers=headers, timeout=timeout_seconds)
            # Without this check an error page body would be stored as if it
            # were the markdown document.
            file_response.raise_for_status()
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred while fetching file {file_url}: {http_err}")
        except Exception as err:
            print(f"Other error occurred while fetching file {file_url}: {err}")
        else:
            markdown_files.append(file_response.text)

    return markdown_files

# Load and process markdown files from GitHub
def load_markdown_files():
    """Fetch the repository's markdown files and parse them into documents.

    Each markdown string returned by ``fetch_markdown_files_from_github`` is
    written to a temporary ``.md`` file, parsed with
    ``UnstructuredMarkdownLoader``, and the resulting documents are collected.
    The temporary file is always removed, even when parsing fails.

    Returns:
        A list with the documents produced by the loader for every fetched
        file; empty when nothing was fetched.
    """
    all_documents = []
    markdown_files = fetch_markdown_files_from_github(repo_owner, repo_name, branch_name, github_token)

    for file_content in markdown_files:
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".md")
        temp_file_path = temp_file.name
        try:
            # Close the handle before loading: the write is buffered, so the
            # loader could otherwise read an empty or truncated file (and on
            # Windows could not reopen a file that is still open).
            with temp_file:
                temp_file.write(file_content.encode('utf-8'))

            # Load the markdown file
            documents = UnstructuredMarkdownLoader(temp_file_path).load()
        finally:
            # delete=False means the file must be removed explicitly.
            os.remove(temp_file_path)

        # NOTE(review): this records the temporary file's base name, not the
        # original GitHub file name, because only file contents are available
        # here. Confirm whether the original name should be propagated.
        source_name = os.path.basename(temp_file_path)
        for doc in documents:
            doc.metadata['source'] = source_name

        all_documents.extend(documents)

    return all_documents

# Clear and reload documents
all_documents = load_markdown_files()

# Ensure we have loaded documents. An explicit raise is used instead of
# `assert` because assertions are removed when Python runs with -O.
if not all_documents:
    raise RuntimeError(
        "No markdown documents were loaded from GitHub; "
        "check the repository settings and GITHUB_TOKEN."
    )

# Split the documents into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000, chunk_overlap=500, add_start_index=True
)
all_splits = text_splitter.split_documents(all_documents)

# Create a new vector store from the documents
# NOTE(review): no collection name or persist directory is given; when this
# cell is re-run in the same kernel, confirm that chunks are not being added
# to an already-populated default collection (which would return duplicates).
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

# Create a retriever from the vector store (top 6 chunks by similarity)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Define the refined system prompt.
# Template variables: {context} (filled by the documents chain with the
# retrieved chunks) and {sources} (must be supplied by the caller).
system_prompt = (
    "You are an assistant for question-answering tasks related to game development using the Terra Creator Studio engine. "
    "Your responsibilities include providing complete T# function implementations when requested, ensuring the code is self-contained and ready for use. "
    "Prioritize modifying existing wrapper functions or combining them over creating new ones unless necessary. "
    "Most syntax is similar to C#, but be aware of differences, especially regarding access wrappers and methods. "
    "Refer to the 'T# Don'ts' section to avoid common pitfalls. "
    "If a script is requested, provide complete T# function code that can be directly copied into Terra Studio. "
    "Ensure consistency in responses; similar questions should yield similar answers. "
    "Always refer to the provided context below and search relevant portions for feature-related questions. "
    "Double-check the context document for accuracy, as verifying information is more important than speed. "
    "If you cannot find an answer in the T# documentation, state that you don't know. "
    "Your answers should be clear, concise, and suitable for novice developers. "
    "Always include the source of the information used in your response, and ensure the sources are accurate. "
    "If you reference a source, please specify its location."
    "\n\n{context}\n\nSources:\n{sources}"
)

# Create the chat prompt template.
# The optional "chat_history" placeholder is where earlier turns of the
# conversation are inserted; without it, history passed to the chain under
# that key would never reach the model. Being optional, the prompt still
# works when no history is supplied.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
    ]
)

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# In-memory registry of chat histories, keyed by session id
store = {}
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    A new, empty ``ChatMessageHistory`` is registered in ``store`` the first
    time a session id is seen; later calls return that same object.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history

# Wrap the RAG chain with message history management.
# The wrapper looks up the session's history via get_session_history, passes it
# to the chain under "chat_history", and after the call records the "input"
# text and the "answer" output in that history. No manual bookkeeping of the
# history is needed (or safe) outside the wrapper.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

session_id = "abc1112"  # Unique session ID for the conversation

# Test the retrieval-augmented generation chain
input_question = "Isn't there a StudioHaptics wrapper present. Check please"

# Retrieved here only to build the human-readable "sources" listing; the chain
# performs its own retrieval to fill the prompt's {context}.
retrieved_docs = retriever.invoke(input_question)

# Check if any documents were retrieved
if not retrieved_docs:
    print("No documents retrieved. Please check your query.")
else:
    # Create a more informative sources output: source name plus the first
    # 300 characters of each chunk. `.get` avoids a KeyError for a document
    # that carries no 'source' metadata.
    sources = "\n".join(
        f"- Source: {doc.metadata.get('source', 'unknown')}\n  Content: {doc.page_content[:300]}..."
        for doc in retrieved_docs
    )

    # "context" is not passed: the retrieval chain assigns it from its own
    # retrieval step, so a caller-supplied value would be overwritten.
    response = conversational_rag_chain.invoke(
        {"input": input_question, "sources": sources},
        config={"configurable": {"session_id": session_id}}
    )

    # The question and answer are already stored in the session history by
    # RunnableWithMessageHistory; appending raw dicts to `messages` would mix
    # non-message objects into the history and break later turns.
    print(response["answer"])

