In [5]:
#!pip install langchain-community
#!pip install --upgrade langchain
#!pip show langchain-community


In [6]:
#Import necessary libraries
import os
import openai
import sys
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

#Using PyPDF
from langchain.document_loaders import PyPDFLoader

# PDF files that make up the document corpus
pdf_paths = [
    "AI business model innovation.pdf",
    "BI approaches.pdf",
    "Time-Series-Data-Prediction-using-IoT-and-Machine-Le_2020_Procedia-Computer-.pdf",
    "Walmarts sales data analysis.pdf",
]

# Run each PDF through PyPDFLoader and flatten the results into one list
all_docs = [
    document
    for pdf_path in pdf_paths
    for document in PyPDFLoader(pdf_path).load()
]

# Break the PDF documents into overlapping chunks (1000 chars, 200 overlap)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
split_docs = text_splitter.split_documents(all_docs)

# Read the sales CSV into a DataFrame and show its first rows as the cell output
Sales_data_csv = pd.read_csv('sales_data.csv')
Sales_data_csv.head()



Unnamed: 0,Date,Product,Region,Sales,Customer_Age,Customer_Gender,Customer_Satisfaction
0,2022-01-01,Widget C,South,786,26,Male,2.874407
1,2022-01-02,Widget D,East,850,29,Male,3.365205
2,2022-01-03,Widget A,North,871,40,Female,4.547364
3,2022-01-04,Widget C,South,464,31,Male,4.55542
4,2022-01-05,Widget C,South,262,50,Female,3.982935


In [7]:
import faiss

# Load the sales CSV as LangChain documents so it can be indexed next to the PDF chunks
loader = CSVLoader(file_path='sales_data.csv')
docs = loader.load_and_split()

# One embeddings client is shared by the index-size probe and the vector store
# (previously three separate OpenAIEmbeddings instances were created).
embeddings = OpenAIEmbeddings()

# Size the L2 index from the length of a probe embedding so the dimension
# always matches whatever model `embeddings` is configured with.
index = faiss.IndexFlatL2(len(embeddings.embed_query(" ")))
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

# Populate the store built above. Previously `vector_store` was reassigned to
# FAISS.from_documents(...) at this point, which silently discarded the
# explicitly constructed store and the embedding call spent sizing its index.
vector_store.add_documents(documents=split_docs)

# Add the CSV documents to the same store
vector_store.add_documents(documents=docs)

#Create the retrieval chain
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain_core.runnables import RunnableLambda

# Expose the vector store through the retriever interface consumed by the chains
retriever = vector_store.as_retriever()

# Instructions for a concise, context-grounded QA assistant; "{context}" is the
# template slot for the retrieved documents.
# NOTE(review): neither `system_prompt` nor `prompt` is referenced by the chain
# assembled later in the visible cells (it builds its own prompts) — confirm
# whether these are still needed.
system_prompt = (
    "You are an assistant for question-answering tasks. Use the following "
    "pieces of retrieved context to answer the question. If you don't know "
    "the answer, say that you don't know. Use three sentences maximum and "
    "keep the answer concise.\n\n{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Chat model shared by the retrieval and answer-generation steps
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")


# 3. Create a history-aware retriever
# This prompt is what the LLM sees when turning the conversation so far into a
# search query. It must ask for a reformulated question: with a generic
# "helpful assistant" instruction the model answers the question instead, and
# that answer would then be sent to the retriever as the query.
history_aware_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "Given a chat history and the latest user question which might "
        "reference context in the chat history, formulate a standalone "
        "question which can be understood without the chat history. "
        "Do NOT answer the question, just reformulate it if needed and "
        "otherwise return it as is."
    ),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=history_aware_prompt
)

# 4. Create the document QA chain (combine retrieved docs + question)
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "Use the context to answer the question."),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
    ("system", "Context:\n{context}")
])
combine_docs_chain = create_stuff_documents_chain(llm, qa_prompt)

# 5. Create the final retrieval chain and expose its answer under the
#    "output" key, which is what the cells below read from each response
retrieval_chain = create_retrieval_chain(history_aware_retriever, combine_docs_chain)
wrapped_chain = retrieval_chain | RunnableLambda(lambda x: {"output": x["answer"]})

# 6. Memory wrapper to handle chat history
message_histories = {}  # session_id -> InMemoryChatMessageHistory


def get_session_history(session_id):
    """Return the chat history for `session_id`, creating it on first use.

    The same history object is handed back on every call for a given session.
    Returning a brand-new InMemoryChatMessageHistory each time (as the previous
    lambda did) meant every invocation started with an empty history, so the
    chain never saw earlier turns.
    """
    if session_id not in message_histories:
        message_histories[session_id] = InMemoryChatMessageHistory()
    return message_histories[session_id]


chain_with_memory = RunnableWithMessageHistory(
    wrapped_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history"
)

# 7. Simulate a conversation: every question below is sent under one session id
session_id = "user-123"


def _ask(question):
    """Invoke `chain_with_memory` with `question` under the shared `session_id`."""
    return chain_with_memory.invoke(
        {"input": question},
        config={"configurable": {"session_id": session_id}},
    )


# Questions about the CSV data
response1 = _ask("Analyze the sales performance of January 2022?")
print("Answer 1:", response1["output"])

response2 = _ask("Create Customer Segmentation by Region?")
print("Answer 2:", response2["output"])

response3 = _ask("Create Statistical Measures for this data?")
print("Answer 3:", response3["output"])

# Question about the PDF documents
response4 = _ask("What Are The Technology Implementation of walmart?")
print("Answer 4:", response4["output"])

Answer 1: In January 2022, the sales performance varied across different products and regions. Here is a summary of the sales data for the month:

1. Widget A:
   - Sales: 871 (North)
   - Sales: 936 (South)

2. Widget B:
   - Sales: 542 (North)

3. Widget C:
   - Sales: 422 (North)

Based on the data provided, Widget A had the highest sales in January, with 936 units sold in the South region. Widget B and Widget C had lower sales compared to Widget A. The North region had sales for all three products, with Widget A having the highest sales in that region.

Overall, the sales performance in January 2022 was relatively good, with Widget A being the top-selling product, especially in the South region.
Answer 2: To create customer segmentation by region, we can analyze the data provided for the North region. Based on the information given, we can segment customers in the North region by their characteristics such as age, gender, sales, and satisfaction level. Here is a breakdown of the cu

In [None]:
#QA Evaluation for model performance and accuracy
from langchain.evaluation.qa import QAEvalChain

# Step 1: Instantiate the evaluation chain (the same LLM acts as the grader)
qa_eval_chain = QAEvalChain.from_llm(llm)

# Step 2: Define ground truth examples
examples = [
    {
        "query": "Which was the best-selling product in January 2022 ?",
        "answer": "In January 2022, Widget A was the best-selling product"
    },
    {
        "query": "What is the Median Customer Age in the given data?",
        "answer": "Median Customer Age is 40.5"
    },
    {
        "query": "What is the range of sales?",
        "answer": "Range of Sales is 569"
    }
]

# Step 3: Collect the model's predicted responses
# Each prediction must answer the very query it is graded against. Previously
# the predictions were response1..response3, which answer different questions
# than the ones listed in `examples`, so the grader compared unrelated
# question/answer pairs.
predictions = [
    {
        "result": chain_with_memory.invoke(
            {"input": example["query"]},
            # A distinct session per question keeps the evaluation questions
            # separate from each other and from the "user-123" conversation.
            config={"configurable": {"session_id": f"qa-eval-{i}"}},
        )["output"]
    }
    for i, example in enumerate(examples)
]

# Step 4: Run evaluation
eval_results = qa_eval_chain.evaluate(
    examples=examples,
    predictions=predictions,
    question_key="query",
    answer_key="answer",
    prediction_key="result"
)

# Step 5: Display evaluation results
for i, (example, prediction, result) in enumerate(zip(examples, predictions, eval_results)):
    print(f"\n[Q{i+1}] {example['query']}")
    print(f"Prediction: {prediction['result']}")
    print(f"Expected: {example['answer']}")
    print(f"Feedback: {result['results']}")
