# RAG based on Ollama framework

<a href="https://colab.research.google.com/github/cbadenes/semantic-report-search/blob/main/data/analysis/45_RAG_ollama.ipynb" target="_parent">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
</a>

In [None]:
# STEP 1: Install required packages
# !pip install langchain langchain_community langchain_ollama sentence_transformers chromadb pandas openpyxl tiktoken langchain_huggingface

In [None]:
from IPython.display import Markdown, display# STEP 2: Load the Excel file and select the "Views" sheet
import pandas as pd

df = pd.read_excel("../raw/Reporting_Inventory.xlsx", sheet_name="Views")
df.head()


Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority
0,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,CRITERIA,Methodolody and definition of the algorithim o...,Informative,Productive,,,,,,,Priority 1
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by ...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1
2,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,EXECUTIVE VIEW,Global view to understand Feeder Market Perfor...,Executive,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1
3,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,FEEDER MARKET FLOWS,View focused on understanding the booking beha...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1
4,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,FEEDER_MARKET_DETAIL,Detail view of Feeder Markets by Destination i...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1


In [20]:
# Step 3: Convert rows to LangChain Documents
from langchain.schema import Document

def row_to_document(row):
    content = "\n".join([f"{col}: {row[col]}" for col in row.index if pd.notnull(row[col])])
    return Document(page_content=content)

documents = [row_to_document(row) for _, row in df.iterrows()]
print(f"{len(documents)} documents created.")
print("Document:\n", documents[0])

1486 documents created.
Document:
 page_content='ID Data Product: RPPBI0032
Report Name: Feeder Market - 2024
Product Owner: Jonathan Shields
PBIX_File: LifeReport.pbix
Report View: CRITERIA
Description: Methodolody and definition of the algorithim of Feeder Market
Category: Informative
Status: Productive
Priority: Priority 1'


In [21]:
# from huggingface_hub import login
# login(token="hf_xxx...")

# Step 4: Embed documents using a local embedding model (default: huggingface-based)
from langchain_huggingface import HuggingFaceEmbeddings

# You can change the model to another one like 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# STEP 5: Store embeddings in Chroma (no compilation needed)
from langchain.vectorstores import Chroma
import shutil

shutil.rmtree("chroma_db", ignore_errors=True)

vectorstore = Chroma.from_documents(
    documents,
    embedding_model,
    persist_directory=None
)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})


InternalError: Query error: Database error: error returned from database: (code: 1032) attempt to write a readonly database

In [63]:
# STEP 6: Connect to local Ollama model
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="llama3.1:8b",   # Model name (must be available in Ollama) (mistral, llama3.1, etc.)
    temperature=0.3,       # Controls randomness. Lower = more deterministic. 0.0–1.0 typical.
    top_p=0.8,             # Nucleus sampling: chooses tokens from the top cumulative probability p. Use ≤ 1.0.
    top_k=40,              # Limits token selection to top k most likely. Lower = safer, higher = more diverse.
    num_ctx=2048,          # Maximum context window (prompt + response). Must not exceed model limit.
    stop=["User:"],        # List of string(s) that, when generated, stop the output. Useful for structured outputs.
    repeat_penalty=1.2,    # Penalizes repetition. Values >1 discourage repeated tokens.
    presence_penalty=0.1,  # Encourages new topics. Higher = more novel responses.
    frequency_penalty=0.1, # Penalizes repeated phrases.
    max_tokens=512,        # Max tokens in generated output. Useful to limit long responses.
    base_url="http://localhost:11434"  # URL of Ollama server
)


In [64]:
# STEP 7: Create RAG chain with explicit prompt
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

prompt_template = """You are an assistant helping analyze reporting views in a hotel system.

Use the context below to answer the question accurately and completely.

Context:
{context}

Question: {question}
Answer:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# llm: This is your language model (LLM) used to generate the final answer.
# retriever: The retrieval component responsible for finding the most relevant documents based on the query.
# chain_type: Defines how the retrieved documents are processed before being passed to the LLM.
#   Options: "stuff", "map_reduce", "refine", "map_rerank"
#     "map_reduce" → splits docs, processes them separately, combines the outputs.
#     "refine" → generates initial answer and refines it with each doc.
#     "map_rerank" → scores individual answers and picks the best one. 
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,              
    retriever=retriever,  
    chain_type="stuff",   
    chain_type_kwargs={"prompt": prompt} 
)


In [65]:
# STEP 8: Ask your question
query = "Which views are designed to support strategic decision-making?"
response = qa_chain.invoke({"query": query})

display(Markdown(response["result"]))


Based on the context provided, I can identify several report views that appear to be designed to support strategic decision-making. These include:

1. **Advanced Quality Report (RPPBI0016)**: This view is intended for "understanding how our competitors are performing versus us in terms of quality online reputation, revenue performance, occupancy, price and profitability." It also provides recommendation strategies based on improvement opportunities.
2. **Commercial Efficiency Model & Mastertools - BUSE (TLPBI0027)**: Although this report has a lower priority than some others, it is an executive view that analyzes business from different perspectives to inform strategic decisions about commercial efficiency.

Additionally, I would like to highlight the following views as potentially supporting strategic decision-making:

1. **Distribution NET Report - 2025 (RPPBI0189)**: This dynamic report provides insights into intermediaries' monthly performance and evolution by KPI.
2. **Demand Management (TLPBI0006)**: Although this view is focused on prioritization, it can help inform strategic decisions about resource allocation.

These views are likely to be used for high-level analysis and decision-making at the executive or product owner level.

In [66]:
# STEP 8: Ask your question
query = "Group the available views by their primary data domains."
response = qa_chain.invoke({"query": query})

display(Markdown(response["result"]))


Based on the provided context, I can group the available views into the following categories based on their primary data domains:

**Distribution and Revenue**

* Distribution NET Report - 2025 (Dynamic): Analyzes intermediaries' monthly performance & evolution by KPI dynamically
* Distribution NET Report - 2025 (Static): Analyzes intermediaries' monthly performance & evolution by KPI statically

These views are related to distribution revenue, focusing on the performance and evolution of intermediaries.

**eCommerce**

* eCommerce Report 2022: Offers a vision of reservation behavior with specific KPIs such as Lead Time, Lenght of Stay, AOV or Cancellation Rate
* eCommerce Report 2023: Similar to the previous report but for different years (2023)
* eCommerce Report 2024: Analyzes reservation behavior and provides insights into various KPIs like Total Revenue, Room Revenue, F&B Revenue, etc.
* eCommerce Report 2025: The latest version of the eCommerce report series

These views are related to online sales and revenue generated through e-commerce channels.

**Data Governance**

* Data Governance Adoption Dashboard (Datamap): Shows a summary of total terms by area & domain
This view is focused on data governance, providing insights into adoption rates across different areas and domains.

**Revenue Management**

* Weekly Revenue Report 2025: Provides detailed information about revenue management with customizable tables featuring over 15 KPIs

These views are related to revenue management within the organization, focusing on various aspects of revenue generation and analysis.

Note that some reports may overlap between categories or have multiple primary data domains. However, based on their descriptions, I've grouped them into these main categories for simplicity and clarity.

In [67]:
# STEP 8: Ask your question
query = "If you had to design a new consolidated view, which existing views would you merge and why?"
response = qa_chain.invoke({"query": query})

display(Markdown(response["result"]))


After analyzing the provided context, I would suggest merging the following three existing views into one consolidated view:

**Daily Revenue Report 2025 (OTB vs SPIT Summary)** from ID Data Product RPPBI0173
**Weekly Revenue Report 2025 (Property View)** from ID Data Product TLPBI0024 is not relevant here as it has different dimensions and KPIs, so I will exclude this one.
and **Daily Key Metrics (2.Taskforce)** from ID Data Product RPPBI0076

I would merge these three views because they share similar characteristics:

1.  They are all functional reports focused on revenue analysis.
2.  Each view has a mix of KPIs and dimensions that can be combined to provide a comprehensive understanding of hotel performance.

The consolidated view, which I'll call **Consolidated Revenue Analysis**, would include the following features:

*   Dimensions: Hotel; Month; Segment; Channel
*   KPIs:
    *   Total Revenue (from Daily Revenue Report 2025)
    *   Room Revenue (from Weekly Revenue Report 2025 is not relevant here, so I will exclude this one.)
    *   Incoming Calls and Conversion Rate metrics from CRO Daily Key Metrics

This consolidated view would provide a detailed analysis of hotel revenue performance across different segments, channels, and time periods. It would also include key metrics for conversion rate and incoming calls to give a comprehensive understanding of the business.

The filters available in each individual report could be combined or modified as needed to ensure that users can easily drill down into specific data points while maintaining an overall view of revenue performance across different segments and channels

In [68]:
# STEP 8: Ask your question
query = "What are the reporting views that support operational versus financial management?"
response = qa_chain.invoke({"query": query})
display(Markdown(response["result"]))


Based on the provided context, I can identify two types of reports:

**Operational Management Reports**

These reports focus on business operations and performance. They provide insights into how to manage day-to-day activities.

* ID Data Product: RPPBI0173 - Daily Revenue Report 2025 (Report View: Executive Summary) - This report provides a summary of the most important information in the daily revenue.
* ID Data Product: TLPBI0026 and TLPBI0027 - Commercial Efficiency Model & Mastertools - BUNE and BUSE reports, respectively. Although they are focused on commercial efficiency, their views (REDACTED EUAM Summary) seem to provide operational insights.

**Financial Management Reports**

These reports focus on financial performance and management. They provide insights into how the business is performing financially.

* ID Data Product: RPPBI0168 - Budget 2025 Report (Report View: Other Revenues Detail) - This report provides a detailed view of other revenues assignation by segment to budget, compared to the previous year.
* ID Data Product: TLPBI0010 and TLPBI0026/27 are also focused on commercial efficiency but their views seem more related to financial management (Commercial Team, Company Cost & Cost Over Sales).
* ID Data Product: RPPBI0173 - Daily Revenue Report 2025 (Report View: Auto FC vs Budget & Official FC) - This report provides information about the auto forecast in contrast to budget and official figures.
* ID Data Product: TLPBI0028 is an older version of a commercial efficiency model, but its view seems more focused on financial management.

However, if I had to choose reports that are specifically designed for operational versus financial management:

**Operational Management Reports**

ID Data Product: RPPBI0173 - Daily Revenue Report 2025 (Report View: Executive Summary)

**Financial Management Reports**

* ID Data Product: TLPBI0026 and TLPBI0027 - Commercial Efficiency Model & Mastertools - BUNE and BUSE reports, respectively.

In [69]:
# Analyze the retrieved documents and their scores
query = "What are the reporting views that support financial management?"
docs_with_scores = retriever.vectorstore.similarity_search_with_score(query, k=5)

for i, (doc, score) in enumerate(docs_with_scores):
    print(f"\n--- Rank {i+1} | Score: {score:.4f} ---\n")
    print(doc.page_content[:500])



--- Rank 1 | Score: 1.0483 ---

ID Data Product: RPPBI0173
Report Name: Daily Revenue Report 2025
Product Owner: Tasha Hall
PBIX_File: AboutReport.pbix
Report View: Executive Summary
Description: It is the main page of the report, where its aim is to summarize as much as possible the most important information in the report. The view is only at Total Revenue and Month level. Includes essential information from Daily Pick Up, Weekly Pick Up, Forecast, Budget, Auto Forecast and Dummy Forecast. Also, it contains information of Bu

--- Rank 2 | Score: 1.0851 ---

ID Data Product: RPPBI0168
Report Name: Budget 2025 Report
Product Owner: Jonathan Shields
PBIX_File: AndReport.pbix
Report View: Other Revenues Detail
Description: Page focused on showing the Other Revenues assignation by segment to budget, compared to the previous year
Category: Executive
Status: Productive
Dimensions: Hotel, Month, Segment
KPIs:  Room Nights, ADR, Room Revenue, Breakfast, F&B, Events Revenues, Other Revenues, 