# Hybrid Search on Business Reports Using Keywords and Descriptions

This notebook demonstrates how to combine sparse (keyword-based) and dense (embedding-based) retrieval to find relevant report views using two different signals:
- Keywords from the `reports.csv`
- Descriptions from `Reporting_Inventory.xlsx`

<a href="https://colab.research.google.com/github/cbadenes/semantic-report-search/blob/main/data/analysis/41_hybrid_search_reports.ipynb" target="_parent">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
</a>

In [1]:
!pip install -q sentence-transformers scikit-learn pandas
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Revisar las primeras filas de la hoja "Views"
raw_inventory_df = pd.read_excel("Reporting_Inventory.xlsx", sheet_name="Views")
raw_inventory_df.head(2)


Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority
0,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,CRITERIA,Methodolody and definition of the algorithim o...,Informative,Productive,,,,,,,Priority 1
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by ...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1


Load and Merge Data

In [21]:
# Load source files
reports_df = pd.read_csv("reports.csv")
inventory_df = pd.read_excel("Reporting_Inventory.xlsx", sheet_name="Views")

# Merge by 'ID Data Product'
merged_df = reports_df.merge(inventory_df, on="ID Data Product", how="left")
merged_df = merged_df.rename(columns={"Report View_x": "Report View"})
merged_df = merged_df.drop(columns=["Report View_y"])
merged_df = merged_df.rename(columns={"Report Name_x": "Report Name"})
merged_df = merged_df.drop(columns=["Report Name_y"])
merged_df = merged_df.rename(columns={"Tags_x": "Tags"})
merged_df = merged_df.drop(columns=["Tags_y"])



# Clean fields
merged_df["keywords"] = merged_df["keywords"].fillna("")
merged_df["Description"] = merged_df["Description"].fillna("")
merged_df.head(2)

Unnamed: 0,ID Data Product,Report Name,Report View,Tags,keywords,Product Owner,PBIX_File,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Priority
0,RPPBI0032,Feeder Market - 2024,CRITERIA,,"2024, criterion, definition, feed, feeder mark...",Jonathan Shields,LifeReport.pbix,Methodolody and definition of the algorithim o...,Informative,Productive,,,,,,Priority 1
1,RPPBI0032,Feeder Market - 2024,CRITERIA,,"2024, criterion, definition, feed, feeder mark...",Jonathan Shields,LifeReport.pbix,View focused on understand the performance by ...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,Priority 1


Prepare Sparse Representations (TF-IDF)

In [22]:
tfidf = TfidfVectorizer()
sparse_matrix = tfidf.fit_transform(merged_df["keywords"])


Prepare Dense Representations (Embeddings)

In [23]:
model = SentenceTransformer("all-MiniLM-L6-v2")
dense_matrix = model.encode(merged_df["Description"], convert_to_tensor=False)


Define Hybrid Search Function

In [24]:
def hybrid_search(query, alpha=0.5, top_k=5):
    # Sparse vector
    sparse_query = tfidf.transform([query])
    sparse_scores = cosine_similarity(sparse_query, sparse_matrix).flatten()

    # Dense vector
    dense_query = model.encode([query])[0]
    dense_scores = cosine_similarity([dense_query], dense_matrix).flatten()

    # Combine scores
    hybrid_scores = alpha * sparse_scores + (1 - alpha) * dense_scores

    # Get top indices and scores
    top_indices = np.argsort(hybrid_scores)[::-1][:top_k]
    top_scores = hybrid_scores[top_indices]

    # Build result DataFrame
    results = merged_df.iloc[top_indices].copy()
    results["score"] = top_scores

    return results[["Report View", "keywords", "Description", "score"]]


Try It Out!

In [25]:
hybrid_search("market performance in European destinations")


Unnamed: 0,Report View,keywords,Description,score
14297,C4C_Qualification_Detail,"account, account handler, account segmentation...",View to analyze Key Potential Destinations by ...,0.436338
14299,Key Potential Destinations,"account (business travel, analyze, business tr...",View to analyze Key Potential Destinations by ...,0.37612
25,EXECUTIVE VIEW,"2024, adr, aov, compare, executive, feeder, fe...",Benchmark by Destination. Outside information ...,0.358329
106,EXECUTIVE VIEW,"2025, adr, aov, compare, executive, feeder, fe...",Benchmark by Destination. Outside information ...,0.358171
7,CRITERIA,"2024, criterion, definition, feed, feeder mark...",Benchmark by Destination. Outside information ...,0.350058


In [26]:
hybrid_search("staff efficiency and complaints resolution", alpha=0.3)

Unnamed: 0,Report View,keywords,Description,score
15304,Home Management,"2024, commercial, efficiency, home, index page...",Older version of the report that was launched ...,0.370792
5683,HOME,"commercial, efficiency, home, index page, inte...",View to measure commercial teams efficiency th...,0.363479
5701,Home Management,"commercial, efficiency, home, index page, inte...",View to measure commercial teams efficiency th...,0.360196
15312,Management View,"2024, commercial, commercial team, comparison,...",Older version of the report that was launched ...,0.350581
5899,Summary 24 vs 25,"commercial, efficiency, hide, hide view, lead,...",View to measure commercial teams efficiency th...,0.347962
