# Semantic Search on Report Views Using Transformers

In this notebook, we will:
- Load report view metadata from a CSV file and build a text description for each view from its name and keywords.
- Encode the descriptions into dense vector representations using a pre-trained Transformer model.
- Implement a semantic search function that uses cosine similarity to return the report views most similar to a given user query.

<a href="https://colab.research.google.com/github/cbadenes/semantic-report-search/blob/main/data/analysis/33_text_embeddings.ipynb" target="_parent">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
</a>

In [1]:
!pip install -q sentence-transformers

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Load Report Data

In [17]:
# Read the report catalogue and discard every row that has a missing value
# in any column (all columns are kept, not just 'Report View' / 'keywords').
df = pd.read_csv('reports.csv').dropna()
df.head()


Unnamed: 0,ID Data Product,Report Name,Report View,Tags,keywords
39,RPPBI0004,eCommerce Report 2024,B2B Digital Report,B2B Digital,"2024, agency, allow, analyze, b2b, b2b digital..."
42,RPPBI0004,eCommerce Report 2024,Digital By Creation Date Performance Report,"Performance, Digital","2024, analytic, brand, channel analytic, count..."
43,RPPBI0004,eCommerce Report 2024,Digital Performance (Stay Date),"Performance, Digital","2024, analytic, brand, channel analytic, count..."
44,RPPBI0004,eCommerce Report 2024,Ecommerce Performance (Creation Date) Report,"Performance, eCommerce","2024, analytic, brand, channel analytic, count..."
45,RPPBI0004,eCommerce Report 2024,Executive Report,"Budget, Performance","2024, adr, budget, comparison, contain, creati..."


## Generate Embeddings

In [19]:
def _embed_description(text):
    """Encode a single description string into a tensor using ``model``."""
    return model.encode(text, convert_to_tensor=True)


# 'all-MiniLM-L6-v2' is a lightweight, fast sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Join each view name with its keywords so the encoder sees more context.
df['description'] = df['Report View'] + (' - ' + df['keywords'])
df['embedding'] = df['description'].apply(_embed_description)
df.head(2)

Unnamed: 0,ID Data Product,Report Name,Report View,Tags,keywords,description,embedding
39,RPPBI0004,eCommerce Report 2024,B2B Digital Report,B2B Digital,"2024, agency, allow, analyze, b2b, b2b digital...","B2B Digital Report - 2024, agency, allow, anal...","[tensor(0.0428, device='cuda:0'), tensor(-0.04..."
42,RPPBI0004,eCommerce Report 2024,Digital By Creation Date Performance Report,"Performance, Digital","2024, analytic, brand, channel analytic, count...",Digital By Creation Date Performance Report - ...,"[tensor(0.0175, device='cuda:0'), tensor(-0.05..."


## Semantic Search Function

In [22]:
def search_similar_view(query, top_k=4):
    """Return the report views whose descriptions best match ``query``.

    The query is encoded with the notebook-level ``model`` and ranked by
    cosine similarity against the vectors stored in ``df['embedding']``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of rows to return. ``None`` returns every row;
            negative values are treated as 0 (no rows).

    Returns:
        A DataFrame holding the 'Report Name', 'Report View' and
        'description' columns of the matching rows, most similar first.
        It is empty when ``df`` has no rows or ``top_k`` is 0 or negative.
    """
    result_columns = ['Report Name', 'Report View', 'description']

    # A negative top_k would otherwise slice as "all but the last |top_k|"
    # matches; treat it as a request for no results instead.
    if top_k is not None and top_k < 0:
        top_k = 0

    # cosine_similarity rejects an empty set of vectors, so short-circuit
    # with an empty result that has the usual columns.
    if len(df) == 0:
        return df.iloc[:0][result_columns]

    query_vec = model.encode(query, convert_to_tensor=True)
    # Stored embeddings are torch tensors (possibly on the GPU); move them to
    # the CPU and convert to NumPy before handing them to scikit-learn.
    embeddings_np = [emb.cpu().numpy() for emb in df['embedding']]
    query_vec_np = query_vec.cpu().numpy().reshape(1, -1)

    similarities = cosine_similarity(query_vec_np, embeddings_np)
    # Positions sorted by descending similarity, truncated to top_k. These are
    # positional, hence iloc (df keeps its original, non-reset index labels).
    top_k_idx = np.argsort(similarities[0])[::-1][:top_k]
    return df.iloc[top_k_idx][result_columns]


## Example Queries

In [23]:
# Example 1: free-text query, using the function's default top_k.
search_similar_view(query="detailed flow of feeder markets")

Unnamed: 0,Report Name,Report View,description
935,eCommerce Report 2023,Performance Report,"Performance Report - 2023, analytic, brand, ch..."
927,eCommerce Report 2022,Performance Report,"Performance Report - 2022, analytic, brand, ch..."
51,eCommerce Report 2024,Performance Report,"Performance Report - 2024, analytic, brand, ch..."
67,eCommerce Report 2025,Performance Report,"Performance Report - 2025, analytic, brand, ch..."


In [24]:
# Example 2: free-text query, using the function's default top_k.
search_similar_view(query="summary statistics for executives")


Unnamed: 0,Report Name,Report View,description
213,Cvent Dashboard,Executive Summary,"Executive Summary - 2025, ad-hoc table, analyz..."
60,eCommerce Report 2025,Executive Report,"Executive Report - 2025, adr, budget, comparis..."
45,eCommerce Report 2024,Executive Report,"Executive Report - 2024, adr, budget, comparis..."
242,Hotel Benchmark Report,Executive Summary,"Executive Summary - (eft, actuals, analyze, be..."


In [25]:
# Example 3: free-text query, using the function's default top_k.
search_similar_view(query="market destination performance analysis")


Unnamed: 0,Report Name,Report View,description
51,eCommerce Report 2024,Performance Report,"Performance Report - 2024, analytic, brand, ch..."
927,eCommerce Report 2022,Performance Report,"Performance Report - 2022, analytic, brand, ch..."
67,eCommerce Report 2025,Performance Report,"Performance Report - 2025, analytic, brand, ch..."
935,eCommerce Report 2023,Performance Report,"Performance Report - 2023, analytic, brand, ch..."
