# Keyword Extraction with Regular Expressions

Use simple regex patterns to extract terms, acronyms, time units, and formatted values from report metadata.

We will load the inventory file and add extracted keywords step-by-step.

In [1]:
# Load required libraries
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

# Display full content in cells (not truncated)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Load the base reports.csv
data_path = Path("../raw/Reporting_Inventory.xlsx")
views_df = pd.read_excel(data_path, sheet_name="Views")
views_df.fillna("", inplace=True)
views_df.head(2)

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority
0,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,CRITERIA,Methodolody and definition of the algorithim of Feeder Market,Informative,Productive,,,,,,,Priority 1
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by hotel for a specific feeder market o selection of feeder marktes.,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel Mix, Room Type","Total Revenue, Room Revenue, RN, Lead Time, Lenght of Stay, AOV, ADR, ADR Net, %Cost",,,,Priority 1


## Extract Terms

In [2]:
import re

# Define the columns we want to extract terms from
source_columns = ["Report Name", "Report View"]

# Define a simple pattern to extract words (alphanumeric, at least 3 characters)
TERM_PATTERN = r"\b[a-zA-Z]{3,}\b"

def extract_terms(text):
    """
    Extract candidate keywords using a basic regex word pattern.
    Only words with 3 or more characters are included.
    """
    return re.findall(TERM_PATTERN, str(text))

def extract_terms_from_row(row):
    """
    For each row, apply term extraction on Report Name and Report View
    and return the unique sorted list of candidate terms.
    """
    terms = set()
    for col in source_columns:
        terms.update(extract_terms(row[col]))
    return ", ".join(sorted(terms))

# Apply to the dataframe and create a new column
views_df["term_keywords"] = views_df.apply(extract_terms_from_row, axis=1)

# Filter rows where 'value_keywords' is not empty
filtered = views_df[views_df["term_keywords"].str.strip() != ""]

# Show a sample
filtered.head(2)

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,term_keywords
0,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,CRITERIA,Methodolody and definition of the algorithim of Feeder Market,Informative,Productive,,,,,,,Priority 1,"CRITERIA, Feeder, Market"
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by hotel for a specific feeder market o selection of feeder marktes.,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel Mix, Room Type","Total Revenue, Room Revenue, RN, Lead Time, Lenght of Stay, AOV, ADR, ADR Net, %Cost",,,,Priority 1,"Feeder, Market"


Update `reports.csv` with the new keywords:

In [3]:
from pathlib import Path
import pandas as pd

# Load main reports.csv
reports_path = Path("../api/reports.csv")
reports_df = pd.read_csv(reports_path).fillna("")

# Merge the new keyword column from views_df
reports_df = reports_df.merge(
    views_df[["ID Data Product", "Report View", "term_keywords"]],
    on=["ID Data Product", "Report View"],
    how="left"
)

# Combine original and new keywords
def merge_keywords(original, new):
    orig_set = {kw.strip() for kw in str(original).split(",") if kw.strip()}
    new_set = {kw.strip() for kw in str(new).split(",") if kw.strip()}
    return ", ".join(sorted(orig_set | new_set))

reports_df["keywords"] = reports_df.apply(
    lambda row: merge_keywords(row["keywords"], row.get("term_keywords", "")),
    axis=1
)

# Drop helper column and save
reports_df.drop(columns=["term_keywords"], inplace=True)
reports_df.to_csv(reports_path, index=False)

print("Column 'keywords' successfully enriched.")


Column 'keywords' successfully enriched.


## Extract Acronyms (uppercase abbreviations)

In [4]:
import re

# We'll use these columns as input for keyword extraction
text_columns = ["Report Name", "Report View", "Description", "KPIs", "Dimensions"]

# Define pattern for acronyms: sequences of 2 to 6 uppercase letters
ACRONYMS_PATTERN = r"\b[A-Z]{2,6}\b"

def extract_acronyms(text):
    return re.findall(ACRONYMS_PATTERN, str(text))

# Extract acronyms from each row
def extract_acronyms_from_row(row):
    keywords = set()
    for col in text_columns:
        keywords.update(extract_acronyms(row[col]))
    return ", ".join(sorted(keywords))

# Add acronyms column
views_df["acronym_keywords"] = views_df.apply(extract_acronyms_from_row, axis=1)

# Filter rows where 'value_keywords' is not empty
filtered = views_df[views_df["acronym_keywords"].str.strip() != ""]

# Show a sample
filtered.head(2)

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,term_keywords,acronym_keywords
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by hotel for a specific feeder market o selection of feeder marktes.,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel Mix, Room Type","Total Revenue, Room Revenue, RN, Lead Time, Lenght of Stay, AOV, ADR, ADR Net, %Cost",,,,Priority 1,"Feeder, Market","ADR, AOV, RN"
2,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,EXECUTIVE VIEW,Global view to understand Feeder Market Performance compared to previous years diferentiating between domestic and international,Executive,Productive,,"Hotel, month, Feeder Market, Segment, Channel Mix, Room Type","Total Revenue, Room Revenue, RN, Lead Time, Lenght of Stay, AOV, ADR, ADR Net, %Cost",,,,Priority 1,"EXECUTIVE, Feeder, Market, VIEW","ADR, AOV, RN, VIEW"


Update reports.csv with the new keywords:

In [5]:
from pathlib import Path
import pandas as pd

# Load main reports.csv
reports_path = Path("../api/reports.csv")
reports_df = pd.read_csv(reports_path).fillna("")

# Merge the new keyword column from views_df
reports_df = reports_df.merge(
    views_df[["ID Data Product", "Report View", "acronym_keywords"]],
    on=["ID Data Product", "Report View"],
    how="left"
)

# Combine original and new keywords
def merge_keywords(original, new):
    orig_set = {kw.strip() for kw in str(original).split(",") if kw.strip()}
    new_set = {kw.strip() for kw in str(new).split(",") if kw.strip()}
    return ", ".join(sorted(orig_set | new_set))

reports_df["keywords"] = reports_df.apply(
    lambda row: merge_keywords(row["keywords"], row.get("acronym_keywords", "")),
    axis=1
)

# Drop helper column and save
reports_df.drop(columns=["acronym_keywords"], inplace=True)
reports_df.to_csv(reports_path, index=False)

print("Column 'keywords' successfully enriched.")


Column 'keywords' successfully enriched.


## Extract Temporal Expressions (e.g., Q1, FY23, 2024)

In [6]:
import re

# We'll use these columns as input for keyword extraction
text_columns = ["Report Name", "Report View", "Description", "KPIs", "Dimensions"]

# Define pattern for temporal terms
TEMPORAL_PATTERN = r"\bQ[1-4]\b|\bFY\d{2}\b|\b\d{4}\b"

def extract_temporals(text):
    return re.findall(TEMPORAL_PATTERN, str(text))

# Extract from each row
def extract_temporals_from_row(row):
    keywords = set()
    for col in text_columns:
        keywords.update(extract_temporals(row[col]))
    return ", ".join(sorted(keywords))

# Add temporal keywords column
views_df["temporal_keywords"] = views_df.apply(extract_temporals_from_row, axis=1)

# Filter rows where 'value_keywords' is not empty
filtered = views_df[views_df["temporal_keywords"].str.strip() != ""]

# Show a sample
filtered.head(2)

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,term_keywords,acronym_keywords,temporal_keywords
0,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,CRITERIA,Methodolody and definition of the algorithim of Feeder Market,Informative,Productive,,,,,,,Priority 1,"CRITERIA, Feeder, Market",,2024
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by hotel for a specific feeder market o selection of feeder marktes.,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel Mix, Room Type","Total Revenue, Room Revenue, RN, Lead Time, Lenght of Stay, AOV, ADR, ADR Net, %Cost",,,,Priority 1,"Feeder, Market","ADR, AOV, RN",2024


Update `reports.csv` with the new keywords:

In [7]:
from pathlib import Path
import pandas as pd

# Load main reports.csv
reports_path = Path("../api/reports.csv")
reports_df = pd.read_csv(reports_path).fillna("")

# Merge the new keyword column from views_df
reports_df = reports_df.merge(
    views_df[["ID Data Product", "Report View", "temporal_keywords"]],
    on=["ID Data Product", "Report View"],
    how="left"
)

# Combine original and new keywords
def merge_keywords(original, new):
    orig_set = {kw.strip() for kw in str(original).split(",") if kw.strip()}
    new_set = {kw.strip() for kw in str(new).split(",") if kw.strip()}
    return ", ".join(sorted(orig_set | new_set))

reports_df["keywords"] = reports_df.apply(
    lambda row: merge_keywords(row["keywords"], row.get("temporal_keywords", "")),
    axis=1
)

# Drop helper column and save
reports_df.drop(columns=["temporal_keywords"], inplace=True)
reports_df.to_csv(reports_path, index=False)

print("Column 'keywords' successfully enriched.")


Column 'keywords' successfully enriched.


## Extract Currency and Percentage Values

In [8]:
import re

# We'll use these columns as input for keyword extraction
text_columns = ["Report Name", "Report View", "Description", "KPIs", "Dimensions"]

# Define pattern for values like 75%, €200, $300K
CURRENCY_PATTERN = r"\b\d+[%€$Kk]\b"

def extract_currency_values(text):
    return re.findall(CURRENCY_PATTERN, str(text))

# Extract from each row
def extract_currency_from_row(row):
    keywords = set()
    for col in text_columns:
        keywords.update(extract_currency_values(row[col]))
    return ", ".join(sorted(keywords))

# Add monetary/percentage values column
views_df["value_keywords"] = views_df.apply(extract_currency_from_row, axis=1)
#views_df.head(100)

# Filter rows where 'value_keywords' is not empty
filtered = views_df[views_df["value_keywords"].str.strip() != ""]

# Show a sample
filtered.head()

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,term_keywords,acronym_keywords,temporal_keywords,value_keywords
