# Normalization and Lemmatization of Keywords

This notebook loads the `reports.csv` file, processes the `keywords` column to:
- Convert all keywords to lowercase
- Lemmatize each keyword using SpaCy
- Remove duplicates and reorder alphabetically

The result will be saved back into `data/api/reports.csv` with updated keywords for better search matching.

In [8]:
# Load libraries
import pandas as pd
import spacy
from pathlib import Path

# Load spaCy English model (run this cell once to download if needed)
# !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [9]:
# Load the original CSV used by the API
csv_path = Path("../api/reports.csv")
df = pd.read_csv(csv_path)
df.fillna("", inplace=True)
df.head(3)

Unnamed: 0,ID Data Product,Report Name,Report View,Tags,keywords
0,RPPBI0032,Feeder Market - 2024,CRITERIA,,"2024, CRITERIA, Feeder, Market"
1,RPPBI0032,Feeder Market - 2024,DESTINATION_OF_FEEDER_MARKETS,,"2024, ADR, AOV, Feeder, Market, RN"
2,RPPBI0032,Feeder Market - 2024,EXECUTIVE VIEW,,"2024, ADR, AOV, EXECUTIVE, Feeder, Market, RN,..."


## Normalize and lemmatize keywords

In [10]:
def normalize_keywords(keyword_string):
    # Split and clean original keywords
    tokens = [kw.strip().lower() for kw in keyword_string.split(",") if kw.strip()]
    # Lemmatize using spaCy
    doc = nlp(" ".join(tokens))
    lemmatized = sorted(set([token.lemma_ for token in doc if len(token.lemma_) > 2]))
    return ", ".join(lemmatized)

# Apply normalization
df["keywords"] = df["keywords"].apply(normalize_keywords)
df.head(3)

Unnamed: 0,ID Data Product,Report Name,Report View,Tags,keywords
0,RPPBI0032,Feeder Market - 2024,CRITERIA,,"2024, criterion, feed, market"
1,RPPBI0032,Feeder Market - 2024,DESTINATION_OF_FEEDER_MARKETS,,"2024, adr, aov, feeder, market"
2,RPPBI0032,Feeder Market - 2024,EXECUTIVE VIEW,,"2024, adr, aov, executive, feeder, market, view"


In [11]:
# Save the updated DataFrame back to CSV
output_path = Path("../api/reports.csv")
df.to_csv(output_path, index=False)
print(f"Normalized CSV saved to: {output_path}")

Normalized CSV saved to: ../api/reports.csv
