## Prepare Data

In [14]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import tomotopy as tp

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Cargar la hoja "Views"
df = pd.read_excel("../raw/Reporting_Inventory.xlsx", sheet_name="Views")
df = df[df["Description"].notna() & df["Category"].notna()]

# Preprocesamiento
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-záéíóúñü\s]", "", text)
    tokens = word_tokenize(text)
    return [t for t in tokens if t not in stop_words]

df["tokens"] = df["Description"].apply(preprocess)
df["label"] = df["Category"].str.lower().str.strip()
df.head(10)

[nltk_data] Downloading package punkt to /Users/cbadenes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cbadenes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,tokens,label
0,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,CRITERIA,Methodolody and definition of the algorithim o...,Informative,Productive,,,,,,,Priority 1,"[methodolody, definition, algorithim, feeder, ...",informative
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by ...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1,"[view, focused, understand, performance, hotel...",functional
2,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,EXECUTIVE VIEW,Global view to understand Feeder Market Perfor...,Executive,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1,"[global, view, understand, feeder, market, per...",executive
3,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,FEEDER MARKET FLOWS,View focused on understanding the booking beha...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1,"[view, focused, understanding, booking, behavi...",functional
4,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,FEEDER_MARKET_DETAIL,Detail view of Feeder Markets by Destination i...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1,"[detail, view, feeder, markets, destination, i...",functional
5,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,FEEDER_MARKETS_OF_DESTINATION,VIew focused on understanding the feeder marke...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1,"[view, focused, understanding, feeder, markets...",functional
6,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,MENU,Index page with interactive buttons to other v...,Index,Productive,,,,,,,Priority 1,"[index, page, interactive, buttons, views]",index
7,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,OE MARKET INSIGHTS,Benchmark by Destination. Outside information ...,Functional,Productive,,"Country, City","Total Spending, Total Revenue, Arrivals, Nights,","Outbound, Inbound",,,Priority 1,"[benchmark, destination, outside, information,...",functional
8,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,TARGETS FOLLOW UP,View that provides performance vs budget at a ...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel Mix","Total Revenue, Room Revenue, RN,ADR",Budget,,,Priority 1,"[view, provides, performance, vs, budget, feed...",functional
9,RPPBI0154,Feeder Market - 2025,Jonathan Shields,OfficerReport.pbix,CRITERIA,Methodolody and definition of the algorithim o...,Informative,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1,"[methodolody, definition, algorithim, feeder, ...",informative


In [27]:
df["label"].value_counts()

label
functional      360
index            67
executive        57
informative      42
self-service     13
other            12
master data       7
Name: count, dtype: int64

## Train a labeled LDA

* tw=tp.TermWeight.ONE
This sets the term weighting scheme to no weighting (i.e., each word has equal importance).
Alternative values include:   
    * tp.TermWeight.PMI – Pointwise Mutual Information
    * tp.TermWeight.IDF – Inverse Document Frequency
    * tp.TermWeight.TFIDF – Term Frequency–Inverse Document Frequency

* min_cf=3
Minimum collection frequency: a token must appear in at least 3 documents to be included in the vocabulary.
This filters out extremely rare words to reduce noise.

* rm_top=5
Removes the top 5 most frequent tokens from the vocabulary.
These are typically very common terms (e.g. "data", "report") that add little semantic value for topic modeling.

In [10]:
# Crear modelo Labeled LDA
unique_labels = df["label"].unique().tolist()
model = tp.PLDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)

# Añadir documentos
for tokens, label in zip(df["tokens"], df["label"]):
    model.add_doc(tokens, labels=[label])

# Entrenar
model.train(0)
for i in range(100, 1000, 100):
    model.train(100)
    print(f"Log-likelihood: {model.ll_per_word:.4f}")


Log-likelihood: -6.3274
Log-likelihood: -6.3267
Log-likelihood: -6.3256
Log-likelihood: -6.3268
Log-likelihood: -6.3258
Log-likelihood: -6.3258
Log-likelihood: -6.3263
Log-likelihood: -6.3264
Log-likelihood: -6.3263


## Evaluate the Model

In [19]:
# Ver las palabras más relevantes por categoría
for i, label in enumerate(unique_labels):
    print(f"Top words for label '{label}':")
    print(model.get_topic_words(i, top_n=10))
    print()

Top words for label 'executive':
[('report', 0.04956256225705147), ('glossary', 0.04543578624725342), ('quest', 0.041309013962745667), ('main', 0.03305546194314957), ('fields', 0.028928689658641815), ('descriptions', 0.028928689658641815), ('tab', 0.028928689658641815), ('definitions', 0.024801915511488914), ('informative', 0.024801915511488914), ('kpi', 0.024801915511488914)]

Top words for label 'functional':
[('detail', 0.012081567198038101), ('evolution', 0.011578238569200039), ('data', 0.011578238569200039), ('table', 0.011410461738705635), ('kpis', 0.01057158038020134), ('total', 0.01057158038020134), ('month', 0.010236027650535107), ('tab', 0.010068251751363277), ('also', 0.010068251751363277), ('detailed', 0.009900474920868874)]

Top words for label 'index':
[('business', 0.024372220039367676), ('executive', 0.021123673766851425), ('kpis', 0.018687263131141663), ('hotel', 0.017875127494335175), ('budget', 0.01625085435807705), ('contains', 0.015438716858625412), ('report', 0.01

In [29]:
tokens = preprocess("summary hidden tooltip hotel sent")
#doc = model.make_doc(tokens)
doc = model.make_doc(tokens, labels=unique_labels) 
topic_dist, _ = model.infer(doc)
    
# Get most probable label
best_label = max(zip(unique_labels, topic_dist), key=lambda x: x[1])[0]
print(best_label)

self-service


## Use the model

In [20]:
# Aplicar a vistas sin categoría
unlabeled_df = pd.read_excel("../raw/Reporting_Inventory.xlsx", sheet_name="Views")
unlabeled_df = unlabeled_df[unlabeled_df["Category"].isna() & unlabeled_df["Description"].notna()]
unlabeled_df["tokens"] = unlabeled_df["Description"].apply(preprocess)

unlabeled_df.head(2)


Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,tokens
182,RPPBI0034,Corporate Market Share - 2024,Raven Jordan,CharacterReport.pbix,STR Forecast Dashboard 2024,The reports sent by STR every 3 months with fo...,,Productive,,Cities available,"Occupancy, ADR, RevPar",%Chg last 2 forecast,"Forecast Month, Flag STR is Yes, Hotel_Name is...","STR Forecast, Corporate Market Share, 2024",Priority 1,"[reports, sent, str, every, months, forecast, ..."
183,RPPBI0034,Corporate Market Share - 2024,Raven Jordan,CharacterReport.pbix,STR Forecast Dashboard 2025,The reports sent by STR every 3 months with fo...,,Productive,,Cities available,"Occupancy, ADR, RevPar",%Chg last 2 forecast,"Forecast Month, Flag STR is Yes, Hotel_Name is...","STR Forecast, Corporate Market Share, 2024",Priority 1,"[reports, sent, str, every, months, forecast, ..."


In [30]:
# Predecir la categoría más probable
predictions = []
for tokens in unlabeled_df["tokens"]:
    doc = model.make_doc(tokens)
    topic_dist, _ = model.infer(doc)

    # Buscar la mejor etiqueta usando su probabilidad
    best_label = max(zip(unique_labels, topic_dist), key=lambda x: x[1])[0]
    predictions.append(best_label)

unlabeled_df["predicted_category"] = predictions
unlabeled_df.head(10)

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,tokens,predicted_category
182,RPPBI0034,Corporate Market Share - 2024,Raven Jordan,CharacterReport.pbix,STR Forecast Dashboard 2024,The reports sent by STR every 3 months with fo...,,Productive,,Cities available,"Occupancy, ADR, RevPar",%Chg last 2 forecast,"Forecast Month, Flag STR is Yes, Hotel_Name is...","STR Forecast, Corporate Market Share, 2024",Priority 1,"[reports, sent, str, every, months, forecast, ...",functional
183,RPPBI0034,Corporate Market Share - 2024,Raven Jordan,CharacterReport.pbix,STR Forecast Dashboard 2025,The reports sent by STR every 3 months with fo...,,Productive,,Cities available,"Occupancy, ADR, RevPar",%Chg last 2 forecast,"Forecast Month, Flag STR is Yes, Hotel_Name is...","STR Forecast, Corporate Market Share, 2024",Priority 1,"[reports, sent, str, every, months, forecast, ...",functional
259,RPPBI0150,Corporate Market Share - 2025,Matthew Callahan,SameReport.pbix,STR Forecast Dashboard 2025,The reports sent by STR every 3 months with fo...,,Productive,,Cities available,"Occupancy, ADR, RevPar",%Chg last 2 forecast,"Forecast Month, Flag STR is Yes, Hotel_Name is...","STR Forecast, Corporate Market Share",Priority 1,"[reports, sent, str, every, months, forecast, ...",functional
320,RPPBI0173,Daily Revenue Report 2025,Tasha Hall,AboutReport.pbix,Pick Up Channel Detail,DELETED,,,,,,,,,Priority 1,[deleted],functional
358,RPPBI0062,Price Competitiveness,Nicole Carter,AboutReport.pbix,Booking Criteria,"This view is exclusively for Booking.com,given...",,Productive,,"BU, Country, City, Hotel, Brand, META, OTA",,,,,Priority 1,"[view, exclusively, bookingcomgiven, offensive...",functional
362,RPPBI0062,Price Competitiveness,Nicole Carter,AboutReport.pbix,Page 1,internal,,Internal,,,,,,,Priority 1,[internal],functional
