# Predicting Categories from Report View Descriptions with Keras

This notebook trains a neural network to classify report views into their respective `Category` using the view's `Description` as input text. Once trained, the model can infer categories for views where the field is missing.

<a href="https://colab.research.google.com/github/cbadenes/semantic-report-search/blob/main/data/analysis/30_classify_categories.ipynb" target="_parent">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
</a>


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import spacy
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Cargar datos
df = pd.read_excel("Reporting_Inventory.xlsx", sheet_name="Views")
df = df.dropna(subset=["Category"])  # Solo usamos datos con categoría

# Preprocesamiento básico
nlp = spacy.load("en_core_web_sm")
def clean_text(text):
    doc = nlp(str(text).lower())
    return " ".join([t.lemma_ for t in doc if t.is_alpha and not t.is_stop])

df["clean_text"] = df["Description"].apply(clean_text)
df.head(2)

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,clean_text
0,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,CRITERIA,Methodolody and definition of the algorithim o...,Informative,Productive,,,,,,,Priority 1,methodolody definition algorithim feed market
1,RPPBI0032,Feeder Market - 2024,Jonathan Shields,LifeReport.pbix,DESTINATION_OF_FEEDER_MARKETS,View focused on understand the performance by ...,Functional,Productive,,"Hotel, month, Feeder Market, Segment, Channel ...","Total Revenue, Room Revenue, RN, Lead Time, Le...",,,,Priority 1,view focus understand performance hotel specif...


In [2]:
# Tokenización
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(df["clean_text"])
X_seq = tokenizer.texts_to_sequences(df["clean_text"])
X_pad = pad_sequences(X_seq, maxlen=50)

# Etiquetas
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["Category"])

# Tamaños
vocab_size = len(tokenizer.word_index) + 1
num_classes = len(label_encoder.classes_)


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=16, input_length=50))
model.add(Flatten())
model.add(Dense(32, activation="relu"))
model.add(Dense(num_classes, activation="softmax"))

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()




In [4]:
model.fit(X_pad, y_encoded, epochs=10, batch_size=4, validation_split=0.2)


Epoch 1/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5241 - loss: 1.5144 - val_accuracy: 0.4162 - val_loss: 1.3395
Epoch 2/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5980 - loss: 1.1265 - val_accuracy: 0.4467 - val_loss: 1.3301
Epoch 3/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6353 - loss: 0.9305 - val_accuracy: 0.4061 - val_loss: 1.2563
Epoch 4/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7102 - loss: 0.7340 - val_accuracy: 0.4518 - val_loss: 1.3175
Epoch 5/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7645 - loss: 0.6371 - val_accuracy: 0.4518 - val_loss: 1.2709
Epoch 6/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7929 - loss: 0.5824 - val_accuracy: 0.4061 - val_loss: 1.3527
Epoch 7/10
[1m196/196[0m 

<keras.src.callbacks.history.History at 0x7bd315896590>

In [9]:
# Predecir categorías para vistas sin etiqueta
df_original = pd.read_excel("Reporting_Inventory.xlsx", sheet_name="Views")
df_missing = df_original[df_original["Category"].isna() & df_original["Description"].notna()].copy()
df_missing["clean_text"] = df_missing["Description"].apply(clean_text)
df_missing.head(2)

Unnamed: 0,ID Data Product,Report Name,Product Owner,PBIX_File,Report View,Description,Category,Status,Rename,Dimensions,KPIs,Other Terms,Filters,Tags,Priority,clean_text
182,RPPBI0034,Corporate Market Share - 2024,Raven Jordan,CharacterReport.pbix,STR Forecast Dashboard 2024,The reports sent by STR every 3 months with fo...,,Productive,,Cities available,"Occupancy, ADR, RevPar",%Chg last 2 forecast,"Forecast Month, Flag STR is Yes, Hotel_Name is...","STR Forecast, Corporate Market Share, 2024",Priority 1,report send str month forecast datum market oc...
183,RPPBI0034,Corporate Market Share - 2024,Raven Jordan,CharacterReport.pbix,STR Forecast Dashboard 2025,The reports sent by STR every 3 months with fo...,,Productive,,Cities available,"Occupancy, ADR, RevPar",%Chg last 2 forecast,"Forecast Month, Flag STR is Yes, Hotel_Name is...","STR Forecast, Corporate Market Share, 2024",Priority 1,report send str month forecast datum market oc...


In [10]:
X_missing_seq = tokenizer.texts_to_sequences(df_missing["clean_text"])
X_missing_pad = pad_sequences(X_missing_seq, maxlen=50)

y_pred = model.predict(X_missing_pad)
df_missing["Predicted Category"] = label_encoder.inverse_transform(np.argmax(y_pred, axis=1))

df_missing[["Report View", "Description", "Predicted Category"]].head(10)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


Unnamed: 0,Report View,Description,Predicted Category
182,STR Forecast Dashboard 2024,The reports sent by STR every 3 months with fo...,Functional
183,STR Forecast Dashboard 2025,The reports sent by STR every 3 months with fo...,Functional
259,STR Forecast Dashboard 2025,The reports sent by STR every 3 months with fo...,Functional
320,Pick Up Channel Detail,DELETED,Functional
358,Booking Criteria,"This view is exclusively for Booking.com,given...",Informative
362,Page 1,internal,Index


In [7]:
df_missing.to_csv("views_with_predicted_categories_keras.csv", index=False)
