# N-Gram Model from Report Descriptions

This notebook create a **n-gram model** from the `Description` column using `nltk`, in order to expand the `keywords` used in search.

In [1]:
# Install required NLTK resources (if running for the first time)
# !pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load libraries
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import FreqDist
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
from pathlib import Path


[nltk_data] Downloading package punkt to /Users/cbadenes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cbadenes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load and prepare report descriptions

In [2]:
# Load Views sheet
views_path = Path("../raw/Reporting_Inventory.xlsx")
views_df = pd.read_excel(views_path, sheet_name="Views")
views_df.fillna("", inplace=True)
views_df = views_df[views_df["Description"].str.strip() != ""]
descriptions = views_df["Description"].dropna().astype(str).tolist()
descriptions[0]

'Methodolody and definition of the algorithim of Feeder Market'

### Preprocess text: lowercase, clean, remove stopwords

In [3]:
# Define English stopwords (can switch to 'spanish' if needed)
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """
    Normalize, tokenize, and clean a single string of text.
    """
    # Lowercase and remove punctuation
    text = text.lower()
    text = re.sub(r"[^a-záéíóúñü\s]", "", text)
    
    # Tokenize and remove stopwords
    tokens = word_tokenize(text)
    filtered_tokens = [t for t in tokens if t not in stop_words]
    
    return filtered_tokens

# Apply preprocessing to each report description
token_lists = [preprocess(desc) for desc in descriptions]


### Generate and analyze bigrams and trigrams

In [4]:
# Flatten all tokens into a single list
all_tokens = [token for tokens in token_lists for token in tokens]

# Build bigrams and trigrams
bigrams = list(ngrams(all_tokens, 2))
trigrams = list(ngrams(all_tokens, 3))

# Compute frequency distributions
bigram_freq = FreqDist(bigrams)
trigram_freq = FreqDist(trigrams)

# Preview top 10 n-grams
print("Top 10 bigrams:")
print(bigram_freq.most_common(10))

print("\nTop 10 trigrams:")
print(trigram_freq.most_common(10))


Top 10 bigrams:
[(('index', 'page'), 51), (('buttons', 'views'), 50), (('page', 'interactive'), 48), (('interactive', 'buttons'), 48), (('feeder', 'market'), 44), (('first', 'block'), 39), (('detail', 'regarding'), 37), (('block', 'filters'), 35), (('view', 'analyze'), 34), (('dynamic', 'table'), 31)]

Top 10 trigrams:
[(('index', 'page', 'interactive'), 48), (('page', 'interactive', 'buttons'), 48), (('interactive', 'buttons', 'views'), 48), (('first', 'block', 'filters'), 35), (('contains', 'first', 'block'), 21), (('month', 'week', 'evolution'), 19), (('quest', 'detail', 'regarding'), 17), (('view', 'shows', 'first'), 15), (('shows', 'first', 'block'), 15), (('block', 'filters', 'second'), 15)]


In [5]:
import json
# Build bigram prediction index: word -> list of possible next words sorted by frequency
bigram_index = defaultdict(list)
for (w1, w2), freq in bigram_freq.items():
    bigram_index[w1].append((w2, freq))

# Sort each list of next-word suggestions by frequency
for w1 in bigram_index:
    bigram_index[w1] = sorted(bigram_index[w1], key=lambda x: -x[1])

# Convertir a dict serializable
serializable_index = {k: v for k, v in bigram_index.items()}

# Save dictionary as JSON
with open("../models/bigram_index.json", "w", encoding="utf-8") as f:
    json.dump(serializable_index, f)

In [8]:

def suggest_next_word(word, top_n=1):
    """
    Given a word, suggest the most likely next tokens based on bigram frequencies.
    """
    word = word.lower()
    if word in bigram_index:
        return [next_word for next_word, _ in bigram_index[word][:top_n]]
    else:
        return []

# Example usage:
ref_word = "feeder"
print("Suggestions for '"+ref_word+"':", suggest_next_word(ref_word))

Suggestions for 'feeder': ['market']
