<a href="https://colab.research.google.com/github/akarr509/MLFantasyPredictor/blob/master/IS4200FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
!pip install nltk



In [40]:
## building out the initial BM25 ranking (baseline)

# imports
import json
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import defaultdict
import math

# Downloads needed by word_tokenize before tokenize() can run.
# Both tokenizer resource names are requested
# (presumably because different NLTK releases look for one or the other — TODO confirm).
nltk.download('punkt_tab')
nltk.download('punkt')
# Single Porter stemmer instance shared by every tokenize() call below.
stemmer = PorterStemmer()

# tokenizer helper function
def tokenize(text):
    """Turn raw text into a list of stemmed, lower-cased word tokens.

    The text is lower-cased, split with NLTK's ``word_tokenize`` and every
    purely alphabetic token (``str.isalpha``) is passed through the shared
    Porter stemmer. Tokens containing digits or punctuation are dropped.
    """
    stems = []
    for word in word_tokenize(text.lower()):
        if word.isalpha():
            stems.append(stemmer.stem(word))
    return stems

# opening the annotated recipes + the query-to-document mapping files
# (the encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale)
with open("annotated_recipes.json", encoding="utf-8") as f:
    dataset = json.load(f)

with open("query_map.json", encoding="utf-8") as f:
    query_map = json.load(f)

# lookup dictionary: doc_id -> full recipe entry from annotated_recipes.json
docid_to_entry = {doc["doc_id"]: doc for doc in dataset}

# building inverted index and supporting data for later scoring
def build_index(docs):
    """Build an inverted index plus the collection statistics BM25 needs.

    Args:
        docs: sequence of recipe entries; each entry must provide a "text" value
            that ``tokenize`` can process.

    Returns:
        A tuple ``(index, doc_lengths, df, avgdl)`` where
        * ``index`` maps term -> {position of the doc in ``docs``: term frequency},
        * ``doc_lengths`` holds the token count of every doc, aligned with ``docs``,
        * ``df`` maps term -> number of docs containing the term,
        * ``avgdl`` is the mean of ``doc_lengths`` (0.0 when ``docs`` is empty).

    Note that documents are identified by their position in ``docs``, not by
    their ``doc_id``.
    """
    index = defaultdict(dict)
    # document frequency per term
    df = defaultdict(int)
    # token count of every doc, in input order
    doc_lengths = []

    for i, doc in enumerate(docs):
        # tokenize and stem doc text
        tokens = tokenize(doc["text"])
        doc_lengths.append(len(tokens))

        # term frequency within this doc
        tf = defaultdict(int)
        for token in tokens:
            tf[token] += 1

        # each distinct term of the doc adds one posting and one df count
        for term, freq in tf.items():
            index[term][i] = freq
            df[term] += 1

    # an empty candidate list would otherwise raise ZeroDivisionError here
    avgdl = sum(doc_lengths) / len(doc_lengths) if doc_lengths else 0.0
    return index, doc_lengths, df, avgdl

# scoring function based on BM25
def bm25_score(query, index, doc_lengths, df, avgdl, k1=1.5, b=0.75):
    """Rank documents against ``query`` with BM25.

    Args:
        query: raw query string; it is tokenized with ``tokenize``.
        index: term -> {doc position: term frequency}, as built by ``build_index``.
        doc_lengths: token count per document, indexed by doc position.
        df: term -> document frequency.
        avgdl: average document length.
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.

    Returns:
        List of ``(doc position, score)`` pairs sorted by score, highest first.
        Only documents containing at least one query term are included. A term
        that occurs several times in the query contributes once per occurrence.
    """
    # total number of docs
    total_docs = len(doc_lengths)
    accumulated = defaultdict(float)

    for term in tokenize(query):
        if term not in index:
            continue

        # smoothed IDF; the "1 +" keeps the log argument above 1
        term_df = df[term]
        idf = math.log(1 + (total_docs - term_df + 0.5) / (term_df + 0.5))

        for doc_id, freq in index[term].items():
            length_norm = 1 - b + b * (doc_lengths[doc_id] / avgdl)
            denom = freq + k1 * length_norm
            accumulated[doc_id] += idf * freq * (k1 + 1) / denom

    # best scored at top = most relevant
    ranking = list(accumulated.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
# average precision helper for MAP calculations
def average_precision_at_k(relevance_list, k=6):
    """Average precision over the first ``k`` entries of a ranked relevance list.

    Args:
        relevance_list: relevance labels in ranked order; only entries equal
            to 1 count as relevant.
        k: cut-off rank.

    Returns:
        The mean of precision@rank taken at every relevant position inside the
        top ``k``, or 0.0 when the top ``k`` contains no relevant entry. The
        normaliser is the number of relevant hits *within the top k*, not the
        number of relevant entries in the whole list.
    """
    hits = 0
    precision_total = 0

    for position, label in enumerate(relevance_list[:k], start=1):
        if label != 1:
            continue
        hits += 1
        precision_total += hits / position

    return precision_total / hits if hits else 0.0

In [42]:
# Baseline evaluation: rank every query's candidate recipes with plain BM25 and report MAP@6.

# Store average precisions per query
avg_precisions = []

# Running the BM25 scoring and ranking on our queries and data
for query_entry in query_map:
    query_id = query_entry["query_id"]
    query = query_entry["query"]
    doc_ids = query_entry["doc_ids"]
    # candidate recipes for this query only; list positions are the doc indices used below
    docs = [docid_to_entry[doc_id] for doc_id in doc_ids]

    # BM25 init and set up
    # (the index, and therefore df / avgdl, is rebuilt from this query's candidates only)
    index, doc_lengths, df, avgdl = build_index(docs)
    results = bm25_score(query, index, doc_lengths, df, avgdl)

    # Convert results to dict for full score access
    score_dict = dict(results)

    # Build full list of (doc index, score) and sort descending
    # candidates missing from the BM25 results (no shared term with the query) get 0.0
    scored_docs = [(i, score_dict.get(i, 0.0)) for i in range(len(docs))]
    scored_docs.sort(key=lambda x: x[1], reverse=True)  # Sort by score descending

    print("\nTop results for query " + str(query_id) + " - " + "''" + str(query) + "''\n")

    # Build relevance list based on sorted order
    # NOTE(review): relevance is read from the recipe entry itself, not from a (query, doc)
    # pair — confirm a recipe is never listed under two different queries.
    relevance_list = []
    for i, score in scored_docs:
        doc = docs[i]
        relevance = doc.get('relevance', 0)
        relevance_list.append(relevance)
        print(f"[{score:.4f}] Relevance: {relevance} — {doc['url']}")

    # Calculate average precision
    ap = average_precision_at_k(relevance_list, k=6)
    avg_precisions.append(ap)

# Calculate and print MAP@6
# (raises ZeroDivisionError if query_map is empty)
map_score = sum(avg_precisions) / len(avg_precisions)
print("\nMAP@6 Score: " + str(map_score))


Top results for query query_001 - ''High protein chicken based dinner''

[2.9398] Relevance: 1 — https://laurenfitfoodie.com/chicken-enchiladas/
[2.1026] Relevance: 1 — https://www.budgetbytes.com/creamy-garlic-chicken/
[2.0606] Relevance: 1 — https://hungryhappens.net/high-protein-chicken-parmesan-with-cottage-cheese/
[2.0188] Relevance: 1 — https://lemonsandzest.com/parmesan-crusted-chicken-sheet-pan-dinner/
[1.9572] Relevance: 0 — https://www.wholesomelicious.com/high-protein-egg-casserole-with-cottage-cheese/
[1.9537] Relevance: 1 — https://www.skinnytaste.com/coconut-chicken-rice-bowl/
[1.9510] Relevance: 0 — https://www.the-girl-who-ate-everything.com/one-pan-garlic-butter-chicken/
[1.7708] Relevance: 0 — https://thatveganbabe.com/summer-roll-bowl-meal-prep-high-protein-vegan/
[1.6705] Relevance: 1 — https://www.eatingwell.com/recipe/267768/chicken-spinach-skillet-pasta-with-lemon-parmesan/
[1.6704] Relevance: 1 — https://ohsnapmacros.com/marry-me-chicken-pasta/
[1.4416] Relevan

In [43]:
#enhancing BM25 baseline now with nutritional goal detection

# tag identifiers from common phrases in queries
nutrition_keywords = {
    "high protein": "high_protein",
    "low sodium": "low_sodium",
    "low sugar": "low_sugar",
    "low carb": "low_carb",
    "low fat": "low_fat",
    "low calorie": "low_calorie"
}

# query processing to find nutri_tags
def extract_nutrition_tags(query: str) -> list[str]:
    """Return the nutrition tags whose trigger phrase occurs in ``query``.

    The query is lower-cased and every run of non-alphanumeric characters
    (hyphens, commas, repeated spaces, ...) is collapsed to a single space, so
    "high-protein" and "High  protein" both yield ``"high_protein"``.

    A phrase must start at a word boundary, which keeps "slow carb" from
    triggering ``"low_carb"``. Trailing characters are still allowed, so
    "low carbs" and "low calories" keep matching.

    Tags are returned in the order of ``nutrition_keywords``.
    """
    cleaned = "".join(ch if ch.isalnum() else " " for ch in query.lower())
    # the leading space lets a phrase at the very start satisfy the word-start check
    padded = " " + " ".join(cleaned.split())
    return [tag for phrase, tag in nutrition_keywords.items() if " " + phrase in padded]

# dictionary for associated nutritional goal thresholds
# each tag maps to a predicate that receives the recipe's value for that nutrient
# (units are whatever the annotated nutrition data uses — presumably per serving; TODO confirm)
nutrition_thresholds = {
    "high_protein": lambda v: v >= 25,
    "low_sodium": lambda v: v <= 400,
    "low_sugar": lambda v: v <= 10,
    "low_carb": lambda v: v <= 20,
    "low_fat": lambda v: v <= 10,
    "low_calorie": lambda v: v <= 500,
}

# score adjustment based on satisfaction of nutritional goals (checking metadata against the thresholds)
def adjust_for_nutrition(score, doc_nutrition, nutrition_tags, bonus=0.25, penalty=-0.5):
    """Shift ``score`` up or down for every nutrition goal the recipe meets or misses.

    Args:
        score: starting score.
        doc_nutrition: mapping nutrient name -> value. The nutrient name is the
            last "_"-separated piece of the tag ("high_protein" -> "protein").
        nutrition_tags: tags detected in the query.
        bonus: amount added for each goal that is met.
        penalty: amount added for each goal that is missed (negative by default).

    Returns:
        The adjusted score. Tags without a threshold and nutrients with no
        value in ``doc_nutrition`` leave the score unchanged.
    """
    adjusted = score
    for tag in nutrition_tags:
        if tag not in nutrition_thresholds:
            continue

        amount = doc_nutrition.get(tag.split("_")[-1])
        # a missing value means the site did not publish it: neither reward nor punish
        if amount is None:
            continue

        adjusted += bonus if nutrition_thresholds[tag](amount) else penalty
    return adjusted

# reranking feature combined with the baseline -> takes the baseline output, applies the nutrition helper and returns the new ranking
def rerank_with_nutrition(query, scored_docs, docs, bonus=0.25, penalty=-0.5):
    """Re-rank scored documents according to the nutrition goals found in ``query``.

    Args:
        query: raw query string, scanned with ``extract_nutrition_tags``.
        scored_docs: iterable of ``(doc position, score)`` pairs.
        docs: candidate recipes; ``docs[position]`` may carry a "nutrition" dict.
        bonus: passed through to ``adjust_for_nutrition``.
        penalty: passed through to ``adjust_for_nutrition``.

    Returns:
        List of ``(doc position, adjusted score)`` sorted by score, highest first.
        Only the documents present in ``scored_docs`` are returned.
    """
    tags = extract_nutrition_tags(query)

    rescored = [
        (i, adjust_for_nutrition(score, docs[i].get("nutrition", {}), tags, bonus, penalty))
        for i, score in scored_docs
    ]
    rescored.sort(key=lambda pair: pair[1], reverse=True)
    return rescored

In [44]:
# Evaluation of BM25 + nutrition-goal reranking, reported as MAP@6.

# Store average precisions per query
avg_precisions = []

# Running the BM25 scoring and nutrition-based reranking
for query_entry in query_map:
    query_id = query_entry["query_id"]
    query = query_entry["query"]
    doc_ids = query_entry["doc_ids"]
    # candidate recipes for this query only; list positions are the doc indices used below
    docs = [docid_to_entry[doc_id] for doc_id in doc_ids]

    # BM25 baseline scoring
    index, doc_lengths, df, avgdl = build_index(docs)
    results = bm25_score(query, index, doc_lengths, df, avgdl)

    # bm25_score only returns candidates that share a term with the query.
    # Give the remaining candidates an explicit 0.0 baseline *before* reranking,
    # otherwise they skip the nutrition bonus/penalty and can end up ranked above
    # matching recipes that were penalised below zero.
    bm25_scores = dict(results)
    baseline_docs = [(i, bm25_scores.get(i, 0.0)) for i in range(len(docs))]

    # Reranking using nutrition constraints (now applied to every candidate)
    nutri_goal_results = rerank_with_nutrition(query, baseline_docs, docs)

    # Convert to score dictionary
    score_dict = dict(nutri_goal_results)

    # Build full list of (doc index, score) and sort descending
    scored_docs = [(i, score_dict.get(i, 0.0)) for i in range(len(docs))]
    scored_docs.sort(key=lambda x: x[1], reverse=True)

    print("\nTop results for query " + str(query_id) + " - " + "''" + str(query) + "''\n")

    # Build relevance list and output results
    relevance_list = []
    for i, score in scored_docs:
        doc = docs[i]
        relevance = doc.get('relevance', 0)
        relevance_list.append(relevance)
        print(f"[{score:.4f}] Relevance: {relevance} — {doc['url']}")

    # Average precision for current query
    ap = average_precision_at_k(relevance_list, k=6)
    avg_precisions.append(ap)

# Final MAP@6 across all queries
map_score = sum(avg_precisions) / len(avg_precisions)
print("\nMAP@6 Score (using nutrition boosting): " + str(map_score))



Top results for query query_001 - ''High protein chicken based dinner''

[3.1898] Relevance: 1 — https://laurenfitfoodie.com/chicken-enchiladas/
[2.3526] Relevance: 1 — https://www.budgetbytes.com/creamy-garlic-chicken/
[2.3106] Relevance: 1 — https://hungryhappens.net/high-protein-chicken-parmesan-with-cottage-cheese/
[2.2688] Relevance: 1 — https://lemonsandzest.com/parmesan-crusted-chicken-sheet-pan-dinner/
[2.2037] Relevance: 1 — https://www.skinnytaste.com/coconut-chicken-rice-bowl/
[2.0208] Relevance: 0 — https://thatveganbabe.com/summer-roll-bowl-meal-prep-high-protein-vegan/
[1.9205] Relevance: 1 — https://www.eatingwell.com/recipe/267768/chicken-spinach-skillet-pasta-with-lemon-parmesan/
[1.9204] Relevance: 1 — https://ohsnapmacros.com/marry-me-chicken-pasta/
[1.6904] Relevance: 1 — https://www.wholesomelicious.com/spinach-artichoke-chicken-casserole/
[1.5099] Relevance: 0 — https://kellyjonesnutrition.com/high-protein-vegan-stir-fry/
[1.4572] Relevance: 0 — https://www.whole

In [45]:
## implementing the processing for exclusion handling

# regex for pattern matching
import re

# mini dictionary for synonym expansion and increasing coverage
# key   = exclusion term exactly as extract_exclusions returns it (expand_exclusions looks it up verbatim,
#         so e.g. a singular "mushroom" or "bean" in the query is NOT expanded)
# value = extra strings that should also count as that exclusion
# "meat" is the term extract_exclusions adds for vegetarian / vegan queries.
# NOTE(review): match_ingredient does plain substring matching, so short entries such as "egg"
# will also hit ingredients like "eggplant" — confirm this is acceptable.
exclusion_synonyms = {
    "beans": ["black beans", "kidney beans", "garbanzo beans", "white beans", "navy beans", "pinto beans"],
    "mushrooms": ["mushroom", "cremini mushroom", "portobello", "shiitake", "white mushroom"],
    "cheese": ["parmesan", "cheddar", "feta", "mozzarella", "pepperjack", "cottage cheese", "swiss cheese"],
    "blueberries": ["blueberries", "blueberry"],
    "eggs": ["eggs", "egg"],
    "meat": ["bacon", "chicken", "pork", "beef", "steak", "sausage"],
}

# using regex in this helper to find exclusion phrases
def extract_exclusions(query: str) -> list[str]:
    """Extract the ingredients a query asks to leave out.

    The lower-cased query is scanned for trigger phrases ("without", "no",
    "excluding", "free of", "avoid", "does not contain", "with no"). Whatever
    letters, spaces and hyphens follow a trigger are captured, hyphens are
    turned into spaces and the capture is split on " and ".

    Every trigger must start at a word boundary, so the "no" at the end of
    "jalapeno" no longer turns the following word into an exclusion.

    Queries mentioning "vegetarian" or "vegan" implicitly exclude "meat".

    Returns:
        Exclusion terms in order of discovery, without duplicates or empty
        strings. The capture is greedy: it runs up to the next character that
        is not a letter, whitespace or hyphen, so trailing words may be
        included in a term.
    """
    patterns = [
        r"\bwithout\s+([a-zA-Z\s\-]+)",
        r"\bno\s+([a-zA-Z\s\-]+)",
        r"\bexcluding\s+([a-zA-Z\s\-]+)",
        r"\bfree of\s+([a-zA-Z\s\-]+)",
        r"\bavoid\s+([a-zA-Z\s\-]+)",
        r"\bdoes not contain\s+([a-zA-Z\s\-]+)",
        r"\bwith no\s+([a-zA-Z\s\-]+)",
    ]
    lowered = query.lower()

    exclusions = []
    for pattern in patterns:
        for match in re.findall(pattern, lowered):
            # "eggs and cheese" -> ["eggs", "cheese"]
            parts = match.strip().replace("-", " ").split(" and ")
            exclusions.extend(part.strip() for part in parts)

    # Implicit exclusions from dietary preference
    if "vegetarian" in lowered or "vegan" in lowered:
        exclusions.append("meat")

    # Overlapping triggers ("no" / "with no") report the same term twice, and a
    # capture made only of hyphens/spaces collapses to "". An empty term would
    # match every ingredient downstream, so both are filtered out here.
    return list(dict.fromkeys(term for term in exclusions if term))

# synonym expansion helper using the previously initialized dictionary
def expand_exclusions(exclusions):
    """Add the known synonyms of every exclusion term.

    Each term is kept, and if it is a key of ``exclusion_synonyms`` its
    synonyms are added as well.

    Returns:
        A list of the unique terms. Uniqueness comes from a set, so the order
        of the list is not meaningful.
    """
    expanded = {
        candidate
        for term in exclusions
        for candidate in (term, *exclusion_synonyms.get(term, []))
    }
    return list(expanded)


In [46]:
# helper to see if an exclusion string matches an ingredient string
def match_ingredient(ingredient: str, exclusion: str) -> bool:
    """Return True when ``exclusion`` occurs inside ``ingredient``.

    This is a plain, case-sensitive substring test; callers are expected to
    normalise both strings first. An empty ``exclusion`` never matches: ``""``
    is a substring of every string and would otherwise flag every ingredient.
    """
    return bool(exclusion) and exclusion in ingredient

# handling penalties and score adjustments when exclusions are found
def penalize_score(original_score, ingredients, exclusions, penalty_weight=1.0):
    """Subtract a flat penalty when any ingredient contains an excluded term.

    Ingredients and exclusions are lower-cased and stripped before being
    compared with ``match_ingredient``.

    Returns:
        ``original_score - penalty_weight`` as soon as one match is found,
        however many further matches exist; otherwise ``original_score``.
    """
    for ing_norm in (ing.lower().strip() for ing in ingredients):
        # one offending ingredient is enough, so stop at the first hit
        if any(match_ingredient(ing_norm, ex.lower().strip()) for ex in exclusions):
            return original_score - penalty_weight
    return original_score


In [47]:
# Final combined reranking using both nutrition goals + exclusion constraints
def rerank_with_exclusion_and_nutrition(query, scored_docs, docs,
                                        exclusion_weight=2.0,
                                        nutrition_bonus=0.25,
                                        nutrition_penalty=0.5):
    """Re-rank scored documents using exclusion constraints and nutrition goals.

    Args:
        query: raw query string; exclusions and nutrition tags are parsed from it.
        scored_docs: iterable of ``(doc position, score)`` pairs.
        docs: candidate recipes; ``docs[position]`` may carry "ingredients"
            and "nutrition" entries.
        exclusion_weight: amount subtracted when a recipe contains an excluded
            ingredient.
        nutrition_bonus: amount added per nutrition goal that is met.
        nutrition_penalty: size of the deduction per missed nutrition goal.
            Only the magnitude is used, so both ``0.5`` and the ``-0.5``
            convention of ``rerank_with_nutrition`` lower the score.

    Returns:
        List of ``(doc position, adjusted score)`` sorted by score, highest first.
        Only the documents present in ``scored_docs`` are returned.
    """
    raw_exclusions = extract_exclusions(query)
    exclusions = expand_exclusions(raw_exclusions)
    nutrition_tags = extract_nutrition_tags(query)

    # adjust_for_nutrition *adds* its penalty argument, so it has to be non-positive;
    # normalising the sign keeps a negative input from turning into a reward
    signed_penalty = -abs(nutrition_penalty)

    reranked = []
    for i, score in scored_docs:
        doc = docs[i]
        ingredients = doc.get("ingredients", [])
        nutrition = doc.get("nutrition", {})

        # baseline score -> exclusion penalty -> nutrition bonus/penalty
        score = penalize_score(score, ingredients, exclusions, exclusion_weight)
        score = adjust_for_nutrition(score, nutrition, nutrition_tags,
                                     bonus=nutrition_bonus, penalty=signed_penalty)
        reranked.append((i, score))

    # return final sorted
    return sorted(reranked, key=lambda x: x[1], reverse=True)


In [48]:
## same evaluation loop as before, but the BM25 baseline output is passed through the
## adjuster that applies both features (exclusion handling + nutrition goals) before MAP@6 is computed

# Store average precisions per query
avg_precisions = []

## running the enhanced scoring and ranking on our queries and data
for query_entry in query_map:
    query_id = query_entry["query_id"]
    query = query_entry["query"]
    doc_ids = query_entry["doc_ids"]
    # candidate recipes for this query only; list positions are the doc indices used below
    docs = [docid_to_entry[doc_id] for doc_id in doc_ids]

    index, doc_lengths, df, avgdl = build_index(docs)
    results = bm25_score(query, index, doc_lengths, df, avgdl)

    # bm25_score only returns candidates that share a term with the query.
    # Give the remaining candidates an explicit 0.0 baseline *before* reranking,
    # otherwise a recipe containing an excluded ingredient but no query term keeps
    # 0.0 and outranks matching recipes that received the exclusion penalty.
    bm25_scores = dict(results)
    baseline_docs = [(i, bm25_scores.get(i, 0.0)) for i in range(len(docs))]

    # applying both exclusion and nutrition enhancements (to every candidate)
    final_results = rerank_with_exclusion_and_nutrition(query, baseline_docs, docs)

    # score dict and full doc set
    score_dict = dict(final_results)

    # sorted doc list by adjusted score
    scored_docs = [(i, score_dict.get(i, 0.0)) for i in range(len(docs))]
    scored_docs.sort(key=lambda x: x[1], reverse=True)

    # progress output (what query we are on)
    print("\n" + "Top results for query " + str(query_id) + " - " + "''" + str(query) + "''" + "\n")

    # building a relevance list for avg precision calc + outputting the relevances in their order
    relevance_list = []
    for i, score in scored_docs:
        doc = docs[i]
        relevance = doc.get('relevance', 0)
        relevance_list.append(relevance)
        print(f"[{score:.4f}] Relevance: {relevance} — {doc['url']}")

    # avg precision calcs and adding to main list for MAP calc.
    ap = average_precision_at_k(relevance_list, k=6)
    avg_precisions.append(ap)

# calculating and printing out MAP@6 over the set for performance eval
map_score = sum(avg_precisions) / len(avg_precisions)
print("\nMAP@6 Score (using both nutrition boosting and exclusion handling): " + str(map_score))



Top results for query query_001 - ''High protein chicken based dinner''

[3.1898] Relevance: 1 — https://laurenfitfoodie.com/chicken-enchiladas/
[2.3526] Relevance: 1 — https://www.budgetbytes.com/creamy-garlic-chicken/
[2.3106] Relevance: 1 — https://hungryhappens.net/high-protein-chicken-parmesan-with-cottage-cheese/
[2.2688] Relevance: 1 — https://lemonsandzest.com/parmesan-crusted-chicken-sheet-pan-dinner/
[2.2037] Relevance: 1 — https://www.skinnytaste.com/coconut-chicken-rice-bowl/
[2.0208] Relevance: 0 — https://thatveganbabe.com/summer-roll-bowl-meal-prep-high-protein-vegan/
[1.9205] Relevance: 1 — https://www.eatingwell.com/recipe/267768/chicken-spinach-skillet-pasta-with-lemon-parmesan/
[1.9204] Relevance: 1 — https://ohsnapmacros.com/marry-me-chicken-pasta/
[1.6904] Relevance: 1 — https://www.wholesomelicious.com/spinach-artichoke-chicken-casserole/
[1.5099] Relevance: 0 — https://kellyjonesnutrition.com/high-protein-vegan-stir-fry/
[1.4572] Relevance: 0 — https://www.whole