# Overview of Assignment 1

This assignment involves implementing term-document retrieval techniques, including the term-document incidence matrix, inverted index, Jaccard similarity, and TF-IDF. The objective is to retrieve a set of recipes from a dataset based on a provided set of ingredients as a query.

#   Enter your details below

# Name

Aditya Patel

# Banner ID

B00930387

# Q1: Setting up the libraries and environment

In [34]:
%pip install pandas nltk whoosh scikit-learn
import nltk
import re
import tokenize
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Downloading package stopwords to C:\Users\Piyush
[nltk_data]     Patel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Piyush
[nltk_data]     Patel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Q2: Data Pre-processing

In [1]:
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Make sure to download necessary nltk resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv('data/food_recipes.csv')

# Function that combines cleaning, tokenizing, removing stopwords, stemming, and lemmatizing
def preprocess_text(text):
    # Normalize text
    text = text.lower()
    text = re.sub(r'[^a-z]', ' ', text)

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stem and lemmatize text
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    processed_tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    # Join words back into single string
    return ' '.join(processed_tokens)

# Apply the comprehensive preprocessing function
data['directions'] = data['directions'].apply(preprocess_text)

# Display the last 5 rows of the dataset to verify changes
print(data.tail())


[nltk_data] Downloading package punkt to C:\Users\Piyush
[nltk_data]     Patel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Piyush
[nltk_data]     Patel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Piyush
[nltk_data]     Patel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                         title  \
49995         Caramel Frosting   
49996  Barbecued Chicken Wings   
49997               Pound Cake   
49998                    Slush   
49999      Granny Ebert'S Hash   

                                              directions  
49995  let butter melt add sugar brown add milk bring...  
49996  mix sauc brown sugar onion water mix bowl set ...  
49997  bake hour put toothpick fork middl done toothp...  
49998  combin sugar boil water stir cool add banana r...  
49999  cover meat water teaspoon salt teaspoon pepper...  


# Q3 Term-Document Incidence Matrix


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv('./data/food_recipes.csv')

# Ensure there is no missing text data
texts = data['recipeText'].fillna('')  # Replace NaN with empty string

# Initialize CountVectorizer with binary=True to create an incidence matrix
vectorizer = CountVectorizer(binary=True)

# Fit and transform the data
X = vectorizer.fit_transform(texts)

# Convert the result to a DataFrame for better readability
terms = vectorizer.get_feature_names_out()
incidence_matrix = pd.DataFrame(X.toarray(), columns=terms, index=data.index)

# Display the DataFrame
print(incidence_matrix.head())


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
data = pd.read_csv('./data/food_recipes.csv')


texts = data['directions'].fillna('')  
vectorizer = CountVectorizer(binary=True)

X = vectorizer.fit_transform(texts)

terms = vectorizer.get_feature_names_out()
incidence_matrix = pd.DataFrame(X.toarray(), columns=terms, index=data.index)

print(incidence_matrix.head())


   00  000  01  05  10  100  101  103  104  105  ...  zip  ziploc  zipper  \
0   0    0   0   0   0    0    0    0    0    0  ...    0       0       0   
1   0    0   0   0   0    0    0    0    0    0  ...    0       0       0   
2   0    0   0   0   0    0    0    0    0    0  ...    0       0       0   
3   0    0   0   0   0    0    0    0    0    0  ...    0       0       0   
4   0    0   0   0   0    0    0    0    0    0  ...    0       0       0   

   zippered  zita  ziti  zucchini  zucchinis  zuchini  zwieback  
0         0     0     0         0          0        0         0  
1         0     0     0         0          0        0         0  
2         0     0     0         0          0        0         0  
3         0     0     0         0          0        0         0  
4         0     0     0         0          0        0         0  

[5 rows x 7790 columns]


# Q4 Inverted Index


In [10]:
import pandas as pd

def map_function(data):
    intermediate = []
    # Adjust the column name as per your data schema, e.g., 'directions' or 'recipeText'
    for index, row in data.iterrows():
        # Normalize the text: convert to lower case and split into words
        terms = row['directions'].lower().split()
        for term in terms:
            intermediate.append((term, index))
    return intermediate

In [11]:
def reduce_function(mapped_data):
    inverted_index = {}
    for key, value in mapped_data:
        if key in inverted_index:
            inverted_index[key].add(value)
        else:
            inverted_index[key] = {value}
    return inverted_index

In [12]:
def create_inverted_index_map_reduce(data):
    mapped_data = map_function(data)
    return reduce_function(mapped_data)


In [35]:

def inverted_index_search(recipe_data, inverted_index, search_terms):
    try:
        sets_of_indices = [inverted_index[term] for term in search_terms if term in inverted_index]
        
        if not sets_of_indices:
            print("No search terms found in any document.")
            return pd.DataFrame()
        valid_indices = set.intersection(*sets_of_indices)
        if not valid_indices:
            print("No documents contain all search terms.")
            return pd.DataFrame()
        return recipe_data.loc[list(valid_indices), ['title', 'directions']]
    
    except KeyError as e:
        # This block now explicitly catches missing terms in the inverted index
        print(f"Warning: Search term '{str(e).strip('[]')}' not found in any document.")
        return pd.DataFrame()
inverted_index = create_inverted_index_map_reduce(data[:49999])

result = inverted_index_search(data, inverted_index, ['onions'])
print(result)

                         title  \
8192                Corn Cakes   
8196          Gourmet Potatoes   
24582             Spanish Rice   
8         Nolan'S Pepper Steak   
8200        Cornbread Dressing   
...                        ...   
32749    Chicken Tostada Salad   
8182     Chicken And Rice Bake   
32759    Orange Mushroom Salad   
24572               Deer Chili   
12287  Betty Jane'S Onion Soup   

                                              directions  
8192   Combine all ingredients., Pour 4 inch circles ...  
8196   Melt margarine., Cook onions until soft., Mix ...  
24582  Fry bacon in skillet until crisp; remove bacon...  
8      Roll steak strips in flour., Brown in skillet....  
8200   Heat oven to 425°., Grease and heat in oven a ...  
...                                                  ...  
32749  In small jar, mix vinegar, honey, cumin, salt ...  
8182   In 3-quart oblong baking dish, combine soup, w...  
32759  Toss lettuce, orange slices, mushrooms and oni...  
2

# Q5 Inverted Index using Trees

Documents containing 'cream':
                                                   title  \
1                                  Jewell Ball'S Chicken   
3                                          Chicken Funny   
32771                                      Lite Crab Dip   
5                               Cheeseburger Potato Soup   
6                                    Rhubarb Coffee Cake   
...                                                  ...   
32739                                           Ugly Dip   
32747                                    Company Chicken   
32755                                  Preacher'S Coming   
32758  None Such Prize Cookies(Makes 48 Cookies, 3-In...   
32766                                   Strawberry Salad   

                                              directions  
1      Place chipped beef on bottom of baking dish., ...  
3      Boil and debone chicken., Put bite size pieces...  
32771  Combine yogurt, mayonnaise, cream cheese and s...  
5      Wash p

# Q6 Jaccard Similarity


In [20]:
import pandas as pd
import time

def jaccard_similarity(set1, set2):

    intersection = set1.intersection(set2)
    union = set1.union(set2)
    if not union:
        return 0
    return len(intersection) / len(union)

def jaccard_similarity_search(recipe_data, search_terms):
  
    search_terms_set = set(search_terms)
    recipe_data['jaccard_similarity'] = recipe_data['directions'].apply(
        lambda x: jaccard_similarity(set(x.split()), search_terms_set)
    )
    return recipe_data.sort_values(by='jaccard_similarity', ascending=False)[['title', 'jaccard_similarity']]

def measure_average_latency(recipe_data, search_terms, num_trials=100):
   
    latencies = []
    for _ in range(num_trials):
        start_time = time.perf_counter()
        jaccard_similarity_search(recipe_data, search_terms)
        end_time = time.perf_counter()
        latencies.append((end_time - start_time) * 1000)  # Convert to milliseconds
    
    average_latency = sum(latencies) / len(latencies)
    return average_latency

# Load data from the CSV file
data = pd.read_csv('./data/food_recipes.csv')

# Define search terms
search_terms = ['mix', 'bake']

# Perform a single search and print results
results = jaccard_similarity_search(data, search_terms)
print("Search Results:")
print(results.head())

# Measure and print the average service latency
average_latency = measure_average_latency(data, search_terms)
print(f"Average Service Latency: {average_latency:.4f} ms")




## Another Answer 
import pandas as pd
import time

def jaccard_similarity(set1, set2):
    """
    Calculate the Jaccard similarity between two sets.
    :param set1: Set of elements.
    :param set2: Set of elements.
    :return: Jaccard similarity score.
    """
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    if not union:
        return 0
    return len(intersection) / len(union)

def jaccard_similarity_search(recipe_data, search_terms):
    """
    Perform a Jaccard similarity search on the recipe data using given search terms.
    :param recipe_data: DataFrame containing recipe data.
    :param search_terms: List of terms to search for.
    :return: DataFrame with titles and Jaccard similarity scores.
    """
    search_terms_set = set(search_terms)
    recipe_data['jaccard_similarity'] = recipe_data['directions'].apply(
        lambda x: jaccard_similarity(set(x.split()), search_terms_set)
    )
    return recipe_data.sort_values(by='jaccard_similarity', ascending=False)[['title', 'jaccard_similarity']]

def measure_average_latency(recipe_data, search_terms, num_trials=100):
    """
    Measure the average latency of performing the Jaccard similarity search.
    :param recipe_data: DataFrame containing recipe data.
    :param search_terms: List of terms to search for.
    :param num_trials: Number of trials to average the latency over.
    :return: Average latency in milliseconds.
    """
    latencies = []
    for _ in range(num_trials):
        start_time = time.perf_counter()
        jaccard_similarity_search(recipe_data, search_terms)
        end_time = time.perf_counter()
        latencies.append((end_time - start_time) * 1000)  # Convert to milliseconds
    
    average_latency = sum(latencies) / len(latencies)
    return average_latency

# Load data from the CSV file
data = pd.read_csv('./data/food_recipes.csv')

# Define search terms
search_terms = ['mix', 'bake']

# Perform a single search and print results
results = jaccard_similarity_search(data, search_terms)
print("Search Results:")
print(results.head())

# Measure and print the average service latency
average_latency = measure_average_latency(data, search_terms)
print(f"Average Service Latency: {average_latency:.4f} ms")



Search Results:
                          title  jaccard_similarity
31277      Mexican Corn Pudding            0.250000
46733  Sweet And Sour Meat Loaf            0.200000
2586            Blonde Brownies            0.200000
2349        Ramen Cabbage Salad            0.200000
49050             Lemonade Cake            0.166667
Average Service Latency: 352.6231 ms
Search Results:
                          title  jaccard_similarity
31277      Mexican Corn Pudding            0.250000
46733  Sweet And Sour Meat Loaf            0.200000
2586            Blonde Brownies            0.200000
2349        Ramen Cabbage Salad            0.200000
49050             Lemonade Cake            0.166667
Average Service Latency: 362.4426 ms


# Q7 TF-IDF

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time

def perform_tfidf_search(data, query, top_n=5):
    """
    Perform TF-IDF based search on the given dataset.
    :param data: DataFrame containing the documents.
    :param query: Search query as a string.
    :param top_n: Number of top results to return.
    :return: DataFrame with top_n results sorted by relevance.
    """
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['directions'])
    query_tfidf = tfidf_vectorizer.transform([query])
    
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    
    results = data.loc[top_indices, :]
    results['similarity'] = cosine_similarities[top_indices]
    return results

def measure_average_latency(data, query, num_trials=100):
    """
    Measure the average service latency of the TF-IDF search.
    :param data: DataFrame containing the documents.
    :param query: Search query as a string.
    :param num_trials: Number of trials to measure.
    :return: Average latency in milliseconds.
    """
    latencies = []
    for _ in range(num_trials):
        start_time = time.perf_counter()
        perform_tfidf_search(data, query)
        end_time = time.perf_counter()
        latencies.append((end_time - start_time) * 1000)
    
    average_latency = sum(latencies) / len(latencies)
    return average_latency

# Load data from the CSV file
data = pd.read_csv('./data/food_recipes.csv')

# Define search query
query = "bake mix"

# Perform TF-IDF search and print results
results = perform_tfidf_search(data, query)
print("TF-IDF Search Results:")
print(results[['title', 'similarity']])

# Measure and print average service latency
average_latency = measure_average_latency(data, query)
print(f"Average Service Latency: {average_latency:.4f} ms")


TF-IDF Search Results:
                 title  similarity
49050    Lemonade Cake    0.623005
11070   Corn Casserole    0.504928
2586   Blonde Brownies    0.487715
42277  Salisbury Steak    0.476386
19690   Tuna Casserole    0.473113
Average Service Latency: 1617.9188 ms


# Q8 Search Index using Whoosh

In [3]:
import os
import pandas as pd
import time
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT
from whoosh.qparser import MultifieldParser
from whoosh import scoring

# Load recipe data
recipe_data_path = '/mnt/data/food_recipes.csv'  # Update with the correct path
recipe_data = pd.read_csv(recipe_data_path)

# Define the schema
schema = Schema(
    title=TEXT(stored=True),
    directions=TEXT(stored=True)
)

# Directory for the index
index_dir = 'indexdir'
if not os.path.exists(index_dir):
    os.mkdir(index_dir)

# Create or open the index
if os.path.exists(os.path.join(index_dir, 'segments.gen')):
    index = open_dir(index_dir)
else:
    index = create_in(index_dir, schema)

# Index the recipe data
start_time = time.time()
writer = index.writer()
for _, row in recipe_data.iterrows():
    writer.add_document(title=row['title'], directions=row['directions'])
writer.commit()  
indexing_time = time.time() - start_time
print(f"Indexing completed in {indexing_time:.2f} seconds.")

# Function to list indexed documents for debugging
def list_indexed_documents():
    index = open_dir(index_dir)
    with index.searcher() as searcher:
        doc_count = searcher.doc_count()
        print(f"Total documents indexed: {doc_count}")
        for docnum in range(doc_count):
            stored_fields = searcher.stored_fields(docnum)
            print(stored_fields)

# Function to perform a search in the Whoosh index
def search_recipes(query_str):
    index = open_dir(index_dir)
    with index.searcher(weighting=scoring.TF_IDF()) as searcher:  # Using TF-IDF scoring
        query_parser = MultifieldParser(["title", "directions"], schema=index.schema)
        query = query_parser.parse(query_str)
        
        # Perform the search, limiting results to the top 5
        results = searcher.search(query, limit=5)
        return [(result['title'], result['directions'], result.score) for result in results]

# Perform searches using existing queries
queries = ["curry", "chicken", "vegetarian", "dessert", "easy dinner"]
for query in queries:
    results = search_recipes(query)
    print(f"Search results for '{query}':")
    for title, directions, score in results:
        print(f"Title: {title}, Directions: {directions}, Score: {score:.2f}")
    print()

# Evaluate the performance of the Whoosh-based searching

# Calculate the average time for indexing and searching operations over multiple runs
total_search_time = 0
for query in queries:
    start_time = time.time()
    results = search_recipes(query)
    search_time = time.time() - start_time
    total_search_time += search_time
    print(f"Search results for '{query}' in {search_time:.2f} seconds:")

average_search_time = total_search_time / len(queries)
print(f"Average search time: {average_search_time:.2f} seconds.")

# Calculate and analyze the space complexity of the index directory on disk
total_size = 0
for dirpath, dirnames, filenames in os.walk(index_dir):
    for f in filenames:
        fp = os.path.join(dirpath, f)
        total_size += os.path.getsize(fp)

print(f"Total index directory size: {total_size / 1024:.2f} KB")

# Analysis of files in the index directory
print("\nIndex Directory Contents:")
for dirpath, dirnames, filenames in os.walk(index_dir):
    for f in filenames:
        fp = os.path.join(dirpath, f)
        print(f"{f} ({os.path.getsize(fp) / 1024:.2f} KB) - Purpose and usage detailed in documentation.")


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/food_recipes.csv'

# Q9 Performance Discussion

In [2]:
import os
import pandas as pd
import time
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT
from whoosh.qparser import MultifieldParser

# Define the schema
schema = Schema(
    title=TEXT(stored=True),
    directions=TEXT(stored=True)
)

# Directory for the index
index_dir = 'indexdir'
if not os.path.exists(index_dir):
    os.mkdir(index_dir)

# Create or open the index
if os.path.exists(os.path.join(index_dir, 'segments.gen')):
    index = open_dir(index_dir)
else:
    index = create_in(index_dir, schema)

# Load your data
recipe_data_path = './data/food_recipes.csv'
recipe_data = pd.read_csv(recipe_data_path)

# Measure indexing time
start_time = time.time()
writer = index.writer()
for _, row in recipe_data.iterrows():
    writer.add_document(title=row['title'], directions=row['directions'])
writer.commit()  
indexing_time = time.time() - start_time
print(f"Indexing completed in {indexing_time:.2f} seconds.")

# Function to perform a search and measure its time
def search_recipes(query_str):
    start_time = time.time()
    index = open_dir(index_dir)
    with index.searcher() as searcher:
        query_parser = MultifieldParser(["title", "directions"], schema=index.schema)
        query = query_parser.parse(query_str)
        results = searcher.search(query, limit=5)
        search_time = time.time() - start_time
        return results, search_time

# Search and calculate average search time
queries = ["curry", "chicken", "vegetarian", "dessert", "easy dinner"]
total_search_time = 0
for query in queries:
    results, search_time = search_recipes(query)
    total_search_time += search_time
    print(f"Search results for '{query}' in {search_time:.2f} seconds:")

average_search_time = total_search_time / len(queries)
print(f"Average search time: {average_search_time:.2f} seconds.")

# Calculate the space complexity of the index directory
total_size = 0
for dirpath, dirnames, filenames in os.walk(index_dir):
    for f in filenames:
        fp = os.path.join(dirpath, f)
        total_size += os.path.getsize(fp)

print(f"Total index directory size: {total_size / 1024:.2f} KB")

# Analysis of files in the index directory
print("\nIndex Directory Contents:")
for dirpath, dirnames, filenames in os.walk(index_dir):
    for f in filenames:
        fp = os.path.join(dirpath, f)
        print(f"{f} ({os.path.getsize(fp) / 1024:.2f} KB) - Purpose and usage detailed in documentation.")


Indexing completed in 42.81 seconds.
Search results for 'curry' in 0.04 seconds:
Search results for 'chicken' in 0.04 seconds:
Search results for 'vegetarian' in 0.01 seconds:
Search results for 'dessert' in 0.01 seconds:
Search results for 'easy dinner' in 0.01 seconds:
Average search time: 0.02 seconds.
Total index directory size: 64951.10 KB

Index Directory Contents:
MAIN_3b7rll35o3ua08i4.seg (32474.71 KB) - Purpose and usage detailed in documentation.
MAIN_7bxlaamck4wmq5f6.seg (32474.71 KB) - Purpose and usage detailed in documentation.
MAIN_WRITELOCK (0.00 KB) - Purpose and usage detailed in documentation.
_MAIN_1.toc (1.68 KB) - Purpose and usage detailed in documentation.
