# Final Project Data Preprocessing
This notebook will go over all preprocessing necessary for our optimization model.



## Import Modules

In [1]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm, trange
import seaborn as sns

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [2]:
# Define the local file path
file_path_recipes = 'recipes.csv'
file_path_reviews = 'reviews.csv'

# Load the data into a DataFrame
recipes = pd.read_csv(file_path_recipes)
reviews = pd.read_csv(file_path_reviews)

# Display the DataFrame
print(recipes.head())
print(reviews.head())

print("recipes shape: " + str(recipes.shape))
print("reviews shape: " + str(reviews.shape))

   RecipeId                               Name  AuthorId      AuthorName  \
0        38  Low-Fat Berry Blue Frozen Dessert      1533          Dancer   
1        39                            Biryani      1567        elly9812   
2        40                      Best Lemonade      1566  Stephen Little   
3        41     Carina's Tofu-Vegetable Kebabs      1586         Cyclopz   
4        42                       Cabbage Soup      1538       Duckie067   

  CookTime PrepTime TotalTime         DatePublished  \
0    PT24H    PT45M  PT24H45M  1999-08-09T21:46:00Z   
1    PT25M     PT4H   PT4H25M  1999-08-29T13:12:00Z   
2     PT5M    PT30M     PT35M  1999-09-05T19:52:00Z   
3    PT20M    PT24H  PT24H20M  1999-09-03T14:54:00Z   
4    PT30M    PT20M     PT50M  1999-09-19T06:19:00Z   

                                         Description  \
0  Make and share this Low-Fat Berry Blue Frozen ...   
1  Make and share this Biryani recipe from Food.com.   
2  This is from one of my  first Good House 

## Observing Data
Here, I analyzed the data to observe the shape and types of datapoints in the given data. This helps to better understand the data to see what pre-processing will be necessary.

In [3]:
descriptions = recipes['Description']
print(descriptions.head())
print(descriptions.shape)
print(recipes.columns)

0    Make and share this Low-Fat Berry Blue Frozen ...
1    Make and share this Biryani recipe from Food.com.
2    This is from one of my  first Good House Keepi...
3    This dish is best prepared a day in advance to...
4    Make and share this Cabbage Soup recipe from F...
Name: Description, dtype: object
(522517,)
Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')


## Data Preprocessing

In [43]:
from transformers import AutoTokenizer, AutoModel
import torch

#Find all recipes with descriptions that include breakfast, lunch, or dinner
rows = []
labels = []
key_words = ["breakfast", "lunch", "dinner"]

for i in range(recipes.shape[0]):
  description = recipes['Description'][i]
  for j in range(len(key_words)):
    if isinstance(description, str) and key_words[j] in description.lower():
      rows.append(i)
      labels.append(key_words[j])
      break

description_training_data = recipes.iloc[rows]

description_training_data.drop(columns=['AuthorName', 'CookTime', 'PrepTime', 'TotalTime','DatePublished','Images', 'RecipeInstructions'], inplace=True)

description_training_data.reset_index(inplace=True, drop=True)
description_training_labels = pd.DataFrame(labels, columns=['labels'])

print(description_training_data.shape)

print(description_training_data.head())
print(description_training_labels.head())


(30552, 21)
   RecipeId                       Name  AuthorId  \
0        88         Breakfast Burritos      1575   
1       148                  Bugwiches      1579   
2       170     Amish Six Layer Dinner      1534   
3       314  Thai Citrus Chicken Salad    148316   
4       557   Baked Breakfast Potatoes      1533   

                                         Description RecipeCategory  \
0  Make and share this Breakfast Burritos recipe ...      Breakfast   
1  A little bit of fun for the kids this summer. ...   Lunch/Snacks   
2  Make and share this Amish Six Layer Dinner rec...      Vegetable   
3  This is a healthy and delicious summer salad. ...        Chicken   
4  Make and share this Baked Breakfast Potatoes r...      Breakfast   

                                            Keywords  \
0                  c("Mexican", "< 60 Mins", "Easy")   
1  c("Tuna", "Very Low Carbs", "Low Protein", "Lo...   
2  c("Meat", "Low Cholesterol", "Weeknight", "Ove...   
3  c("Oranges", "Poultry

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  description_training_data.drop(columns=['AuthorName', 'CookTime', 'PrepTime', 'TotalTime','DatePublished','Images', 'RecipeInstructions'], inplace=True)


In [44]:
#Impute
from sklearn.impute import SimpleImputer

#Imputers for different data types
imp_string = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='')
imp_num = SimpleImputer(missing_values=np.nan, strategy="mean")

# Impute over all columns
for column in description_training_data.columns:
    if isinstance(description_training_data[column][0], str) or column == "RecipeYield":
        column_imp = imp_string.fit_transform(description_training_data[column].to_frame())
    else:
        column_imp = imp_num.fit_transform(description_training_data[column].to_frame())

    description_training_data[column] = column_imp.flatten()

print(description_training_data.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  description_training_data[column] = column_imp.flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  description_training_data[column] = column_imp.flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  description_training_data[column] = column_imp.flatten()
A value is trying to be set on a c

   RecipeId                       Name  AuthorId  \
0      88.0         Breakfast Burritos    1575.0   
1     148.0                  Bugwiches    1579.0   
2     170.0     Amish Six Layer Dinner    1534.0   
3     314.0  Thai Citrus Chicken Salad  148316.0   
4     557.0   Baked Breakfast Potatoes    1533.0   

                                         Description RecipeCategory  \
0  Make and share this Breakfast Burritos recipe ...      Breakfast   
1  A little bit of fun for the kids this summer. ...   Lunch/Snacks   
2  Make and share this Amish Six Layer Dinner rec...      Vegetable   
3  This is a healthy and delicious summer salad. ...        Chicken   
4  Make and share this Baked Breakfast Potatoes r...      Breakfast   

                                            Keywords  \
0                  c("Mexican", "< 60 Mins", "Easy")   
1  c("Tuna", "Very Low Carbs", "Low Protein", "Lo...   
2  c("Meat", "Low Cholesterol", "Weeknight", "Ove...   
3  c("Oranges", "Poultry", "Citrus",

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  description_training_data[column] = column_imp.flatten()


In [41]:
#Save Processed Data
description_training_data.to_csv('description_training_data.csv', index=False)
description_training_labels.to_csv('description_training_labels.csv', index=False)

print(description_training_data.shape)
print(description_training_labels.shape)

(30552, 22)
(30552, 1)


In [5]:
#Embeddings Using BERT Model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

#Helper Functions
def get_bert_embedding(text):
    """
    Get BERT embeddings for a given text.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Pass tokens through BERT model
    with torch.no_grad():  # No gradient calculation needed for inference
        outputs = bert_model(**inputs)

    # Extract embeddings from the last hidden state (shape: [batch_size, seq_len, hidden_dim])
    hidden_states = outputs.last_hidden_state

    # Optional: Average the embeddings across the sequence for a single vector representation
    sentence_embedding = hidden_states.mean(dim=1)

    return sentence_embedding

def convert_to_bert(df, column):
  bert_embeddings = []
  for i in range(df.shape[0]):
    if i % 1000 == 0:
      print(f'{column} Rows Converted: {i}')
    bert_embeddings.append(get_bert_embedding(df[column][i]))

  return bert_embeddings

In [6]:
#convert all string instances into bert embeddings
bert_converted = {}

#Test########
# cur_col = description_training_data["Name"].to_frame()
# bert_converted["Name"] = convert_to_bert(cur_col, "Name")
# print(pd.DataFrame([bert_converted])["Name"].head())
# #############

for column_name in description_training_data.columns:
  print(f'{column_name}: {description_training_data[column_name][0]} - {type(description_training_data[column_name][0])}')
  if isinstance(description_training_data[column_name][0], str):
    cur_col = description_training_data[column_name].to_frame()
    bert_converted[column_name] = convert_to_bert(cur_col, column_name)

# Update the DataFrame after processing all columns
for column_name, bert_data in bert_converted.items():
    description_training_data[column_name] = bert_data

RecipeId: 88 - <class 'numpy.int64'>
Name: Breakfast Burritos - <class 'str'>
Name Rows Converted: 0
Name Rows Converted: 1000
Name Rows Converted: 2000
Name Rows Converted: 3000
Name Rows Converted: 4000
Name Rows Converted: 5000
Name Rows Converted: 6000
Name Rows Converted: 7000
Name Rows Converted: 8000
Name Rows Converted: 9000
Name Rows Converted: 10000
Name Rows Converted: 11000
Name Rows Converted: 12000
Name Rows Converted: 13000
Name Rows Converted: 14000
Name Rows Converted: 15000
Name Rows Converted: 16000
Name Rows Converted: 17000
Name Rows Converted: 18000
Name Rows Converted: 19000
Name Rows Converted: 20000
Name Rows Converted: 21000
Name Rows Converted: 22000
Name Rows Converted: 23000
Name Rows Converted: 24000
Name Rows Converted: 25000
Name Rows Converted: 26000
Name Rows Converted: 27000
Name Rows Converted: 28000
Name Rows Converted: 29000
Name Rows Converted: 30000
AuthorId: 1575 - <class 'numpy.int64'>
AuthorName: lindaWWJD - <class 'str'>
AuthorName Rows Conve

KeyboardInterrupt: 

In [None]:
#Save Processed Data
description_training_data.to_csv('description_training_data.csv', index=False)

description_training_labels.to_csv('description_training_labels.csv', index=False)

print(description_training_data.head())
print(description_training_data.shape)
print(description_training_labels.shape)

In [None]:
#Load in Training Data and Labels
file_path_description_train = 'description_training_data.csv'
file_path_description_label = 'description_training_labels.csv'

X = pd.read_csv(file_path_description_train)
Y = pd.read_csv(file_path_description_label)

print(X.columns)


In [None]:
#train/test split
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=.3, random_state=42)

In [None]:
#Using Logistic Regressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

model = LogisticRegression()
model.fit(X_train, Y_train)

Y_pred = model.predict(X_validation)

# Evaluate the model
print("Classification Report:\n", classification_report(Y_validation, Y_pred))
print("Accuracy:", accuracy_score(Y_validation, Y_pred))

# Now we will use BERT for clustering
We will use k-means clustering with a k value of 20 and the BERT embedding to find recipes with close similarity scores.

In [None]:
#Preparing Data

#Load in Training Data and Labels
recipes = 'recipes.csv'
X = pd.read_csv(recipes)

#Convert
cur_col = X["Description"].to_frame()
X["Description"] = convert_to_bert(cur_col, "Description")

converted_col = X["Description"].to_frame()

converted_col.to_csv('bert_descriptions.csv', index=False)

print(converted_col.head())


In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=20, random_state=0)

kmeans.fit(converted_col)

labels = kmeans.labels_

centers = kmeans.cluster_centers_

print("Labels:", labels)
print("Centers:", centers)

In [None]:
#Saving Kmeans Data
labels_df = pd.DataFrame(labels)
centers_df = pd.DataFrame(centers)

labels_df.to_csv('kmeans_labels.csv', index=False)
centers_df.to_csv('kmeans_centers.csv', index=False)