In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
recipes_df = pd.read_csv('C:/Users/arsen/Healthylicious/data/cleaned/csv/recipes_dataset.csv')


In [3]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(recipes_df['Ingredients'])

def get_content_based_recommendations(user_ingredients, top_n=10):
    user_tfidf = tfidf_vectorizer.transform([user_ingredients])
    cosine_sim = cosine_similarity(user_tfidf, tfidf_matrix)
    similar_indices = cosine_sim.argsort().flatten()[::-1][:top_n]
    return recipes_df.iloc[similar_indices]

In [4]:
# Example usage
user_ingredients_str = "cranberry, orange, butter, sugar"
content_based_recommendations = get_content_based_recommendations(user_ingredients_str)
print(content_based_recommendations)

      recipeId                     Category  \
79          80                    Appetizer   
246        247                    Breakfast   
936        937                  Main Course   
1016      1017              Sauce,Side Dish   
49          50                      Dessert   
61          62  Appetizer,Breakfast,Dessert   
9           10                    Appetizer   
71          72                    Appetizer   
490        491                      Dessert   
653        654                      Dessert   

                                                  Title  Total Time  \
79    Warm Brie with Honeyed Cranberry Walnut Fruit ...          15   
246                     Cranberry Orange Muffins Recipe          30   
936                  Cranberry Orange Glazed Ham Recipe          95   
1016                             Cranberry Sauce Recipe          15   
49                           Sugared Cranberries Recipe          63   
61                        Cranberry Orange Rolls Recipe 

## Model Evaluation

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold

# Load datasets
recipes_df = pd.read_csv('C:/Users/arsen/Healthylicious/data/cleaned/csv/recipes_dataset.csv')

In [6]:
# TF-IDF Vectorizer with Fine-Tuning
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english', 
    ngram_range=(1, 2), 
    max_df=0.8, 
    min_df=5,
    token_pattern=r'\b\w+\b'
)
tfidf_matrix = tfidf_vectorizer.fit_transform(recipes_df['Ingredients'])

In [7]:
def get_content_based_recommendations(user_ingredients, top_n=10):
    user_tfidf = tfidf_vectorizer.transform([user_ingredients])
    cosine_sim = cosine_similarity(user_tfidf, tfidf_matrix)
    similar_indices = cosine_sim.argsort().flatten()[::-1][:top_n]
    return recipes_df.iloc[similar_indices]

def evaluate_recommendations(user_ingredients, test_ingredients, top_n=10):
    recommendations = get_content_based_recommendations(user_ingredients, top_n=top_n)
    recommended_ingredients = set(recommendations['Ingredients'].str.split(', ').sum())
    test_ingredients_set = set(test_ingredients.split(', '))

    true_positives = len(recommended_ingredients & test_ingredients_set)
    predicted_positives = len(recommended_ingredients)
    actual_positives = len(test_ingredients_set)

    precision = true_positives / predicted_positives if predicted_positives > 0 else 0
    recall = true_positives / actual_positives if actual_positives > 0 else 0
    
    return precision, recall

In [8]:
# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=1)
precision_scores = []
recall_scores = []

for train_index, test_index in kf.split(recipes_df):
    train_df, test_df = recipes_df.iloc[train_index], recipes_df.iloc[test_index]
    tfidf_vectorizer.fit(train_df['Ingredients'])
    tfidf_matrix = tfidf_vectorizer.transform(train_df['Ingredients'])
    
    for _, test_row in test_df.iterrows():
        user_ingredients = train_df.sample(n=1)['Ingredients'].values[0]
        test_ingredients = test_row['Ingredients']
        precision, recall = evaluate_recommendations(user_ingredients, test_ingredients)
        precision_scores.append(precision)
        recall_scores.append(recall)

average_precision = sum(precision_scores) / len(precision_scores)
average_recall = sum(recall_scores) / len(recall_scores)


In [9]:
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")

Average Precision: 0.07400032851521744
Average Recall: 0.47552593608749427


## Using Hybrid Recommendation System to Improve Model Performance

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load datasets
recipes_df = pd.read_csv('C:/Users/arsen/Healthylicious/data/cleaned/csv/recipes_dataset.csv')
ratings_df = pd.read_csv('C:/Users/arsen/Healthylicious/data/cleaned/csv/ratings_dataset.csv')

In [11]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.8, min_df=5)
tfidf_matrix = tfidf_vectorizer.fit_transform(recipes_df['Ingredients'])

In [12]:
def get_content_based_recommendations(user_ingredients, top_n=10):
    user_tfidf = tfidf_vectorizer.transform([user_ingredients])
    cosine_sim = cosine_similarity(user_tfidf, tfidf_matrix)
    similar_indices = cosine_sim.argsort().flatten()[::-1][:top_n]
    return recipes_df.iloc[similar_indices]

# Function to evaluate recommendations
def evaluate_recommendations(user_ingredients, test_ingredients, top_n=10):
    recommendations = get_content_based_recommendations(user_ingredients, top_n=top_n)
    recommended_ingredients = set(recommendations['Ingredients'].str.split(', ').sum())
    test_ingredients_set = set(test_ingredients.split(', '))

    # Calculate precision and recall
    true_positives = len(recommended_ingredients & test_ingredients_set)
    predicted_positives = len(recommended_ingredients)
    actual_positives = len(test_ingredients_set)

    precision = true_positives / predicted_positives if predicted_positives > 0 else 0
    recall = true_positives / actual_positives if actual_positives > 0 else 0
    
    return precision, recall

In [13]:
# Load user ratings
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'recipeId', 'rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Use KNNBasic for collaborative filtering
algo = KNNBasic()
algo.fit(trainset)

# Predict ratings for test set
predictions = algo.test(testset)

# Evaluate collaborative filtering model
accuracy_rmse = accuracy.rmse(predictions)
accuracy_mae = accuracy.mae(predictions)

print(f"Collaborative Filtering - RMSE: {accuracy_rmse}")
print(f"Collaborative Filtering - MAE: {accuracy_mae}")

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0192
MAE:  0.7990
Collaborative Filtering - RMSE: 1.019232485961972
Collaborative Filtering - MAE: 0.7990223389135312


In [14]:
# Example usage with cross-validation for content-based filtering
kf = KFold(n_splits=5, shuffle=True, random_state=1)
precision_scores = []
recall_scores = []

for train_index, test_index in kf.split(recipes_df):
    train_df, test_df = recipes_df.iloc[train_index], recipes_df.iloc[test_index]
    tfidf_vectorizer.fit(train_df['Ingredients'])
    tfidf_matrix = tfidf_vectorizer.transform(train_df['Ingredients'])
    
    for _, test_row in test_df.iterrows():
        user_ingredients = train_df.sample(n=1)['Ingredients'].values[0]  # Menggunakan satu bahan dari train set
        test_ingredients = test_row['Ingredients']  # Bahan dari test set untuk evaluasi
        precision, recall = evaluate_recommendations(user_ingredients, test_ingredients)
        precision_scores.append(precision)
        recall_scores.append(recall)

average_precision = sum(precision_scores) / len(precision_scores)
average_recall = sum(recall_scores) / len(recall_scores)

print(f"Content-Based Filtering - Average Precision: {average_precision}")
print(f"Content-Based Filtering - Average Recall: {average_recall}")

Content-Based Filtering - Average Precision: 0.0730352732703867
Content-Based Filtering - Average Recall: 0.4800387661475886


In [15]:
# Hybrid Recommendation System
def hybrid_recommendations(user_id, user_ingredients, top_n=10):
    # Get collaborative filtering recommendations
    user_inner_id = algo.trainset.to_inner_uid(user_id)
    user_ratings = algo.trainset.ur[user_inner_id]
    similar_items = [algo.trainset.to_raw_iid(inner_id) for inner_id in algo.get_neighbors(user_inner_id, k=top_n)]
    
    # Get content-based recommendations
    content_recommendations = get_content_based_recommendations(user_ingredients, top_n=top_n)
    content_recommendation_ids = content_recommendations['recipeId'].tolist()
    
    # Combine recommendations
    combined_recommendations = list(set(similar_items) | set(content_recommendation_ids))
    
    # Limit to top_n recommendations
    combined_recommendations = combined_recommendations[:top_n]
    
    return recipes_df[recipes_df['recipeId'].isin(combined_recommendations)]

# Evaluate Hybrid Recommendations
def evaluate_hybrid_recommendations(user_id, user_ingredients, test_ingredients, top_n=10):
    recommendations = hybrid_recommendations(user_id, user_ingredients, top_n=top_n)
    recommended_ingredients = set(recommendations['Ingredients'].str.split(', ').sum())
    test_ingredients_set = set(test_ingredients.split(', '))

    # Calculate precision and recall
    true_positives = len(recommended_ingredients & test_ingredients_set)
    predicted_positives = len(recommended_ingredients)
    actual_positives = len(test_ingredients_set)

    precision = true_positives / predicted_positives if predicted_positives > 0 else 0
    recall = true_positives / actual_positives if actual_positives > 0 else 0
    
    return precision, recall


In [16]:
# Example usage with cross-validation for hybrid filtering
precision_scores = []
recall_scores = []

for train_index, test_index in kf.split(recipes_df):
    train_df, test_df = recipes_df.iloc[train_index], recipes_df.iloc[test_index]
    tfidf_vectorizer.fit(train_df['Ingredients'])
    tfidf_matrix = tfidf_vectorizer.transform(train_df['Ingredients'])
    
    for _, test_row in test_df.iterrows():
        user_id = ratings_df.sample(n=1)['userId'].values[0]  # Menggunakan satu user dari dataset ratings
        user_ingredients = train_df.sample(n=1)['Ingredients'].values[0]  # Menggunakan satu bahan dari train set
        test_ingredients = test_row['Ingredients']  # Bahan dari test set untuk evaluasi
        precision, recall = evaluate_hybrid_recommendations(user_id, user_ingredients, test_ingredients)
        precision_scores.append(precision)
        recall_scores.append(recall)

average_precision = sum(precision_scores) / len(precision_scores)
average_recall = sum(recall_scores) / len(recall_scores)

print(f"Hybrid Filtering - Average Precision: {average_precision}")
print(f"Hybrid Filtering - Average Recall: {average_recall}")

Hybrid Filtering - Average Precision: 0.07223341230436234
Hybrid Filtering - Average Recall: 0.4850172125450051
