# CAC Project 1 (SNA + RS)

In [None]:
import pandas as pd

members = pd.read_csv('data/pp_members.csv')
recipes = pd.read_csv('data/pp_recipes.csv')
reviews = pd.read_csv('data/pp_reviews.csv')

# Exploratory Data Analysis and Processing

In [None]:
members

In [None]:
recipes

In [None]:
reviews

In [None]:
import ast

def ing_process(x, ing_or_quant):
    """Extract one field from every entry of a stringified ingredients dict.

    Parameters
    ----------
    x : str
        Python-literal text of a dict; only the dict's first value is used and
        it must be a sequence of indexable entries.
    ing_or_quant : int
        Position to read from each entry (the notebook passes 0 to build
        `ingredients_pp` and 1 to build `quantities_pp`).

    Returns
    -------
    list or None
        ``[entry[ing_or_quant] for entry in first_value]``, or ``None`` (after
        printing the offending value) when ``x`` is not a parsable literal or
        the parsed value is not a non-empty dict.
    """
    try:
        ing_list = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only literal-parsing failures mark a bad row; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        print(x)
        return None

    try:
        res = list(ing_list.values())[0]
    except (AttributeError, IndexError, TypeError):
        # Parsed fine, but it is not a dict or the dict is empty.
        print(ing_list)
        return None

    return [entry[ing_or_quant] for entry in res]

recipes['ingredients_pp'] = recipes['ingredients'].apply(ing_process, args=(0,))

In [None]:
recipes['ingredients_pp']
recipes['ingredients_pp'][0]

In [None]:
recipes['quantities_pp'] = recipes['ingredients'].apply(ing_process, args=(1,))
recipes['quantities_pp']

In [None]:
# Python types present in the parsed column. Only the last expression of a cell is
# displayed by Jupyter, so this line and the next one produce no visible output here.
recipes['ingredients_pp'].apply(type).unique()

# Rows for which ing_process returned None (ingredients could not be parsed).
recipes[recipes['ingredients_pp'].apply(type) == type(None)]

# Remove those rows by index label. This rebinds `recipes`; the index is not reset,
# so the surviving rows keep their original labels.
recipes = recipes.drop(recipes[recipes['ingredients_pp'].apply(type) == type(None)].index)

In [None]:
import itertools 
from collections import defaultdict

# Create edges for recipes, based on ingredients in common as weight
def ing_freq_edge_weight(df,min_weight=0):
    """Build a weighted recipe-recipe edge list from shared ingredients.

    Every ingredient shared by two recipes contributes ``1 / frequency`` to the
    weight of the edge between them, where ``frequency`` is the number of times
    the ingredient occurs across all `ingredients_pp` lists of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an `ingredients_pp` column holding one iterable of
        ingredients per row.
    min_weight : number, default 0
        Edges whose accumulated weight is below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns `from`, `to`, `weight`; `from`/`to` are index labels of ``df``
        and each undirected pair appears once, with ``from < to``.
    """
    from collections import Counter

    # Occurrences of each ingredient over all recipes, counted in a single pass
    # instead of materialising df.iloc[i] for every ingredient.
    ingredients_freq = Counter(
        ingredient
        for ingredient_list in df['ingredients_pp']
        for ingredient in ingredient_list
    )

    long_df = df.explode('ingredients_pp')
    graph_structure = defaultdict(dict)

    for ingredient, rows in long_df.groupby('ingredients_pp'):
        # Get all unique pairs of recipes containing this ingredient
        pairs = itertools.combinations(rows.index.unique(), 2)

        # Calculate weight based on ingredient frequency
        weight = 1 / ingredients_freq[ingredient]

        # Update the graph structure with the weight for each pair
        for a, b in pairs:
            if b in graph_structure[a]:
                graph_structure[a][b] += weight
                graph_structure[b][a] += weight
            else:
                graph_structure[a][b] = weight
                graph_structure[b][a] = weight

    # Both directions are stored above; keep a single (a, b) per pair with a < b.
    index_pairs = [
        (a, b, graph_structure[a][b])
        for a in graph_structure
        for b in graph_structure[a]
        if (a < b) and (graph_structure[a][b] >= min_weight)
    ]

    pairs_df = pd.DataFrame(index_pairs, columns=['from', 'to','weight'])
    return pairs_df

In [None]:
# Let pandas display up to 39 columns before truncating wide frames.
pd.set_option('display.max_columns', 39)

# get the top 1000 recipes with the most ratings
# (sorted by number_of_ratings, descending; original index labels are kept)
top_recipes = recipes.sort_values(by='number_of_ratings', ascending=False)[0:1000]

top_recipes.head()

# Building the graph

In [None]:
from igraph import Graph, plot

# Weighted recipe-recipe edge list (columns: from, to, weight) for the top recipes.
links = ing_freq_edge_weight(top_recipes)

import matplotlib.pyplot as plt

# Undirected graph whose vertices carry the columns of top_recipes as attributes
# (later cells read net.vs['title'] and net.vs[...]['new_recipe_id']).
# NOTE(review): with use_vids=False igraph is documented to match the edge endpoints
# against the FIRST column of `vertices`, while `links` holds DataFrame index labels;
# confirm that the first column of top_recipes coincides with its index.
net = Graph.DataFrame(links, directed=False, use_vids=False,vertices=top_recipes)

In [None]:
weights = net.es['weight']

min_weight = min(weights)
max_weight = max(weights)

print(min_weight, max_weight)

## Clustering

In [None]:
# cluster detection algorithm (multilevel / Louvain)
# The algorithm is randomised and later cells hard-code cluster-dependent ids, so the
# RNG is seeded first. python-igraph draws its random numbers from Python's `random`
# module unless another generator has been installed.
import random

random.seed(42)
clusters = net.community_multilevel(weights=net.es['weight'])

In [None]:
node_titles = net.vs['title']

# Print the clusters with node titles
for i, cluster in enumerate(clusters):
    node_titles_in_cluster = [node_titles[node_id] for node_id in cluster]
    no_of_recipes = len(node_titles_in_cluster)
    if no_of_recipes > 1:
        print(f"[{i}] ({no_of_recipes}) {' || '.join(node_titles_in_cluster)}")

In [None]:
import numpy as np

num_clusters = len(set(clusters.membership))
print(num_clusters)

# One colour per cluster, sampled from the tab10 colormap (also used by the
# per-cluster plots, which index this list by cluster number).
vertex_colors = [plt.cm.tab10(i) for i in np.linspace(0, 1, num_clusters)]

# igraph expects one colour per VERTEX: look each vertex's colour up through the
# cluster it belongs to, otherwise the colours do not reflect the clustering.
cluster_vertex_colors = [vertex_colors[cluster_id] for cluster_id in clusters.membership]

# Plot the graph with clusters highlighted
plot(net, target="plots/graph_ingredients_clusters.png", vertex_size=5, vertex_color=cluster_vertex_colors, edge_width=0.1, edge_arrow_size=0.4 ,arrow_width=1, bbox=(0,0,1000,1000))

In [None]:
cluster_sizes = [len(cluster) for cluster in clusters]
print(cluster_sizes)

plt.hist(cluster_sizes, bins=range(0, 400, 20))

In [None]:
# Force-directed layout of the whole graph (one coordinate per vertex of `net`).
layout = net.layout_fruchterman_reingold()

# Plot each cluster separately
for i, cluster in enumerate(clusters):
    subgraph = net.subgraph(cluster)

    if(len(subgraph.vs) < 5): # minimum number of nodes in a cluster
        continue

    # The subgraph has its own, renumbered vertices, so the full-graph layout does not
    # line up with it; compute a layout for exactly these vertices instead.
    subgraph_layout = subgraph.layout_fruchterman_reingold()

    plot(subgraph, target=f"plots/cluster_{i}.png", vertex_size=5, vertex_color=vertex_colors[i], edge_width=0.1, edge_arrow_size=0.4, arrow_width=1, bbox=(0,0,500,500), layout=subgraph_layout)

In [None]:
print(f"Number of recipes: {len(top_recipes)}")

def create_reviews_dict(recipes_ids):
    """Select the reviews of the given recipes and group them per recipe.

    Reads the notebook-level `reviews` frame (columns used: recipe_id,
    member_id, rating).

    Parameters
    ----------
    recipes_ids : list-like
        Recipe ids to keep.

    Returns
    -------
    (pandas.DataFrame, dict)
        The matching rows of `reviews`, and a mapping
        ``{recipe_id: [(member_id, rating), ...]}``. The mapping is ``{}`` when
        no review matches.
    """
    top_reviews = reviews[reviews['recipe_id'].isin(recipes_ids)]
    # Iterating the groups directly avoids GroupBy.apply on the grouping column
    # (deprecated in recent pandas) and always yields a plain dict.
    reviews_dict = {
        recipe_id: list(zip(group['member_id'], group['rating']))
        for recipe_id, group in top_reviews.groupby('recipe_id')
    }
    return top_reviews, reviews_dict
    

top_reviews, reviews_dict = create_reviews_dict(top_recipes['new_recipe_id'])

# Density = share of (recipe, member) cells that actually hold a rating.
density = len(top_reviews) / (len(top_recipes) * len(top_reviews['member_id'].unique()))

# Sparsity is the complement: the share of cells that are empty.
sparsity = 1 - density

print(f"Sparsity: {sparsity:.2%}")

In [None]:
# get largest cluster (a list of vertex indices of `net`)
top_cluster = max(clusters, key=len)
print(len(top_cluster))

# get the ids in the top cluster: read the `new_recipe_id` vertex attribute of each member
top_cluster_ids = [net.vs[node_id]['new_recipe_id'] for node_id in top_cluster]
top_cluster_ids

# Reviews restricted to the recipes of the largest cluster, plus the per-recipe
# {recipe_id: [(member_id, rating), ...]} mapping.
# network_reviews, network_reviews_dict = create_reviews_dict(top_recipes['new_recipe_id'])
cluster_reviews, cluster_reviews_dict = create_reviews_dict(top_cluster_ids)

# Recommender Systems

In [None]:
# Create dataframe for the cluster dictionary
def create_ratings_df(ratings_dict):
    """Flatten ``{recipe_id: [(member_id, rating), ...]}`` into a long table.

    Returns a DataFrame with the columns member_id, recipe_id, rating and one
    row per (member, recipe) rating, in dictionary order.
    """
    flat_rows = [
        (member_id, recipe_id, rating)
        for recipe_id, member_ratings in ratings_dict.items()
        for member_id, rating in member_ratings
    ]
    return pd.DataFrame(flat_rows, columns=['member_id', 'recipe_id', 'rating'])

ratings_sample_df = create_ratings_df(cluster_reviews_dict)

### Split the data between train and test

In [None]:
from surprise import Dataset, Reader, SVD, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split

# Find maximum and minimum rating
max_rating = ratings_sample_df['rating'].max()
min_rating = ratings_sample_df['rating'].min()
print(max_rating, min_rating)

# Surprise needs the rating scale up front; use the range observed in the sample.
reader = Reader(rating_scale=(min_rating, max_rating))

# Columns are read positionally as (user, item, rating) = (member_id, recipe_id, rating).
data = Dataset.load_from_df(ratings_sample_df, reader)

print(data.df.head())

# Fixed seed so the 80/20 split, and every metric computed from it, is reproducible.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### Content-based filtering

Select user from top cluster.

In [None]:
# Fallback used only if the loop below finds no matching row.
ref_uid = 0

# Pick the member of the first rating row whose recipe is in the top cluster and is
# not recipe 9533.
# NOTE(review): ratings_sample_df is built from reviews of top-cluster recipes only, so
# the `in top_cluster_ids` test looks always true; the reason for excluding recipe 9533
# is not documented anywhere in the notebook.
for (member, recipe, rating) in ratings_sample_df.values: 
    if recipe in top_cluster_ids and recipe != 9533:
        ref_uid = member
        break

print(int(ref_uid))

Using tf-idf for description terms to identify similarities between descriptions and as a result similarity between recipes. Given a recipe, suggest others. 

In [None]:
# Content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Recipes without a description cannot be vectorised, so they are left out.
top_recipes_filtered = top_recipes.dropna(subset=['description'])

# TF-IDF representation of every description (one row per recipe).
tfidf_vectorizer = TfidfVectorizer()
item_features = tfidf_vectorizer.fit_transform(top_recipes_filtered['description'])

# Pairwise cosine similarity between all descriptions.
cosine_sim = cosine_similarity(item_features, item_features)

# Per recipe: positions of the 20 most similar recipes, skipping rank 0.
# NOTE(review): skipping rank 0 assumes the most similar entry is the recipe itself;
# recipes with identical descriptions could occupy that slot instead.
top_n_indices = np.argsort(-cosine_sim, axis=1)[:, 1:21]

# Translate row positions into recipe ids.
top_n_recipe_ids = top_recipes_filtered['new_recipe_id'].values[top_n_indices]

# {recipe_id: [ids of its 20 most similar recipes]}
similar_recipes_dict = {recipe_id: top_n_recipe_ids[i].tolist() for i, recipe_id in enumerate(top_recipes_filtered['new_recipe_id'])}

list(similar_recipes_dict.keys())[0:5]

In [None]:
# Get the top 10 similar recipes for a specific recipe
recipe_id = 205530
similar_recipe_ids = similar_recipes_dict[recipe_id]
similar_recipe_ids

In [None]:
# get recipe entry for the recipe_id
recipe = top_recipes_filtered[top_recipes_filtered['new_recipe_id'] == recipe_id]
recipe2 = top_recipes_filtered[top_recipes_filtered['new_recipe_id'] == similar_recipe_ids[1]]

recipe

In [None]:
recipe2

In [None]:
# given a user recommend a given number of recipes based on what he liked
def recommend_recipes(user_id, n_recipes=10):
    """Recommend recipes similar to the ones a user rated above 3.

    Uses the notebook-level `ratings_sample_df`, `similar_recipes_dict` and
    `top_recipes`.

    Parameters
    ----------
    user_id : member id as stored in ``ratings_sample_df['member_id']``.
    n_recipes : int, default 10
        Maximum number of recipes to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_recipes`` rows of `top_recipes` that the user has not rated,
        ordered by `average_rating` (descending).
    """
    user_ratings = ratings_sample_df[ratings_sample_df['member_id'] == user_id]

    rated_recipe_ids = set(user_ratings['recipe_id'])

    top_rated_recipes = user_ratings[user_ratings['rating'] > 3]['recipe_id'].values

    print(user_ratings)

    recommendations = []

    for recipe_id in top_rated_recipes:
        # Recipes without a description were dropped before the similarity table was
        # built, so a liked recipe may have no entry; skip it instead of raising.
        for similar_recipe_id in similar_recipes_dict.get(recipe_id, []):
            if similar_recipe_id not in rated_recipe_ids:
                recommendations.append(similar_recipe_id)

        # Stop collecting once the candidate pool is large enough.
        if len(recommendations) >= n_recipes * 20:
            break

    recommendations = top_recipes[top_recipes['new_recipe_id'].isin(recommendations)].sort_values(by='average_rating', ascending=False).head(n_recipes)

    return recommendations


content_normal = recommend_recipes(ref_uid)

### Content-based filtering for the top cluster

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Same content-based pipeline as for all top recipes, restricted to the largest cluster.
top_cluster_recipes = top_recipes[top_recipes['new_recipe_id'].isin(top_cluster_ids)]

# Recipes without a description cannot be vectorised.
top_recipes_top_cluster_filtered = top_cluster_recipes.dropna(subset=['description'])

# NOTE: tfidf_vectorizer, item_features, cosine_sim, top_n_indices and top_n_recipe_ids
# are rebound here and no longer describe the full top-recipes set after this cell.
tfidf_vectorizer = TfidfVectorizer()
item_features = tfidf_vectorizer.fit_transform(top_recipes_top_cluster_filtered['description'])

cosine_sim = cosine_similarity(item_features, item_features)

# Positions of the 20 most similar recipes per recipe, skipping rank 0.
# NOTE(review): assumes rank 0 is the recipe itself; if fewer than 21 recipes remain
# the slice simply returns fewer columns.
top_n_indices = np.argsort(-cosine_sim, axis=1)[:, 1:21]

top_n_recipe_ids = top_recipes_top_cluster_filtered['new_recipe_id'].values[top_n_indices]

# {recipe_id: [ids of its most similar recipes inside the cluster]}
similar_recipes_cluster_dict = {recipe_id: top_n_recipe_ids[i].tolist() for i, recipe_id in enumerate(top_recipes_top_cluster_filtered['new_recipe_id'])}

list(similar_recipes_cluster_dict.keys())[0:5]

In [None]:
def recommend_recipes_top_cluster(user_id, n_recipes=10):
    """Recommend recipes from the largest cluster that resemble a user's liked ones.

    Uses the notebook-level `ratings_sample_df`, `top_cluster_ids`,
    `similar_recipes_cluster_dict` and `top_recipes`.

    Parameters
    ----------
    user_id : member id as stored in ``ratings_sample_df['member_id']``.
    n_recipes : int, default 10
        Maximum number of recipes to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_recipes`` rows of `top_recipes` that the user has not rated,
        ordered by `average_rating` (descending).
    """
    user_ratings = ratings_sample_df[ratings_sample_df['member_id'] == user_id]

    rated_recipe_ids = set(user_ratings['recipe_id'])

    top_rated_recipes = user_ratings[user_ratings['rating'] > 3]['recipe_id'].values

    # Only liked recipes that belong to the top cluster can seed recommendations.
    top_rated_recipes = np.intersect1d(top_rated_recipes, top_cluster_ids)

    print(user_ratings)

    recommendations = []

    for recipe_id in top_rated_recipes:
        # Cluster recipes without a description have no similarity entry; skip them
        # instead of raising KeyError.
        for similar_recipe_id in similar_recipes_cluster_dict.get(recipe_id, []):
            if similar_recipe_id not in rated_recipe_ids:
                recommendations.append(similar_recipe_id)

        # Stop collecting once the candidate pool is large enough.
        if len(recommendations) >= n_recipes * 20:
            break

    # get top recipes based on rating
    recommendations = top_recipes[top_recipes['new_recipe_id'].isin(recommendations)].sort_values(by='average_rating', ascending=False).head(n_recipes)

    return recommendations


content_cluster = recommend_recipes_top_cluster(ref_uid)

Comparison between the cluster-restricted and the normal (all top recipes) content-based recommendations.

In [None]:
content_normal[['title', 'description', 'average_rating']]

In [None]:
content_cluster[['title', 'description', 'average_rating']]

### Apply Algorithms

In [None]:
from surprise import KNNWithMeans, SVD
from surprise import accuracy

def calculate_precision_recall(predictions, threshold=4):
    """Precision, recall and F1 of "rating >= threshold" as a relevance label.

    ``predictions`` is an iterable of 5-tuples whose third and fourth items are
    the true and the estimated rating. A metric is 0 when its denominator is 0.
    Returns ``(precision, recall, f1)``.
    """
    true_pos = false_pos = false_neg = 0

    for _uid, _iid, true_r, est, _details in predictions:
        predicted_relevant = est >= threshold
        actually_relevant = true_r >= threshold
        if predicted_relevant and actually_relevant:
            true_pos += 1
        elif predicted_relevant:
            false_pos += 1
        elif actually_relevant:
            false_neg += 1

    predicted_total = true_pos + false_pos
    relevant_total = true_pos + false_neg

    precision = true_pos / predicted_total if predicted_total > 0 else 0
    recall = true_pos / relevant_total if relevant_total > 0 else 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

def evaluate_algorithm(algo, trainset, testset):
    """Fit ``algo`` on ``trainset`` and score it on ``testset``.

    Returns ``(algo, rmse, precision, recall, f1)``, where ``algo`` is the same
    (now fitted) object and the last three values come from
    ``calculate_precision_recall`` with its default threshold.
    """
    algo.fit(trainset)
    test_predictions = algo.test(testset)
    # RMSE is evaluated before the precision/recall metrics.
    scores = (accuracy.rmse(test_predictions), *calculate_precision_recall(test_predictions))
    return (algo, *scores)

### Item-based collaborative filtering

In [None]:
# Item based collaborative filtering
# SVD is a matrix-factorisation model; the two KNNWithMeans models are the item-based
# ones (user_based=False), with cosine and Pearson similarity respectively.
algo_svd = SVD()
algo_knn_item = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False})
algo_knn_item2 = KNNWithMeans(sim_options={'name': 'pearson', 'user_based': False})

# evaluate_algorithm fits each model on trainset and scores it on testset.
model_svd, rmse_svd, precision_svd, recall_svd, f1_svd = evaluate_algorithm(algo_svd, trainset, testset)
model_knn, rmse_knn, precision_knn, recall_knn, f1_knn = evaluate_algorithm(algo_knn_item, trainset, testset)   
model_knn2, rmse_knn2, precision_knn2, recall_knn2, f1_knn2 = evaluate_algorithm(algo_knn_item2, trainset, testset)

# Report: SVD, KNN (cosine), KNN2 (Pearson).
print("--------------------")
print(f"SVD RMSE: {rmse_svd}")
print(f"SVD Precision: {precision_svd}")
print(f"SVD Recall: {recall_svd}")
print(f"SVD F1: {f1_svd}")
print("--------------------")
print(f"KNN RMSE: {rmse_knn}")
print(f"KNN Precision: {precision_knn}")
print(f"KNN Recall: {recall_knn}")
print(f"KNN F1: {f1_knn}")
print("--------------------")
print(f"KNN2 RMSE: {rmse_knn2}")
print(f"KNN2 Precision: {precision_knn2}")
print(f"KNN2 Recall: {recall_knn2}")
print(f"KNN2 F1: {f1_knn2}")

### User-based collaborative filtering

In [None]:
# User based collaborative filtering
# KNNWithMeans over users (user_based=True) with cosine and Pearson similarity.
algo_knn_user = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True})
algo_knn_user2 = KNNWithMeans(sim_options={'name': 'pearson', 'user_based': True})

# Same train/test split as the item-based models, so the metrics are comparable.
model_knn_user, rmse_knn_user, precision_knn_user, recall_knn_user, f1_knn_user = evaluate_algorithm(algo_knn_user, trainset, testset)
model_knn_user2, rmse_knn_user2, precision_knn_user2, recall_knn_user2, f1_knn_user2 = evaluate_algorithm(algo_knn_user2, trainset, testset)

# Report: KNN User (cosine), KNN User2 (Pearson).
print("--------------------")
print(f"KNN User RMSE: {rmse_knn_user}")
print(f"KNN User Precision: {precision_knn_user}")
print(f"KNN User Recall: {recall_knn_user}")
print(f"KNN User F1: {f1_knn_user}")
print("--------------------")
print(f"KNN User2 RMSE: {rmse_knn_user2}")
print(f"KNN User2 Precision: {precision_knn_user2}")
print(f"KNN User2 Recall: {recall_knn_user2}")
print(f"KNN User2 F1: {f1_knn_user2}")

### Predict for a user

Predict ratings for user-item pairs using a given model. If a user-item pair has a known rating in the input data, it retains that rating, otherwise, it predicts a rating using the model.

In [None]:
# Inner (Surprise-internal) ids of every user and item present in the training set.
users_seen = trainset.all_users()
items_seen = trainset.all_items()

# Convert the inner ids back to the raw ids used in ratings_sample_df.
raw_users_seen_ids = [trainset.to_raw_uid(uid) for uid in users_seen]
raw_items_seen_ids = [trainset.to_raw_iid(iid) for iid in items_seen]

user_test_id = 1

# Check if a user is in the training set
if user_test_id in raw_users_seen_ids:
    print(f"User {user_test_id} is in the training set")
else:
    print(f"User {user_test_id} is not in the training set")

    # Fall back to the training user whose mean rating is the lowest
    # (trainset.ur maps an inner user id to that user's (item, rating) pairs).
    user_test_id = min(raw_users_seen_ids, key=lambda x: np.mean([r for (_, r) in trainset.ur[trainset.to_inner_uid(x)]]))    

    print(f" selecting user {user_test_id} instead")


def _write_prediction_rows(path, predictions):
    """Write ``{(user_id, item_id): rating}`` as `user,item,rating` lines to ``path``."""
    with open(path, 'w') as f:
        for (user_id, item_id), rating in predictions.items():
            f.write(f"{user_id},{item_id},{rating}\n")


def predict_ratings(model, raw_users, raw_items, data,
                    known_path='predictions.csv', final_path='final_predictions.csv'):
    """Build a complete user x item rating table from known and predicted ratings.

    Parameters
    ----------
    model : object with ``predict(user_id, item_id)`` returning an object
        that exposes the estimate as ``.est``.
    raw_users, raw_items : iterables of raw user / item ids to fill in.
    data : pandas.DataFrame
        Three columns read positionally as (user_id, item_id, rating); these
        known ratings are kept as they are and never re-predicted.
    known_path : str or None, default 'predictions.csv'
        Where the known ratings are written before predicting; ``None`` skips
        the file.
    final_path : str or None, default 'final_predictions.csv'
        Where the completed table is written; ``None`` skips the file.
        With the default paths every call overwrites the same two files, so
        pass distinct paths to keep the output of several models.

    Returns
    -------
    dict
        ``{(user_id, item_id): rating}`` holding every known rating plus one
        estimate for every other (user, item) combination of the inputs.
    """
    predictions = {}
    # Add to the predictions dictionary all ratings that were previously known
    for user_id, item_id, rating in data.itertuples(index=False):
        predictions[(user_id, item_id)] = rating

    # Save the known ratings to file
    if known_path is not None:
        _write_prediction_rows(known_path, predictions)

    # Fill every (user, item) cell that has no known rating with the model's estimate.
    for user_id in raw_users:
        for item_id in raw_items:
            if (user_id, item_id) not in predictions:
                prediction = model.predict(user_id, item_id)
                predictions[(user_id, item_id)] = prediction.est

    if final_path is not None:
        _write_prediction_rows(final_path, predictions)

    return predictions

In [None]:
# Item based
pred_svd = predict_ratings(model_svd, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)
pred_knn = predict_ratings(model_knn, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)
pred_knn2 = predict_ratings(model_knn2, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)

In [None]:
# User based
pred_knn_user = predict_ratings(model_knn_user, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)
pred_knn_user2 = predict_ratings(model_knn_user2, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)

### Get Top-N Recommendations

In [None]:
# Get the top-n (default 5) recommendations for a user from the recipes he hasn't rated yet
def get_top_n_recommendations(predictions, user_id, data, n=5):
    """Return the ``n`` best-predicted recipes the user has not rated.

    Parameters
    ----------
    predictions : dict
        ``{(user_id, recipe_id): rating}`` as returned by predict_ratings.
    user_id : id of the user to recommend for.
    data : pandas.DataFrame
        Known ratings with `member_id` and `recipe_id` columns.
    n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    list of (recipe_id, predicted_rating)
        Sorted by predicted rating, highest first.
    """
    # Recipes this user has already rated. The complement must be taken per user:
    # "rated by some other user" says nothing about whether this user rated it too.
    items_rated = set(data.loc[data['member_id'] == user_id, 'recipe_id'])

    user_predictions = [
        (iid, pred)
        for (uid, iid), pred in predictions.items()
        if uid == user_id and iid not in items_rated
    ]

    user_predictions.sort(key=lambda x: x[1], reverse=True)
    return user_predictions[:n]

In [None]:
top_n_svd = get_top_n_recommendations(pred_svd, user_test_id, ratings_sample_df,20)
top_n_knn = get_top_n_recommendations(pred_knn, user_test_id, ratings_sample_df,20)
top_n_knn2 = get_top_n_recommendations(pred_knn2, user_test_id, ratings_sample_df,20)

print("User ID:", user_test_id)
print("--------------------")
print("SVD: ", top_n_svd)
print("KNN: ", top_n_knn)
print("KNN2: ", top_n_knn2)

In [None]:
top_n_knn_user = get_top_n_recommendations(pred_knn_user, user_test_id, ratings_sample_df,20)
top_n_knn_user2 = get_top_n_recommendations(pred_knn_user2, user_test_id, ratings_sample_df,20)

print("--------------------")
print("KNN User: ", top_n_knn_user)
print("KNN User2: ", top_n_knn_user2)

In [None]:
# Show ratings_sample_df ratings distribution
plt.hist(ratings_sample_df['rating'], bins=20)

#### Results

In [None]:
results_item_svd = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_svd])][['title', 'description', 'average_rating']]
results_item_svd

In [None]:
results_item_knn = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn])][['title', 'description', 'average_rating']]
results_item_knn

In [None]:
results_item_knn2 = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn2])][['title', 'description', 'average_rating']]
results_item_knn2

In [None]:
results_user_knn = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn_user])][['title', 'description', 'average_rating']]
results_user_knn

In [None]:
results_user_knn2 = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn_user2])][['title', 'description', 'average_rating']]
results_user_knn2

# Metrics

In [None]:
uid = 498271
user_ratings = ratings_sample_df[ratings_sample_df['member_id'] == uid]
user_ratings

## Precision at k

In [None]:
recipes_liked = top_recipes[top_recipes['new_recipe_id'].isin(user_ratings['recipe_id'])][['title', 'ingredients', 'average_rating']]
recipes_liked

### Item-based

In [None]:
# Top-20 unrated-recipe predictions for `uid` from the SVD model and the two
# item-based KNN models.
# NOTE(review): pred_knn_user and pred_knn_user2 previously held the full prediction
# dicts of the user-based KNN models (returned by predict_ratings). Rebinding them to
# top-N lists here means the earlier user-based top-N cell cannot be re-run after
# this one without first re-running the user-based prediction cell.
pred_svd_user = get_top_n_recommendations(pred_svd, uid, ratings_sample_df, 20)
pred_knn_user = get_top_n_recommendations(pred_knn, uid, ratings_sample_df, 20)
pred_knn_user2 = get_top_n_recommendations(pred_knn2, uid, ratings_sample_df, 20)
pred_svd_user

We identified the following ingredients as relevant: 
- chicken 
- tortilla
- chips
- tomatoes
- cheese
- spaghetti pasta
- pepper
- tomato sauce
- ground beef
- green beans

#### SVD

In [None]:
recipes_svd = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in pred_svd_user])].sort_values(by='average_rating', ascending=False)
recipes_svd[['title', 'ingredients', 'average_rating']]

In [None]:
# Hand-assigned relevance flags, one per row of recipes_svd in its current order
# (sorted by average_rating, descending).
# NOTE(review): the 20 values are hard-coded; the assignment raises if recipes_svd does
# not have exactly 20 rows, and the labels silently misalign if the recommended set or
# its ordering changes (e.g. after a different train/test split).
recipes_svd['relevant'] = [1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,0,1,0,1]
recipes_svd[['title', 'relevant']]

#### KNN Cosine

In [None]:
recipes_knn = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in pred_knn_user])].sort_values(by='average_rating', ascending=False)
recipes_knn[['title', 'ingredients', 'average_rating']]

#### KNN Pearson

In [None]:
recipes_knn2 = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in pred_knn_user2])].sort_values(by='average_rating', ascending=False)
recipes_knn2[['title', 'ingredients', 'average_rating']]

# NLP

## PLN

Creation of a sample of the dataset to test the NLP model.

In [None]:
pln_recipes_by_amount = recipes.sort_values(by='number_of_ratings', ascending=False)[0:10000]
pln_recipes_by_dates = recipes.sort_values(by='last_changed_date', ascending=False)[0:10000]
pln_recipes_by_amount = pln_recipes_by_amount.sort_values(by='last_changed_date')

Cleans the data by dropping rows with a missing description or missing ingredients, and selects which attributes to use in the model. The `attributes_mode` argument selects one of three modes:
1) `1` (description): uses only the description
2) `2` (ingredients): uses only the ingredients
3) `3` (all): uses both the description and the ingredients

In [None]:
def clean_select_attributes(df, attributes_mode):
    """Drop incomplete rows and build the text attribute used by the NLP models.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `description`, `ingredients_pp` (iterable per row)
        and `last_changed_date`. The caller's frame is not modified.
    attributes_mode : int
        1 -> description only,
        2 -> ingredients only (joined with spaces),
        3 -> description followed by the ingredients.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        The attribute text indexed by `last_changed_date`, and the cleaned
        frame with the new `attributes` column (its own index is untouched).

    Raises
    ------
    ValueError
        If ``attributes_mode`` is not 1, 2 or 3.
    """
    # Work on an explicit copy so adding a column never touches the caller's frame.
    df = df.dropna(subset=['description','ingredients_pp']).copy()

    if attributes_mode == 1:
        df['attributes'] = df['description']
    elif attributes_mode == 2:
        df['attributes'] = [' '.join(map(str, ings)) for ings in df['ingredients_pp']]
    elif attributes_mode == 3:
        df['attributes'] = [
            desc + ' ' + ' '.join(map(str, ings))
            for desc, ings in zip(df['description'], df['ingredients_pp'])
        ]
    else:
        raise ValueError(f"attributes_mode must be 1, 2 or 3, got {attributes_mode!r}")

    # Re-index a copy: assigning .index on the Series taken from the frame would
    # mutate an object the frame may still hand out for df['attributes'].
    text_sample = df['attributes'].copy()
    text_sample.index = df['last_changed_date']
    return text_sample, df

In [None]:
amount_description_text_sample, amount_description_df = clean_select_attributes(pln_recipes_by_amount, 1)
amount_ingredients_text_sample, amount_ingredients_df = clean_select_attributes(pln_recipes_by_amount, 2)
amount_both_text_sample, amount_both_df = clean_select_attributes(pln_recipes_by_amount, 3)

dates_description_text_sample, dates_description_df = clean_select_attributes(pln_recipes_by_dates, 1)
dates_ingredients_text_sample, dates_ingredients_df = clean_select_attributes(pln_recipes_by_dates, 2)
dates_both_text_sample, dates_both_df = clean_select_attributes(pln_recipes_by_dates, 3)

Prepare the data for the model using NLP techniques (tokenization, lemmatization and stemming).

In [1]:
import nltk
# Resources for the lemmatizer (wordnet) and for pos_tag (perceptron tagger).
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# The cleaning functions call stopwords.words('english'), which needs this corpus;
# without it a fresh environment fails with LookupError.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Utilizador\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Utilizador\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, tagged on its own.

    The first letter of the tag produced by ``pos_tag`` selects adjective (J),
    noun (N), verb (V) or adverb (R); every other tag falls back to noun.
    """
    tagged = pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag map to noun.
    return wordnet.NOUN

def clean_text_lemmatization(text_sample):
    """Normalise each document and lemmatize its words.

    Per document: non-letters become spaces, the text is lower-cased, English
    stop words are removed and every remaining word is lemmatized with the POS
    returned by ``get_wordnet_pos``. Returns a Series of space-joined strings.
    """
    lemmatizer = WordNetLemmatizer()
    sw = set(stopwords.words('english'))

    def _lemmatize_document(document):
        words = re.sub('[^a-zA-Z]', ' ', document).lower().split()
        kept = (lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words if w not in sw)
        return ' '.join(kept)

    return text_sample.apply(_lemmatize_document)

def clean_text_stemming(text_sample):
    """Normalise each document and reduce its words to Porter stems.

    Per document: non-letters become spaces, the text is lower-cased, English
    stop words are removed and the remaining words are stemmed. Returns a
    Series of space-joined strings.
    """
    ps = PorterStemmer()
    sw = set(stopwords.words('english'))

    def _stem_document(document):
        words = re.sub('[^a-zA-Z]', ' ', document).lower().split()
        return ' '.join(ps.stem(w) for w in words if w not in sw)

    return text_sample.apply(_stem_document)

def clean_text_tokenization(text_sample):
    """Normalise each document and split it into tokens.

    Per document: non-letters become spaces, the text is lower-cased and
    English stop words are removed. Returns a Series whose values are lists of
    tokens (not joined strings).
    """
    sw = set(stopwords.words('english'))

    def _tokenize_document(document):
        words = re.sub('[^a-zA-Z]', ' ', document).lower().split()
        return [w for w in words if w not in sw]

    return text_sample.apply(_tokenize_document)

In [None]:
tokenized_amount_description_text_sample = clean_text_tokenization(amount_description_text_sample)
tokenized_amount_ingredients_text_sample = clean_text_tokenization(amount_ingredients_text_sample)
tokenized_amount_both_text_sample = clean_text_tokenization(amount_both_text_sample)

tokenized_dates_description_text_sample = clean_text_tokenization(dates_description_text_sample)
tokenized_dates_ingredients_text_sample = clean_text_tokenization(dates_ingredients_text_sample)
tokenized_dates_both_text_sample = clean_text_tokenization(dates_both_text_sample)

lemmatized_amount_description_text_sample = clean_text_lemmatization(amount_description_text_sample)
lemmatized_amount_ingredients_text_sample = clean_text_lemmatization(amount_ingredients_text_sample)
lemmatized_amount_both_text_sample = clean_text_lemmatization(amount_both_text_sample)

lemmatized_dates_description_text_sample = clean_text_lemmatization(dates_description_text_sample)
lemmatized_dates_ingredients_text_sample = clean_text_lemmatization(dates_ingredients_text_sample)
lemmatized_dates_both_text_sample = clean_text_lemmatization(dates_both_text_sample)

stemmed_amount_description_text_sample = clean_text_stemming(amount_description_text_sample)
stemmed_amount_ingredients_text_sample = clean_text_stemming(amount_ingredients_text_sample)
stemmed_amount_both_text_sample = clean_text_stemming(amount_both_text_sample)

stemmed_dates_description_text_sample = clean_text_stemming(dates_description_text_sample)
stemmed_dates_ingredients_text_sample = clean_text_stemming(dates_ingredients_text_sample)
stemmed_dates_both_text_sample = clean_text_stemming(dates_both_text_sample)

#### Apply vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()


def get_matrix(text_sample, vectorizer):
    """Fit ``vectorizer`` on ``text_sample`` and return the document-term matrix.

    Parameters
    ----------
    text_sample : iterable of documents
        Each document is either a string or a list/tuple of tokens; token
        sequences are joined with single spaces before vectorising.
    vectorizer : object with ``fit_transform``
        It is (re)fitted in place, exactly as before.

    Returns
    -------
    (matrix, fitted_vectorizer)
        ``fitted_vectorizer`` is an independent snapshot of the vectorizer as
        fitted on this sample, so it stays valid when the same vectorizer
        object is passed to a later call and refitted on other data.
    """
    import copy

    # The default text analyzers expect strings; tokenised documents are re-joined.
    documents = [
        ' '.join(doc) if isinstance(doc, (list, tuple)) else doc
        for doc in text_sample
    ]
    matrix = vectorizer.fit_transform(documents)
    return matrix, copy.deepcopy(vectorizer)

count_tokenized_amount_description_matrix, count_tokenized_amount_description_vectorizer = get_matrix(tokenized_amount_description_text_sample, count_vectorizer)
count_tokenized_amount_ingredients_matrix, count_tokenized_amount_ingredients_vectorizer = get_matrix(tokenized_amount_ingredients_text_sample, count_vectorizer)
count_tokenized_amount_both_matrix, count_tokenized_amount_both_vectorizer = get_matrix(tokenized_amount_both_text_sample, count_vectorizer)

count_tokenized_dates_description_matrix, count_tokenized_dates_description_vectorizer = get_matrix(tokenized_dates_description_text_sample, count_vectorizer)
count_tokenized_dates_ingredients_matrix, count_tokenized_dates_ingredients_vectorizer = get_matrix(tokenized_dates_ingredients_text_sample, count_vectorizer)
count_tokenized_dates_both_matrix, count_tokenized_dates_both_vectorizer = get_matrix(tokenized_dates_both_text_sample, count_vectorizer)

tfidf_tokenized_amount_description_matrix, tfidf_tokenized_amount_description_vectorizer = get_matrix(tokenized_amount_description_text_sample, tfidf_vectorizer)
tfidf_tokenized_amount_ingredients_matrix, tfidf_tokenized_amount_ingredients_vectorizer = get_matrix(tokenized_amount_ingredients_text_sample, tfidf_vectorizer)
tfidf_tokenized_amount_both_matrix, tfidf_tokenized_amount_both_vectorizer = get_matrix(tokenized_amount_both_text_sample, tfidf_vectorizer)

tfidf_tokenized_dates_description_matrix, tfidf_tokenized_dates_description_vectorizer = get_matrix(tokenized_dates_description_text_sample, tfidf_vectorizer)
tfidf_tokenized_dates_ingredients_matrix, tfidf_tokenized_dates_ingredients_vectorizer = get_matrix(tokenized_dates_ingredients_text_sample, tfidf_vectorizer)
tfidf_tokenized_dates_both_matrix, tfidf_tokenized_dates_both_vectorizer = get_matrix(tokenized_dates_both_text_sample, tfidf_vectorizer)

count_lemmatized_amount_description_matrix, count_lemmatized_amount_description_vectorizer = get_matrix(lemmatized_amount_description_text_sample, count_vectorizer)
count_lemmatized_amount_ingredients_matrix, count_lemmatized_amount_ingredients_vectorizer = get_matrix(lemmatized_amount_ingredients_text_sample, count_vectorizer)
count_lemmatized_amount_both_matrix, count_lemmatized_amount_both_vectorizer = get_matrix(lemmatized_amount_both_text_sample, count_vectorizer)

count_lemmatized_dates_description_matrix, count_lemmatized_dates_description_vectorizer = get_matrix(lemmatized_dates_description_text_sample, count_vectorizer)
count_lemmatized_dates_ingredients_matrix, count_lemmatized_dates_ingredients_vectorizer = get_matrix(lemmatized_dates_ingredients_text_sample, count_vectorizer)
count_lemmatized_dates_both_matrix, count_lemmatized_dates_both_vectorizer = get_matrix(lemmatized_dates_both_text_sample, count_vectorizer)

tfidf_lemmatized_amount_description_matrix, tfidf_lemmatized_amount_description_vectorizer = get_matrix(lemmatized_amount_description_text_sample, tfidf_vectorizer)
tfidf_lemmatized_amount_ingredients_matrix, tfidf_lemmatized_amount_ingredients_vectorizer = get_matrix(lemmatized_amount_ingredients_text_sample, tfidf_vectorizer)
tfidf_lemmatized_amount_both_matrix, tfidf_lemmatized_amount_both_vectorizer = get_matrix(lemmatized_amount_both_text_sample, tfidf_vectorizer)

tfidf_lemmatized_dates_description_matrix, tfidf_lemmatized_dates_description_vectorizer = get_matrix(lemmatized_dates_description_text_sample, tfidf_vectorizer)
tfidf_lemmatized_dates_ingredients_matrix, tfidf_lemmatized_dates_ingredients_vectorizer = get_matrix(lemmatized_dates_ingredients_text_sample, tfidf_vectorizer)
tfidf_lemmatized_dates_both_matrix, tfidf_lemmatized_dates_both_vectorizer = get_matrix(lemmatized_dates_both_text_sample, tfidf_vectorizer)

count_stemmed_amount_description_matrix, count_stemmed_amount_description_vectorizer = get_matrix(stemmed_amount_description_text_sample, count_vectorizer)
count_stemmed_amount_ingredients_matrix, count_stemmed_amount_ingredients_vectorizer = get_matrix(stemmed_amount_ingredients_text_sample, count_vectorizer)
count_stemmed_amount_both_matrix, count_stemmed_amount_both_vectorizer = get_matrix(stemmed_amount_both_text_sample, count_vectorizer)

count_stemmed_dates_description_matrix, count_stemmed_dates_description_vectorizer = get_matrix(stemmed_dates_description_text_sample, count_vectorizer)
count_stemmed_dates_ingredients_matrix, count_stemmed_dates_ingredients_vectorizer = get_matrix(stemmed_dates_ingredients_text_sample, count_vectorizer)
count_stemmed_dates_both_matrix, count_stemmed_dates_both_vectorizer = get_matrix(stemmed_dates_both_text_sample, count_vectorizer)

tfidf_stemmed_amount_description_matrix, tfidf_stemmed_amount_description_vectorizer = get_matrix(stemmed_amount_description_text_sample, tfidf_vectorizer)
tfidf_stemmed_amount_ingredients_matrix, tfidf_stemmed_amount_ingredients_vectorizer = get_matrix(stemmed_amount_ingredients_text_sample, tfidf_vectorizer)
tfidf_stemmed_amount_both_matrix, tfidf_stemmed_amount_both_vectorizer = get_matrix(stemmed_amount_both_text_sample, tfidf_vectorizer)

tfidf_stemmed_dates_description_matrix, tfidf_stemmed_dates_description_vectorizer = get_matrix(stemmed_dates_description_text_sample, tfidf_vectorizer)
tfidf_stemmed_dates_ingredients_matrix, tfidf_stemmed_dates_ingredients_vectorizer = get_matrix(stemmed_dates_ingredients_text_sample, tfidf_vectorizer)
tfidf_stemmed_dates_both_matrix, tfidf_stemmed_dates_both_vectorizer = get_matrix(stemmed_dates_both_text_sample, tfidf_vectorizer)

In [None]:
from collections import Counter
import numpy as np
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()

n_topics = 10

def get_keys(topic_matrix):
    """Return, for each row of `topic_matrix`, the index of its largest column as a plain list."""
    dominant_topic_per_row = topic_matrix.argmax(axis=1)
    return dominant_topic_per_row.tolist()
def keys_to_counts(keys):
    """Tally how often each key occurs.

    Prints the sorted (key, count) pairs and returns a tuple
    (categories, counts) of two parallel lists ordered by ascending key.
    """
    ordered_tally = sorted(Counter(keys).items())
    print(ordered_tally)

    categories, counts = [], []
    for category, occurrences in ordered_tally:
        categories.append(category)
        counts.append(occurrences)
    return (categories, counts)

def get_top_n_words(n, keys, document_term_matrix, count_vectorizer):
    '''
    Returns one string per topic id that occurs in `keys`, in ascending
    topic-id order. Each string holds the n highest-weighted words of the
    documents assigned to that topic, heaviest first, separated by spaces.

    The result therefore lines up with the sorted categories returned by
    keys_to_counts, even when some topic ids have no documents.

    Parameters
    ----------
    n : int
        Number of words to keep per topic.
    keys : sequence of int
        Topic id of each document; keys[i] belongs to row i of
        document_term_matrix.
    document_term_matrix : scipy sparse matrix or 2-D numpy array
        Document-term weights, one row per document.
    count_vectorizer : object exposing get_feature_names_out()
        Feature j of the returned names is the word for matrix column j.
    '''
    keys = np.asarray(keys)
    # Fetch the vocabulary once; rebuilding it for every word is wasteful.
    feature_names = count_vectorizer.get_feature_names_out()

    top_words = []
    # Iterate over the topic ids that actually occur, not range(count): the ids
    # are not guaranteed to be contiguous (a topic may have no documents).
    for topic in np.unique(keys):
        topic_rows = document_term_matrix[np.flatnonzero(keys == topic)]
        # Column-wise total weight of every term over the topic's documents.
        term_totals = np.asarray(topic_rows.sum(axis=0), dtype=float).ravel()

        # Indices of the n largest totals, largest first.
        top_n_word_indices = np.flip(np.argsort(term_totals)[-n:], 0)
        top_words.append(" ".join(feature_names[index] for index in top_n_word_indices))

    return top_words

def get_mean_topic_vectors(keys, two_dim_vectors, num_topics=None):
    '''
    Returns a list of centroid vectors, one per topic id in range(num_topics).

    Parameters
    ----------
    keys : sequence of int
        Topic id of each document; keys[i] belongs to two_dim_vectors[i].
    two_dim_vectors : array-like, one row per document
        Coordinates to average.
    num_topics : int, optional
        Number of topic ids to report. Defaults to the notebook-level
        `n_topics` when omitted.

    Returns
    -------
    list of numpy arrays
        Entry t is the mean of the rows assigned to topic t. A topic with no
        documents gets a NaN-filled vector, so the list can always be indexed
        by topic id.
    '''
    if num_topics is None:
        num_topics = n_topics

    keys = np.asarray(keys)
    vectors = np.asarray(two_dim_vectors)

    mean_topic_vectors = []
    for t in range(num_topics):
        vectors_in_topic = vectors[keys == t]
        if len(vectors_in_topic) == 0:
            # Nothing to average; np.mean / np.vstack would fail or warn here.
            mean_topic_vectors.append(np.full(vectors.shape[1:], np.nan))
        else:
            mean_topic_vectors.append(vectors_in_topic.mean(axis=0))
    return mean_topic_vectors

# Twenty hex colours, truncated to one colour per topic.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a", "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
])[:n_topics]

Create LDA Model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit an online LDA with n_topics components on the document-term counts.
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    random_state=0,
    verbose=0,
)
lda_topic_matrix = lda_model.fit_transform(count_tokenized_amount_description_matrix)

# Dominant topic per document, and how many documents each topic received.
lda_keys = get_keys(lda_topic_matrix)
lda_categories, lda_counts = keys_to_counts(lda_keys)

top_n_words_lda = get_top_n_words(
    10,
    lda_keys,
    count_tokenized_amount_description_matrix,
    count_tokenized_amount_description_vectorizer,
)

for topic_number, topic_words in enumerate(top_n_words_lda, start=1):
    print("Topic {}: ".format(topic_number), topic_words)

In [None]:
import matplotlib.pyplot as plt

top_3_words_lda = get_top_n_words(3, lda_keys, count_tokenized_amount_description_matrix, count_tokenized_amount_description_vectorizer)
# Number the topics from 1, matching the printed topic list of the LDA cell.
labels = ['Topic {}: \n'.format(i + 1) + top_3_words_lda[i] for i in range(len(top_3_words_lda))]

# One bar per topic: how many documents have that topic as their dominant one.
fig, ax = plt.subplots(figsize=(16,8))
ax.bar(lda_categories, lda_counts)
ax.set_xticks(lda_categories)
ax.set_xticklabels(labels)
ax.set_title('LDA topic counts')
# The documents being counted are recipes, not headlines.
ax.set_ylabel('Number of recipes');

TSNE Data Visualization

In [None]:
from sklearn.manifold import TSNE

# NOTE(review): scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and later
# releases drop the old name; switch the keyword if the installed version rejects it.
tsne_lda_model = TSNE(n_components=2, perplexity=50, learning_rate=100, 
                        n_iter=2000, verbose=1, random_state=0, angle=0.75)
# Project every document's topic distribution down to two dimensions.
tsne_lda_vectors = tsne_lda_model.fit_transform(lda_topic_matrix)
lda_mean_topic_vectors = get_mean_topic_vectors(lda_keys, tsne_lda_vectors)
assert len(lda_keys) == len(tsne_lda_vectors), "expected one t-SNE point per document"

plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), width=700, height=700)
plot.scatter(x=tsne_lda_vectors[:,0], y=tsne_lda_vectors[:,1], color=colormap[lda_keys])

# Label only the topics that actually received documents. top_3_words_lda has
# one entry per populated topic, in the same order as lda_categories, so it is
# indexed by position while centroids and colours are indexed by topic id.
for label_index, topic in enumerate(lda_categories):
    label = Label(x=lda_mean_topic_vectors[topic][0], y=lda_mean_topic_vectors[topic][1], 
                  text=top_3_words_lda[label_index], text_color=colormap[topic])
    plot.add_layout(label)

show(plot)

Get Reviews from the top topic

In [None]:
# lda_counts.index(...) is a position in the counts list; translate it to the
# topic id through lda_categories (the two lists are parallel).
largest_topic = lda_categories[lda_counts.index(max(lda_counts))]

# NOTE(review): these are row positions in the matrix the LDA was fitted on, not
# recipe ids from the dataset; confirm that create_reviews_dict expects positions.
largest_topic_recipes_ids = [i for i, key in enumerate(lda_keys) if key == largest_topic]
topic_reviews, topic_reviews_dict = create_reviews_dict(largest_topic_recipes_ids)
ratings_topics_sample_df = create_ratings_df(topic_reviews_dict)


# Time-Series Analysis

In [None]:
cluster_reviews.head()

In [None]:
cluster_reviews.shape

In [None]:
dates = sorted(list(cluster_reviews['last_modified_date']))
dates

In [None]:
def reviews_until_date(reviews, date):
    """Count the rows of `reviews` whose 'last_modified_date' is on or before `date`."""
    on_or_before = reviews['last_modified_date'] <= date
    return len(reviews[on_or_before])

reviews_until_date(cluster_reviews, dates[1])


In [None]:
# Cumulative number of reviews up to and including each distinct date.
# Computed directly from cluster_reviews in one pass (count per date, then a
# running sum) instead of re-filtering the whole frame once per review.
cumulative_review_counts = cluster_reviews['last_modified_date'].value_counts().sort_index().cumsum()
review_count = pd.DataFrame({
    'Date': cumulative_review_counts.index,
    'Count': cumulative_review_counts.to_numpy(),
})
review_count

In [None]:
def reviews_between_dates(reviews, initial_date, end_date):
    """Count the rows of `reviews` dated after `initial_date` and on or before `end_date`."""
    review_dates = reviews['last_modified_date']
    after_start = review_dates > initial_date
    up_to_end = review_dates <= end_date
    return len(reviews[after_start & up_to_end])


def bin_dates(dataset, bin_size=20):
    """Split the span of dataset['last_modified_date'] into `bin_size` equal intervals.

    Returns a list of bin_size + 1 edges: the first bin_size are
    'YYYY-MM-DD' strings, the last is the column's maximum value as stored.
    """
    date_column = dataset['last_modified_date']
    earliest, latest = date_column.min(), date_column.max()

    start = pd.to_datetime(earliest)
    step = (pd.to_datetime(latest) - start) / bin_size

    edges = [(start + step * k).strftime('%Y-%m-%d') for k in range(bin_size)]
    edges.append(latest)
    return edges


dates = bin_dates(cluster_reviews)

review_dates = cluster_reviews['last_modified_date']

binned_reviews = []
for i in range(len(dates) - 1):
    initial_date = dates[i]
    end_date = dates[i + 1]
    if i == 0:
        # Bins are (start, end]; close the first one on the left as well so that
        # reviews dated exactly on the earliest date are not lost.
        reviews_count = int(((review_dates >= initial_date) & (review_dates <= end_date)).sum())
    else:
        reviews_count = reviews_between_dates(cluster_reviews, initial_date, end_date)
    binned_reviews.append({'start_date': initial_date, 'end_date': end_date, 'review_count': reviews_count})

binned_reviews_df = pd.DataFrame(binned_reviews)
print(binned_reviews_df)

In [None]:
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Count', dpi=100):
    """Draw `y` against `x` as a red line on a new 15x4-inch figure and show it.

    `df` is accepted but not used; the plotted data come from `x` and `y`.
    """
    plt.figure(figsize=(15, 4), dpi=dpi)
    axes = plt.gca()
    axes.plot(x, y, color='tab:red')
    plt.xticks(rotation=70)
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()

plot_df(binned_reviews_df, x=binned_reviews_df['start_date'], y=binned_reviews_df['review_count'], title='Temporal Analysis of Reviews')

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse

def plot_decompositions(df, label, period=10): 
    """Plot the multiplicative and additive seasonal decompositions of df[label].

    Parameters
    ----------
    df : DataFrame holding the series.
    label : name of the column to decompose.
    period : seasonal period passed to seasonal_decompose.

    The multiplicative decomposition is skipped (with a printed notice) when the
    series contains zero or negative values, because seasonal_decompose rejects
    such input for model='multiplicative'. The additive plot is still drawn.
    """
    series = df[label]

    # Applies to every figure created from here on, including later cells.
    plt.rcParams.update({'figure.figsize': (16,12)})

    # Multiplicative Decomposition
    if (series <= 0).any():
        print("Skipping multiplicative decomposition: '{}' contains zero or negative values.".format(label))
    else:
        multiplicative_decomposition = seasonal_decompose(series, model='multiplicative', period=period)
        multiplicative_decomposition.plot().suptitle('Multiplicative Decomposition', fontsize=16)
        # Leave room at the top for the suptitle.
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Additive Decomposition
    additive_decomposition = seasonal_decompose(series, model='additive', period=period)
    additive_decomposition.plot().suptitle('Additive Decomposition', fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    plt.show()

In [None]:
plot_decompositions(binned_reviews_df, 'review_count', period=10)

In [None]:
plot_decompositions(review_count, 'Count', period=len(review_count) // 2)

In [None]:
def average_rating_between_dates(reviews, initial_date, end_date):
    """Mean 'rating' of the reviews dated after `initial_date` and on or before `end_date`."""
    review_dates = reviews['last_modified_date']
    in_window = (review_dates > initial_date) & (review_dates <= end_date)
    ratings_in_window = reviews[in_window]['rating']
    return ratings_in_window.mean()

def average_rating_until_date(reviews, date):
    """Mean 'rating' of the reviews dated on or before `date`."""
    on_or_before = reviews['last_modified_date'] <= date
    ratings_so_far = reviews[on_or_before]['rating']
    return ratings_so_far.mean()

def bin_dates(dataset, bin_size=20):
    """Split the span of dataset['last_modified_date'] into `bin_size` equal intervals.

    Re-definition of the bin_dates helper from the review-count cell, with the
    same behaviour. Returns a list of bin_size + 1 edges: the first bin_size are
    'YYYY-MM-DD' strings, the last is the column's maximum value as stored.
    """
    date_column = dataset['last_modified_date']
    earliest, latest = date_column.min(), date_column.max()

    start = pd.to_datetime(earliest)
    step = (pd.to_datetime(latest) - start) / bin_size

    edges = [(start + step * k).strftime('%Y-%m-%d') for k in range(bin_size)]
    edges.append(latest)
    return edges

dates = bin_dates(cluster_reviews, 50)

review_dates = cluster_reviews['last_modified_date']

binned_ratings = []

for i in range(len(dates) - 1):
    initial_date = dates[i]
    end_date = dates[i + 1]
    if i == 0:
        # Bins are (start, end]; close the first one on the left as well so that
        # ratings dated exactly on the earliest date are not lost.
        in_first_bin = (review_dates >= initial_date) & (review_dates <= end_date)
        average_rating = cluster_reviews.loc[in_first_bin, 'rating'].mean()
    else:
        average_rating = average_rating_between_dates(cluster_reviews, initial_date, end_date)
    binned_ratings.append({'start_date': initial_date, 'end_date': end_date, 'average_rating': average_rating})

binned_ratings_df = pd.DataFrame(binned_ratings)
print(binned_ratings_df)

In [None]:
binned_ratings_df['average_rating'].isna().sum()

In [None]:
plot_decompositions(binned_ratings_df, 'average_rating', period=len(binned_ratings_df) // 2)

# Prediction of the popularity of a recipe

Based on historical data, attempt to predict the popularity of a recipe.

In [None]:
import pandas as pd

# Previously this used cluster_reviews; it now uses only the reviews of the top recipes.
top_reviews = reviews[reviews['recipe_id'].isin(top_recipes['new_recipe_id'])]

def aggregate_reviews(review_data, time_interval='daily'):
    """Aggregate reviews per recipe and per time bucket.

    Parameters
    ----------
    review_data : DataFrame
        Needs the columns 'recipe_id', 'last_modified_date', 'rating',
        'likes' and 'review_id'. It is not modified.
    time_interval : {'daily', 'weekly', 'monthly'}
        Width of the time buckets. Weekly buckets use the 'W-MON' frequency.

    Returns
    -------
    DataFrame with one row per (recipe_id, bucket) and the columns
    'recipe_id', 'last_modified_date', 'rating' (mean), 'likes' (sum) and
    'review_count' (number of reviews).

    Raises
    ------
    ValueError
        If `time_interval` is not one of the supported values.
    """
    # NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
    # 'ME'; switch once the minimum supported pandas version allows it.
    frequencies = {'daily': 'D', 'weekly': 'W-MON', 'monthly': 'M'}
    if time_interval not in frequencies:
        raise ValueError(
            "time_interval must be one of {}, got {!r}".format(sorted(frequencies), time_interval)
        )

    # Work on a copy whose index is the parsed review date, as pd.Grouper(freq=...) requires.
    indexed_reviews = review_data.assign(
        last_modified_date=pd.to_datetime(review_data['last_modified_date'])
    ).set_index('last_modified_date')

    aggregated_data = (
        indexed_reviews
        .groupby(['recipe_id', pd.Grouper(freq=frequencies[time_interval])])
        .agg({
            'rating': 'mean',
            'likes': 'sum',
            'review_id': 'count',  # counting reviews
        })
        .reset_index()
        .rename(columns={'review_id': 'review_count'})
    )

    return aggregated_data

daily_reviews = aggregate_reviews(top_reviews, time_interval='daily')
weekly_reviews = aggregate_reviews(top_reviews, time_interval='weekly')
monthly_reviews = aggregate_reviews(top_reviews, time_interval='monthly')

In [None]:
daily_reviews.head()

In [None]:
weekly_reviews.head()

In [None]:
monthly_reviews.head()

## Measure correlation between rating, likes, and review count

### Daily

#### Correlation matrix

In [None]:
corr_matrix_daily = daily_reviews[['rating', 'likes', 'review_count']].corr()
corr_matrix_daily

#### Pearson Correlation

In [None]:
# Pairwise Pearson correlations between the daily aggregates.
pearson_corr_coefficient_likes_rating = daily_reviews['likes'].corr(daily_reviews['rating'], method='pearson')
pearson_corr_coefficient_likes_review_count = daily_reviews['likes'].corr(daily_reviews['review_count'], method='pearson')
pearson_corr_coefficient_rating_review_count = daily_reviews['rating'].corr(daily_reviews['review_count'], method='pearson')

print(
    "\nPearson Correlation Coefficients:",
    "Likes - Average Rating: {}".format(pearson_corr_coefficient_likes_rating),
    "Likes - Review Count: {}".format(pearson_corr_coefficient_likes_review_count),
    "Average Rating - Review Count: {}".format(pearson_corr_coefficient_rating_review_count),
    sep="\n",
)

#### Spearman rank coefficient

In [None]:
from scipy.stats import spearmanr

# Pairwise Spearman rank correlations between the daily aggregates
# (only the coefficient is kept; the p-value is discarded).
spearman_corr_coefficient_likes_rating = spearmanr(daily_reviews['likes'], daily_reviews['rating'])[0]
spearman_corr_coefficient_likes_review_count = spearmanr(daily_reviews['likes'], daily_reviews['review_count'])[0]
spearman_corr_coefficient_rating_review_count = spearmanr(daily_reviews['rating'], daily_reviews['review_count'])[0]

print(
    "\nSpearman Rank Correlation Coefficients:",
    "Likes - Average Rating: {}".format(spearman_corr_coefficient_likes_rating),
    "Likes - Review Count: {}".format(spearman_corr_coefficient_likes_review_count),
    "Average Rating - Review Count: {}".format(spearman_corr_coefficient_rating_review_count),
    sep="\n",
)

### Weekly

#### Correlation matrix

In [None]:
corr_matrix_weekly = weekly_reviews[['rating', 'likes', 'review_count']].corr()
corr_matrix_weekly

#### Pearson Correlation

In [None]:
# Pairwise Pearson correlations between the weekly aggregates.
pearson_corr_coefficient_likes_rating = weekly_reviews['likes'].corr(weekly_reviews['rating'], method='pearson')
pearson_corr_coefficient_likes_review_count = weekly_reviews['likes'].corr(weekly_reviews['review_count'], method='pearson')
pearson_corr_coefficient_rating_review_count = weekly_reviews['rating'].corr(weekly_reviews['review_count'], method='pearson')

print(
    "\nPearson Correlation Coefficients:",
    "Likes - Average Rating: {}".format(pearson_corr_coefficient_likes_rating),
    "Likes - Review Count: {}".format(pearson_corr_coefficient_likes_review_count),
    "Average Rating - Review Count: {}".format(pearson_corr_coefficient_rating_review_count),
    sep="\n",
)

#### Spearman rank coefficient

In [None]:
from scipy.stats import spearmanr

# Pairwise Spearman rank correlations between the weekly aggregates
# (only the coefficient is kept; the p-value is discarded).
spearman_corr_coefficient_likes_rating = spearmanr(weekly_reviews['likes'], weekly_reviews['rating'])[0]
spearman_corr_coefficient_likes_review_count = spearmanr(weekly_reviews['likes'], weekly_reviews['review_count'])[0]
spearman_corr_coefficient_rating_review_count = spearmanr(weekly_reviews['rating'], weekly_reviews['review_count'])[0]

print(
    "\nSpearman Rank Correlation Coefficients:",
    "Likes - Average Rating: {}".format(spearman_corr_coefficient_likes_rating),
    "Likes - Review Count: {}".format(spearman_corr_coefficient_likes_review_count),
    "Average Rating - Review Count: {}".format(spearman_corr_coefficient_rating_review_count),
    sep="\n",
)

### Monthly

#### Correlation matrix

In [None]:
corr_matrix_monthly = monthly_reviews[['rating', 'likes', 'review_count']].corr()
corr_matrix_monthly

#### Pearson Correlation

In [None]:
# Pairwise Pearson correlations between the monthly aggregates.
pearson_corr_coefficient_likes_rating = monthly_reviews['likes'].corr(monthly_reviews['rating'], method='pearson')
pearson_corr_coefficient_likes_review_count = monthly_reviews['likes'].corr(monthly_reviews['review_count'], method='pearson')
pearson_corr_coefficient_rating_review_count = monthly_reviews['rating'].corr(monthly_reviews['review_count'], method='pearson')

print(
    "\nPearson Correlation Coefficients:",
    "Likes - Average Rating: {}".format(pearson_corr_coefficient_likes_rating),
    "Likes - Review Count: {}".format(pearson_corr_coefficient_likes_review_count),
    "Average Rating - Review Count: {}".format(pearson_corr_coefficient_rating_review_count),
    sep="\n",
)

#### Spearman rank coefficient

In [None]:
from scipy.stats import spearmanr

# Pairwise Spearman rank correlations between the monthly aggregates
# (only the coefficient is kept; the p-value is discarded).
spearman_corr_coefficient_likes_rating = spearmanr(monthly_reviews['likes'], monthly_reviews['rating'])[0]
spearman_corr_coefficient_likes_review_count = spearmanr(monthly_reviews['likes'], monthly_reviews['review_count'])[0]
spearman_corr_coefficient_rating_review_count = spearmanr(monthly_reviews['rating'], monthly_reviews['review_count'])[0]

print(
    "\nSpearman Rank Correlation Coefficients:",
    "Likes - Average Rating: {}".format(spearman_corr_coefficient_likes_rating),
    "Likes - Review Count: {}".format(spearman_corr_coefficient_likes_review_count),
    "Average Rating - Review Count: {}".format(spearman_corr_coefficient_rating_review_count),
    sep="\n",
)

### Conclusions on variable dependence

From the Correlation matrix we can conclude that overall there is a: 
- Negative correlation between review count and rating. 
- Slightly negative correlation between like count and rating.
- Slightly positive correlation between like count and review count. 

## Dataset selection

In [None]:
dataset = monthly_reviews.copy()

## Feature engineering

Creating more relevant features for modeling, in order to achieve better model performance.

### Likes to reviews ratio

In [None]:
dataset['likes_to_reviews_ratio'] = dataset['likes'] / dataset['review_count']

### Likes to rating ratio

In [None]:
# A rating of 0 would turn the ratio into +/-inf, which then dominates the
# min-max normalisation of this feature; treat such rows as missing instead.
dataset['likes_to_rating_ratio'] = dataset['likes'] / dataset['rating'].replace(0, np.nan)

### Popularity score

In [None]:
# Relative contribution of each feature to the popularity score.
# NOTE(review): the weights add up to 1.2, so the score is not bounded by 1;
# confirm whether they were meant to sum to 1.
weights = {
    'rating': 0.2,
    'likes': 0.4,
    'review_count': 0.4,
    'likes_to_reviews_ratio': 0.1,
    'likes_to_rating_ratio': 0.1
}

# The weighted features, in the order given by `weights`.
score_features = dataset[list(weights)]

# normalize features (min-max scaling, column by column)
feature_min = score_features.min()
normalized_features = (score_features - feature_min) / (score_features.max() - feature_min)

# Weighted sum of the normalised features.
dataset['popularity_score'] = sum(
    normalized_features[feature] * weight for feature, weight in weights.items()
)

### New look of dataset with new features

In [None]:
dataset.head()

# Modeling

Preparing dataset for making predictions about the possible popularity of a recipe. 

## Preparing dataset for modeling

### Converting date to day, month and year categorical columns

In [None]:
dataset['last_modified_date'] = pd.to_datetime(dataset['last_modified_date'])

dataset['day'] = dataset['last_modified_date'].dt.day
dataset['month'] = dataset['last_modified_date'].dt.month
dataset['year'] = dataset['last_modified_date'].dt.year

dataset.head()

### Selecting most relevant features for modeling

In [None]:
corr_matrix = dataset[['rating', 'likes', 'review_count', 'likes_to_reviews_ratio', 'likes_to_rating_ratio', 'popularity_score', 
                            'day', 'month', 'year']].corr()
corr_matrix

In [None]:
# Keep only the modelling columns and drop incomplete rows. dropna() returns a
# new frame, so nothing is modified in place on a slice of `dataset`.
modeling_data = dataset[['rating', 'last_modified_date', 'likes', 'review_count', 'likes_to_reviews_ratio', 'likes_to_rating_ratio', 
                        'popularity_score']].dropna()

### Split in train and test sets

In [None]:
# Sort newest-first so that the head of the frame holds the most recent rows.
data_sorted = modeling_data.sort_values(by=['last_modified_date'], ascending=False)

# Fraction of the rows held out for testing.
test_size = 0.2

# Number of rows in the hold-out set (rounded down).
split_index = int(len(data_sorted) * test_size)

# The most recent rows form the test set; everything older is used for training.
test_data = data_sorted.iloc[:split_index]
train_data = data_sorted.iloc[split_index:]

# Put each split back into chronological order.
train_data = train_data.sort_values(by=['last_modified_date'])
test_data = test_data.sort_values(by=['last_modified_date'])

print("Training set size:", len(train_data))
print("Testing set size:", len(test_data))

### Modeling

#### ARIMA

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Train the ARIMA model
p, d, q = 1, 1, 1 
arima_model = ARIMA(train_data['popularity_score'], order=(p, d, q))
arima_model_fit = arima_model.fit()

# Make predictions on the test data (one forecast step per test row)
predictions_arima = arima_model_fit.forecast(steps=len(test_data))

# Display the predictions
print(predictions_arima)

# Calculate Mean Squared Error (MSE), so the result is comparable with the other models
mse_arima = mean_squared_error(test_data['popularity_score'], predictions_arima)
print("Mean Squared Error:", mse_arima)

In [None]:
print(p, d, q)

In [None]:
import matplotlib.pyplot as plt

plt.plot(predictions_arima)
plt.title('Predictions Plot')
plt.xlabel('Index')
plt.ylabel('Predicted Value')
plt.show()

#### Exponential Smoothing

In [None]:
import pandas as pd
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from sklearn.metrics import mean_squared_error

# Train the Simple Exponential Smoothing model
ses_model = SimpleExpSmoothing(train_data['popularity_score'])
# smoothing_level is passed explicitly as 0.2.
# NOTE(review): with optimized=True statsmodels presumably estimates only the
# parameters left unspecified (e.g. the initial level); confirm this is intended.
ses_model_fit = ses_model.fit(smoothing_level=0.2, optimized=True)

# Make predictions on the test data (one forecast step per test row)
predictions_es = ses_model_fit.forecast(steps=len(test_data))

# Display the predictions
print(predictions_es)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(test_data['popularity_score'], predictions_es)
print("Mean Squared Error:", mse)

# Display the model summary
print(ses_model_fit.summary())


In [None]:
import matplotlib.pyplot as plt

plt.plot(predictions_es)
plt.title('Predictions Plot')
plt.xlabel('Index')
plt.ylabel('Predicted Value')
plt.show()

#### Random Forest

##### Choosing a different dataset

While ARIMA and Exponential Smoothing models are able to capture temporal information, ML models like RF cannot interpret a raw timestamp directly. As such, we need to convert the date variable into categorical attributes so that the RF can interpret and analyze this information better. For example, instead of a single timestamp representing the date, we can use three categorical attributes: day, month and year.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

modeling_data = dataset.sort_values(by=['last_modified_date'], ascending=True)
modeling_data = modeling_data[['recipe_id', 'rating', 'likes', 'review_count', 'likes_to_reviews_ratio', 'likes_to_rating_ratio', 'popularity_score', 'day', 'month', 'year']]

##### Feature Engineering

ARIMA and Exponential Smoothing are designed for time series data, but RF is not. To give the RF a temporal structure to learn from, we shift the target variable, popularity_score, so that the features of one time step are paired with the score of a different time step.

In [None]:
# Pair each row's features with the popularity of the SAME recipe in its NEXT
# period. A plain shift(1) over the whole frame would instead hand every row the
# score of the previous row, which usually belongs to another recipe and lies in
# the past. The target is rebuilt from `dataset`, so re-running this cell does
# not shift the column a second time.
next_period_score = (
    dataset.sort_values(by=['last_modified_date'])
    .groupby('recipe_id')['popularity_score']
    .shift(-1)
)
# Rows without a following period for their recipe get NaN.
modeling_data['popularity_score'] = next_period_score.reindex(modeling_data.index)
modeling_data.head()

In [None]:
modeling_data = modeling_data.dropna()

##### Split into train and test data

In [None]:
train_data, test_data = train_test_split(modeling_data, test_size=0.2, shuffle=False)

features = ['rating', 'likes', 'review_count', 'likes_to_reviews_ratio', 'likes_to_rating_ratio', 'day', 'month', 'year']
target = 'popularity_score'

X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

#### Applying the model

In [None]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test data
predictions = rf_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# r2 score: for a regressor, .score() is the coefficient of determination (R^2),
# not a classification accuracy, so it is reported once under its proper name.
r2 = rf_model.score(X_test, y_test)
print("R2 Score:", r2)

# Optional: Print feature importances
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
print(feature_importance_df)

In [None]:
len(modeling_data)