# CAC Project 1 (SNA + RS)

In [None]:
import pandas as pd

members = pd.read_csv('data/pp_members.csv')
recipes = pd.read_csv('data/pp_recipes.csv')
reviews = pd.read_csv('data/pp_reviews.csv')

# Exploratory Data Analysis and Processing

In [None]:
members

In [None]:
recipes

In [None]:
reviews

In [None]:
import ast

def ing_process(x, ing_or_quant):
    """Extract one field from every entry of a serialized ingredients mapping.

    ``x`` is parsed with ``ast.literal_eval`` and must evaluate to a mapping;
    only the FIRST value of that mapping is used, and it is expected to be a
    sequence of indexable entries (presumably ``(ingredient, quantity)``
    pairs -- confirm against the CSV).

    Args:
        x: Text to parse as a Python literal.
        ing_or_quant: Index taken from every entry (the notebook passes 0 for
            the ingredients column and 1 for the quantities column).

    Returns:
        A list with element ``ing_or_quant`` of each entry, or ``None`` when
        ``x`` cannot be parsed or does not evaluate to a non-empty mapping.
        In both failure cases the offending value is printed.
    """
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only parsing failures are handled here; a bare ``except`` would also
        # swallow KeyboardInterrupt / SystemExit.
        print(x)
        return None

    try:
        entries = next(iter(parsed.values()))
    except (AttributeError, TypeError, StopIteration):
        # Not a mapping, or a mapping with no values at all.
        print(parsed)
        return None

    return [entry[ing_or_quant] for entry in entries]

recipes['ingredients_pp'] = recipes['ingredients'].apply(ing_process, args=(0,))

In [None]:
recipes['ingredients_pp']
recipes['ingredients_pp'][0]

In [None]:
recipes['quantities_pp'] = recipes['ingredients'].apply(ing_process, args=(1,))
recipes['quantities_pp']

In [None]:
recipes['ingredients_pp'].apply(type).unique()

recipes[recipes['ingredients_pp'].apply(type) == type(None)]

recipes = recipes.drop(recipes[recipes['ingredients_pp'].apply(type) == type(None)].index)

In [None]:
import itertools 
from collections import defaultdict

# Create edges for recipes, based on ingredients in common as weight
def ing_freq_edge_weight(df,min_weight=0):
    """Build a weighted recipe-recipe edge list from shared ingredients.

    Every ingredient contributes ``1 / frequency`` to the weight of each pair
    of rows that both contain it, where ``frequency`` is the number of times
    the ingredient occurs in ``df['ingredients_pp']``. Widely used
    ingredients therefore add less weight than rare ones.

    Args:
        df: DataFrame with an ``ingredients_pp`` column holding a list of
            ingredients per row. Rows are identified by their index labels.
        min_weight: Pairs whose accumulated weight is below this value are
            left out of the result.

    Returns:
        DataFrame with columns ``from``, ``to`` and ``weight``; each pair
        appears once, with ``from < to``.
    """
    # Ingredient frequencies in one pass over the column (indexing with
    # df.iloc[i] for every ingredient builds a whole row Series each time).
    ingredients_freq = {}
    for ingredient_list in df['ingredients_pp']:
        for ingredient in ingredient_list:
            ingredients_freq[ingredient] = ingredients_freq.get(ingredient, 0) + 1

    print("ing freq", ingredients_freq)

    # One row per (recipe, ingredient) so recipes can be grouped by ingredient
    long_df = df.explode('ingredients_pp')
    graph_structure = defaultdict(dict)

    for ingredient, rows in long_df.groupby('ingredients_pp'):
        # Rarer ingredients contribute a larger weight
        weight = 1 / ingredients_freq[ingredient]

        # All unique pairs of recipes containing this ingredient; the
        # adjacency is stored symmetrically.
        for a, b in itertools.combinations(rows.index.unique(), 2):
            graph_structure[a][b] = graph_structure[a].get(b, 0) + weight
            graph_structure[b][a] = graph_structure[b].get(a, 0) + weight

    # Keep each undirected pair once (a < b) and apply the weight threshold
    index_pairs = [
        (a, b, weight)
        for a, neighbours in graph_structure.items()
        for b, weight in neighbours.items()
        if a < b and weight >= min_weight
    ]

    return pd.DataFrame(index_pairs, columns=['from', 'to', 'weight'])

In [None]:
pd.set_option('display.max_columns', 39)

# get the top 1000 recipes with the most ratings
top_recipes = recipes.sort_values(by='number_of_ratings', ascending=False)[0:1000]

top_recipes.head()

# Building the graph

In [None]:
from igraph import Graph, plot

links = ing_freq_edge_weight(top_recipes)

import matplotlib.pyplot as plt

net = Graph.DataFrame(links, directed=False, use_vids=False,vertices=top_recipes)

In [None]:
weights = net.es['weight']

min_weight = min(weights)
max_weight = max(weights)

print(min_weight, max_weight)

## Clustering

In [None]:
# cluster detection algorithm
clusters = net.community_multilevel(weights=net.es['weight'])

In [None]:
node_titles = net.vs['title']

# Print the clusters with node titles
for i, cluster in enumerate(clusters):
    node_titles_in_cluster = [node_titles[node_id] for node_id in cluster]
    no_of_recipes = len(node_titles_in_cluster)
    if no_of_recipes > 1:
        print(f"[{i}] ({no_of_recipes}) {' || '.join(node_titles_in_cluster)}")

In [None]:
import numpy as np

num_clusters = len(set(clusters.membership))
print(num_clusters)

# One colour per CLUSTER, sampled at evenly spaced positions of the tab10
# colormap (not random). Indexed by cluster id.
vertex_colors = [plt.cm.tab10(i) for i in np.linspace(0, 1, num_clusters)]

# `vertex_color` is a per-VERTEX attribute, so give every vertex the colour of
# the cluster it belongs to. Passing the per-cluster palette directly does not
# colour vertices by membership.
vertex_color_by_membership = [vertex_colors[cluster_id] for cluster_id in clusters.membership]

# Plot the graph with clusters highlighted
plot(net, target="plots/graph_ingredients_clusters.png", vertex_size=5, vertex_color=vertex_color_by_membership, edge_width=0.1, edge_arrow_size=0.4 ,arrow_width=1, bbox=(0,0,1000,1000))

In [None]:
cluster_sizes = [len(cluster) for cluster in clusters]
print(cluster_sizes)

plt.hist(cluster_sizes, bins=range(0, 400, 20))

In [None]:
# Plot each sufficiently large cluster as its own graph
for i, cluster in enumerate(clusters):
    if len(cluster) < 5: # minimum number of nodes in a cluster
        continue

    subgraph = net.subgraph(cluster)

    # A layout computed on the full graph holds one coordinate per vertex of
    # `net`; its rows do not correspond to the vertices of the subgraph, so
    # each subgraph gets its own layout.
    cluster_layout = subgraph.layout_fruchterman_reingold()

    plot(subgraph, target=f"plots/cluster_{i}.png", vertex_size=5, vertex_color=vertex_colors[i], edge_width=0.1, edge_arrow_size=0.4, arrow_width=1, bbox=(0,0,500,500), layout=cluster_layout)

In [None]:
print(f"Number of recipes: {len(top_recipes)}")

def create_reviews_dict(recipes_ids):
    """Select the reviews of the given recipes and group them per recipe.

    Reads the module-level ``reviews`` DataFrame.

    Args:
        recipes_ids: Recipe ids matched against ``reviews['recipe_id']``.

    Returns:
        A tuple ``(top_reviews, reviews_dict)``: the filtered review rows, and
        a dict mapping each recipe id to a list of ``(member_id, rating)``
        tuples.
    """
    selected = reviews['recipe_id'].isin(recipes_ids)
    top_reviews = reviews[selected]

    def _member_rating_pairs(group):
        return list(zip(group['member_id'], group['rating']))

    per_recipe = top_reviews.groupby('recipe_id').apply(_member_rating_pairs)
    return top_reviews, per_recipe.to_dict()
    

top_reviews, reviews_dict = create_reviews_dict(top_recipes['new_recipe_id'])

# Density / sparsity of the member x recipe rating matrix.
# The ratio of observed reviews to all possible (recipe, member) cells is the
# DENSITY; sparsity is the share of empty cells, i.e. its complement.
n_possible_ratings = len(top_recipes) * len(top_reviews['member_id'].unique())
density = len(top_reviews) / n_possible_ratings
sparsity = 1 - density

print(f"Density: {density:.2%}")
print(f"Sparsity: {sparsity:.2%}")

In [None]:
# get largest cluster
top_cluster = max(clusters, key=len)
print(len(top_cluster))

# get the ids in the top cluster
top_cluster_ids = [net.vs[node_id]['new_recipe_id'] for node_id in top_cluster]
top_cluster_ids

# network_reviews, network_reviews_dict = create_reviews_dict(top_recipes['new_recipe_id'])
cluster_reviews, cluster_reviews_dict = create_reviews_dict(top_cluster_ids)

# Recommender Systems

In [None]:
# Create dataframe for the cluster dictionary
def create_ratings_df(ratings_dict):
    """Flatten a per-recipe ratings dict into a long ratings DataFrame.

    Args:
        ratings_dict: Mapping of recipe id to an iterable of
            ``(member_id, rating)`` pairs.

    Returns:
        DataFrame with columns ``member_id``, ``recipe_id`` and ``rating``,
        one row per pair, in the iteration order of the dict.
    """
    flattened = [
        (member_id, recipe_id, rating)
        for recipe_id, member_ratings in ratings_dict.items()
        for member_id, rating in member_ratings
    ]
    return pd.DataFrame(flattened, columns=['member_id', 'recipe_id', 'rating'])

ratings_sample_df = create_ratings_df(cluster_reviews_dict)

### Split the data between train and test

In [None]:
from surprise import Dataset, Reader, SVD, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split

# Find maximum and minimum rating
max_rating = ratings_sample_df['rating'].max()
min_rating = ratings_sample_df['rating'].min()
print(max_rating, min_rating)

reader = Reader(rating_scale=(min_rating, max_rating))

data = Dataset.load_from_df(ratings_sample_df, reader)

print(data.df.head())

trainset, testset = train_test_split(data, test_size=0.2)

### Content-based filtering

Select user from top cluster.

In [None]:
ref_uid = 0

for (member, recipe, rating) in ratings_sample_df.values: 
    if recipe in top_cluster_ids and recipe != 9533:
        ref_uid = member
        break

print(int(ref_uid))

TF-IDF vectors of the recipe descriptions are compared with cosine similarity; similar descriptions are taken as a proxy for similar recipes, so that, given a recipe, other recipes can be suggested.

In [None]:
# Content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

top_recipes_filtered = top_recipes.dropna(subset=['description'])

tfidf_vectorizer = TfidfVectorizer()
item_features = tfidf_vectorizer.fit_transform(top_recipes_filtered['description'])

cosine_sim = cosine_similarity(item_features, item_features)

top_n_indices = np.argsort(-cosine_sim, axis=1)[:, 1:21]

top_n_recipe_ids = top_recipes_filtered['new_recipe_id'].values[top_n_indices]

similar_recipes_dict = {recipe_id: top_n_recipe_ids[i].tolist() for i, recipe_id in enumerate(top_recipes_filtered['new_recipe_id'])}

list(similar_recipes_dict.keys())[0:5]

In [None]:
# Look up the similar recipes stored for a specific recipe. The similarity
# table built above keeps columns 1..20 of each sorted row, so this is a list
# of up to 20 recipe ids (not 10).
recipe_id = 205530
similar_recipe_ids = similar_recipes_dict[recipe_id]
similar_recipe_ids

In [None]:
# get recipe entry for the recipe_id
recipe = top_recipes_filtered[top_recipes_filtered['new_recipe_id'] == recipe_id]
recipe2 = top_recipes_filtered[top_recipes_filtered['new_recipe_id'] == similar_recipe_ids[1]]

recipe

In [None]:
recipe2

In [None]:
# given a user recommend a given number of recipes based on what he liked
def recommend_recipes(user_id, n_recipes=10):
    """Recommend recipes similar to the ones a user rated highly.

    Reads the module-level ``ratings_sample_df``, ``similar_recipes_dict`` and
    ``top_recipes``. A recipe counts as liked when the user's rating is
    greater than 3. For every liked recipe, its similar recipes that the user
    has not rated yet become candidates; collection stops once at least
    ``n_recipes * 20`` candidates were gathered.

    Args:
        user_id: Value matched against ``ratings_sample_df['member_id']``.
        n_recipes: Maximum number of recipes to return.

    Returns:
        Rows of ``top_recipes`` for the candidates, sorted by
        ``average_rating`` in descending order and cut to ``n_recipes`` rows.
    """
    user_ratings = ratings_sample_df[ratings_sample_df['member_id'] == user_id]

    # Set membership instead of scanning an array for every candidate
    rated_recipe_ids = set(user_ratings['recipe_id'])

    top_rated_recipes = user_ratings[user_ratings['rating'] > 3]['recipe_id'].values

    print(user_ratings)

    recommendations = []

    for recipe_id in top_rated_recipes:
        # Recipes without a description are dropped before the similarity
        # table is built, so a liked recipe may have no entry; treat that as
        # "no similar recipes" instead of raising KeyError.
        similar_recipe_ids = similar_recipes_dict.get(recipe_id, [])
        print("AA", similar_recipe_ids)
        recommendations.extend(
            similar_recipe_id
            for similar_recipe_id in similar_recipe_ids
            if similar_recipe_id not in rated_recipe_ids
        )

        if len(recommendations) >= n_recipes * 20:
            break

    # Rank the candidates by their average rating
    candidates = top_recipes[top_recipes['new_recipe_id'].isin(recommendations)]
    recommendations = candidates.sort_values(by='average_rating', ascending=False).head(n_recipes)

    return recommendations


content_normal = recommend_recipes(ref_uid)

### Content-based filtering for the top cluster

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

top_cluster_recipes = top_recipes[top_recipes['new_recipe_id'].isin(top_cluster_ids)]

top_recipes_top_cluster_filtered = top_cluster_recipes.dropna(subset=['description'])

tfidf_vectorizer = TfidfVectorizer()
item_features = tfidf_vectorizer.fit_transform(top_recipes_top_cluster_filtered['description'])

cosine_sim = cosine_similarity(item_features, item_features)

top_n_indices = np.argsort(-cosine_sim, axis=1)[:, 1:21]

top_n_recipe_ids = top_recipes_top_cluster_filtered['new_recipe_id'].values[top_n_indices]

similar_recipes_cluster_dict = {recipe_id: top_n_recipe_ids[i].tolist() for i, recipe_id in enumerate(top_recipes_top_cluster_filtered['new_recipe_id'])}

list(similar_recipes_cluster_dict.keys())[0:5]

In [None]:
def recommend_recipes_top_cluster(user_id, n_recipes=10):
    """Recommend recipes from the top cluster similar to a user's liked ones.

    Reads the module-level ``ratings_sample_df``, ``top_cluster_ids``,
    ``similar_recipes_cluster_dict`` and ``top_recipes``. A recipe counts as
    liked when the user's rating is greater than 3; only liked recipes that
    are in ``top_cluster_ids`` are used as seeds. For every seed, its similar
    recipes that the user has not rated yet become candidates; collection
    stops once at least ``n_recipes * 20`` candidates were gathered.

    Args:
        user_id: Value matched against ``ratings_sample_df['member_id']``.
        n_recipes: Maximum number of recipes to return.

    Returns:
        Rows of ``top_recipes`` for the candidates, sorted by
        ``average_rating`` in descending order and cut to ``n_recipes`` rows.
    """
    user_ratings = ratings_sample_df[ratings_sample_df['member_id'] == user_id]

    # Set membership instead of scanning an array for every candidate
    rated_recipe_ids = set(user_ratings['recipe_id'])

    top_rated_recipes = user_ratings[user_ratings['rating'] > 3]['recipe_id'].values

    # Keep only the liked recipes that belong to the top cluster
    top_rated_recipes = np.intersect1d(top_rated_recipes, top_cluster_ids)

    print(user_ratings)

    recommendations = []

    for recipe_id in top_rated_recipes:
        # Recipes without a description are dropped before the similarity
        # table is built, so a seed may have no entry; treat that as
        # "no similar recipes" instead of raising KeyError.
        similar_recipe_ids = similar_recipes_cluster_dict.get(recipe_id, [])
        print("AA", similar_recipe_ids)
        recommendations.extend(
            similar_recipe_id
            for similar_recipe_id in similar_recipe_ids
            if similar_recipe_id not in rated_recipe_ids
        )

        if len(recommendations) >= n_recipes * 20:
            break

    # get top recipes based on rating
    candidates = top_recipes[top_recipes['new_recipe_id'].isin(recommendations)]
    recommendations = candidates.sort_values(by='average_rating', ascending=False).head(n_recipes)

    return recommendations


content_cluster = recommend_recipes_top_cluster(ref_uid)

Comparison between the cluster-restricted and the normal content-based recommendations.

In [None]:
content_normal[['title', 'description', 'average_rating']]

In [None]:
content_cluster[['title', 'description', 'average_rating']]

#### Applying a model to predict the rating for a recipe

The model below (logistic regression) is attempting to predict the average ratings of recipes based on their textual descriptions, and the accuracy metric indicates how well the model fits the data.

In [None]:
# NOTE(review): `predicted_ratings` is not assigned anywhere in the visible
# notebook -- presumably the cell that fits the model described above was
# removed. This cell raises NameError until that cell is restored; confirm.
top_recipes_filtered['predicted_rating'] = predicted_ratings

# Ten recipes with the highest predicted rating
top_n_recommendations = top_recipes_filtered.sort_values(by='predicted_rating', ascending=False)[0:10]

top_n_recommendations[['recipe_id', 'title', 'predicted_rating']]

In [None]:
recipe_id = 189335
rating = top_n_recommendations[top_n_recommendations['recipe_id'] == recipe_id]['average_rating']
rating

### Apply Algorithms

In [None]:
from surprise import KNNWithMeans, SVD
from surprise import accuracy

def calculate_precision_recall(predictions, threshold=4):
    """Compute precision, recall and F1 of thresholded rating predictions.

    A rating (true or estimated) counts as positive when it is greater than
    or equal to ``threshold``.

    Args:
        predictions: Iterable of 5-tuples whose third element is the true
            rating and whose fourth element is the estimated rating.
        threshold: Minimum rating regarded as positive.

    Returns:
        Tuple ``(precision, recall, f1)``. Any metric whose denominator is
        zero is reported as 0.
    """
    true_pos = 0
    false_pos = 0
    false_neg = 0

    for _, _, actual, estimated, _ in predictions:
        predicted_positive = estimated >= threshold
        actually_positive = actual >= threshold

        if predicted_positive and actually_positive:
            true_pos += 1
        elif predicted_positive:
            false_pos += 1
        elif actually_positive:
            false_neg += 1

    predicted_total = true_pos + false_pos
    relevant_total = true_pos + false_neg

    precision = true_pos / predicted_total if predicted_total > 0 else 0
    recall = true_pos / relevant_total if relevant_total > 0 else 0

    combined = precision + recall
    f1 = 2 * (precision * recall) / combined if combined > 0 else 0

    return precision, recall, f1

def evaluate_algorithm(algo, trainset, testset):
    """Fit an algorithm on the train set and score it on the test set.

    Args:
        algo: Object exposing ``fit(trainset)`` and ``test(testset)``.
        trainset: Data passed to ``algo.fit``.
        testset: Data passed to ``algo.test``.

    Returns:
        Tuple ``(algo, rmse, precision, recall, f1)``: the fitted algorithm,
        the value of ``accuracy.rmse`` on the test predictions, and the
        metrics from ``calculate_precision_recall`` on the same predictions.
    """
    algo.fit(trainset)
    test_predictions = algo.test(testset)
    rmse = accuracy.rmse(test_predictions)
    metrics = calculate_precision_recall(test_predictions)
    return (algo, rmse, *metrics)

### Item-based collaborative filtering

In [None]:
# Item based collaborative filtering
algo_svd = SVD()
algo_knn_item = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False})
algo_knn_item2 = KNNWithMeans(sim_options={'name': 'pearson', 'user_based': False})

model_svd, rmse_svd, precision_svd, recall_svd, f1_svd = evaluate_algorithm(algo_svd, trainset, testset)
model_knn, rmse_knn, precision_knn, recall_knn, f1_knn = evaluate_algorithm(algo_knn_item, trainset, testset)   
model_knn2, rmse_knn2, precision_knn2, recall_knn2, f1_knn2 = evaluate_algorithm(algo_knn_item2, trainset, testset)

print("--------------------")
print(f"SVD RMSE: {rmse_svd}")
print(f"SVD Precision: {precision_svd}")
print(f"SVD Recall: {recall_svd}")
print(f"SVD F1: {f1_svd}")
print("--------------------")
print(f"KNN RMSE: {rmse_knn}")
print(f"KNN Precision: {precision_knn}")
print(f"KNN Recall: {recall_knn}")
print(f"KNN F1: {f1_knn}")
print("--------------------")
print(f"KNN2 RMSE: {rmse_knn2}")
print(f"KNN2 Precision: {precision_knn2}")
print(f"KNN2 Recall: {recall_knn2}")
print(f"KNN2 F1: {f1_knn2}")

### User-based collaborative filtering

In [None]:
# User based collaborative filtering
algo_knn_user = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True})
algo_knn_user2 = KNNWithMeans(sim_options={'name': 'pearson', 'user_based': True})

model_knn_user, rmse_knn_user, precision_knn_user, recall_knn_user, f1_knn_user = evaluate_algorithm(algo_knn_user, trainset, testset)
model_knn_user2, rmse_knn_user2, precision_knn_user2, recall_knn_user2, f1_knn_user2 = evaluate_algorithm(algo_knn_user2, trainset, testset)

print("--------------------")
print(f"KNN User RMSE: {rmse_knn_user}")
print(f"KNN User Precision: {precision_knn_user}")
print(f"KNN User Recall: {recall_knn_user}")
print(f"KNN User F1: {f1_knn_user}")
print("--------------------")
print(f"KNN User2 RMSE: {rmse_knn_user2}")
print(f"KNN User2 Precision: {precision_knn_user2}")
print(f"KNN User2 Recall: {recall_knn_user2}")
print(f"KNN User2 F1: {f1_knn_user2}")

### Predict for a user

Predict ratings for user-item pairs using a given model. If a user-item pair has a known rating in the input data, it retains that rating, otherwise, it predicts a rating using the model.

In [None]:
users_seen = trainset.all_users()
items_seen = trainset.all_items()

raw_users_seen_ids = [trainset.to_raw_uid(uid) for uid in users_seen]
raw_items_seen_ids = [trainset.to_raw_iid(iid) for iid in items_seen]

user_test_id = 1

# Check if a user is in the training set
if user_test_id in raw_users_seen_ids:
    print(f"User {user_test_id} is in the training set")
else:
    print(f"User {user_test_id} is not in the training set")

    user_test_id = min(raw_users_seen_ids, key=lambda x: np.mean([r for (_, r) in trainset.ur[trainset.to_inner_uid(x)]]))    

    print(f" selecting user {user_test_id} instead")


def _write_predictions_csv(path, predictions):
    """Write one ``user_id,item_id,rating`` line per entry (no header row)."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{user_id},{item_id},{rating}\n"
            for (user_id, item_id), rating in predictions.items()
        )


def predict_ratings(model, raw_users, raw_items, data,
                    known_path='predictions.csv',
                    final_path='final_predictions.csv'):
    """Build a rating for every user/item pair, predicting the unknown ones.

    Ratings present in ``data`` are kept as they are; every other
    ``(user, item)`` combination of ``raw_users`` x ``raw_items`` gets
    ``model.predict(user, item).est``.

    Args:
        model: Object whose ``predict(user_id, item_id)`` returns an object
            with an ``est`` attribute.
        raw_users: Iterable of user ids.
        raw_items: Iterable of item ids; it is iterated once per user, so it
            must be re-iterable (e.g. a list).
        data: DataFrame whose rows unpack to ``(user_id, item_id, rating)``.
        known_path: File that receives the known ratings only.
        final_path: File that receives known plus predicted ratings.

    Returns:
        Dict mapping ``(user_id, item_id)`` to a rating.

    Both files are overwritten on every call; pass distinct paths to keep the
    output of several models.
    """
    # Start from all ratings that were previously known
    predictions = {
        (user_id, item_id): rating
        for user_id, item_id, rating in data.itertuples(index=False)
    }
    _write_predictions_csv(known_path, predictions)

    # Fill in every pair that has no known rating
    for user_id in raw_users:
        for item_id in raw_items:
            if (user_id, item_id) not in predictions:
                predictions[(user_id, item_id)] = model.predict(user_id, item_id).est

    _write_predictions_csv(final_path, predictions)

    return predictions

In [None]:
# Item based
pred_svd = predict_ratings(model_svd, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)
pred_knn = predict_ratings(model_knn, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)
pred_knn2 = predict_ratings(model_knn2, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)

In [None]:
# User based
pred_knn_user = predict_ratings(model_knn_user, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)
pred_knn_user2 = predict_ratings(model_knn_user2, raw_users_seen_ids, raw_items_seen_ids, ratings_sample_df)

### Get Top-N Recommendations

In [None]:
# Get top 5 recommendations for a user from the ones he hasn't rated yet
def get_top_n_recommendations(predictions, user_id, data, n=5):
    """Return the ``n`` best-predicted items a user has not rated yet.

    Args:
        predictions: Dict mapping ``(user_id, item_id)`` to a rating.
        user_id: User to recommend for.
        data: DataFrame with ``member_id`` and ``recipe_id`` columns holding
            the known ratings.
        n: Maximum number of recommendations.

    Returns:
        List of ``(item_id, rating)`` tuples sorted by rating, highest first,
        restricted to items of ``data`` that ``user_id`` has no row for.
    """
    is_user = data['member_id'] == user_id
    items_rated = set(data.loc[is_user, 'recipe_id'])

    # "Rated by some other member" is not the same as "not rated by this
    # member": an item rated by both would still be recommended. Take every
    # item in the data and remove the ones this user already rated.
    items_not_rated = set(data['recipe_id']) - items_rated

    print(len(items_not_rated))
    print(len(items_rated))

    user_predictions = [
        (iid, pred)
        for (uid, iid), pred in predictions.items()
        if uid == user_id and iid in items_not_rated
    ]

    print(len(user_predictions))
    user_predictions.sort(key=lambda x: x[1], reverse=True)
    return user_predictions[:n]

In [None]:
top_n_svd = get_top_n_recommendations(pred_svd, user_test_id, ratings_sample_df,20)
top_n_knn = get_top_n_recommendations(pred_knn, user_test_id, ratings_sample_df,20)
top_n_knn2 = get_top_n_recommendations(pred_knn2, user_test_id, ratings_sample_df,20)

print("User ID:", user_test_id)
print("--------------------")
print("SVD: ", top_n_svd)
print("KNN: ", top_n_knn)
print("KNN2: ", top_n_knn2)

In [None]:
top_n_knn_user = get_top_n_recommendations(pred_knn_user, user_test_id, ratings_sample_df,20)
top_n_knn_user2 = get_top_n_recommendations(pred_knn_user2, user_test_id, ratings_sample_df,20)

print("--------------------")
print("KNN User: ", top_n_knn_user)
print("KNN User2: ", top_n_knn_user2)

In [None]:
# Show ratings_sample_df ratings distribution
plt.hist(ratings_sample_df['rating'], bins=20)

#### Results

In [None]:
results_item_svd = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_svd])][['title', 'description', 'average_rating']]
results_item_svd

In [None]:
results_item_knn = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn])][['title', 'description', 'average_rating']]
results_item_knn

In [None]:
results_item_knn2 = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn2])][['title', 'description', 'average_rating']]
results_item_knn2

In [None]:
results_user_knn = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn_user])][['title', 'description', 'average_rating']]
results_user_knn

In [None]:
results_user_knn2 = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in top_n_knn_user2])][['title', 'description', 'average_rating']]
results_user_knn2

# Metrics

In [None]:
uid = 498271
user_ratings = ratings_sample_df[ratings_sample_df['member_id'] == uid]
user_ratings

## Precision at k

In [None]:
recipes_liked = top_recipes[top_recipes['new_recipe_id'].isin(user_ratings['recipe_id'])][['title', 'ingredients', 'average_rating']]
recipes_liked

### Item-based

In [None]:
# Top-20 lists for `uid` from the three item-based prediction dicts.
# NOTE(review): `pred_knn_user` and `pred_knn_user2` are rebound here. Earlier
# cells assign the user-based prediction dicts returned by predict_ratings to
# these names and pass them to get_top_n_recommendations; after this cell they
# hold top-N lists instead, so re-running those earlier cells out of order
# would break. Consider distinct names.
pred_svd_user = get_top_n_recommendations(pred_svd, uid, ratings_sample_df, 20)
pred_knn_user = get_top_n_recommendations(pred_knn, uid, ratings_sample_df, 20)
pred_knn_user2 = get_top_n_recommendations(pred_knn2, uid, ratings_sample_df, 20)
pred_svd_user

We identified the following ingredients as relevant: 
- chicken 
- tortilla
- chips
- tomatoes
- cheese
- spaghetti pasta
- pepper
- tomato sauce
- ground beef
- green beans

#### SVD

In [None]:
recipes_svd = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in pred_svd_user])].sort_values(by='average_rating', ascending=False)
recipes_svd[['title', 'ingredients', 'average_rating']]

In [None]:
recipes_svd['relevant'] = [1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,0,1,0,1]
recipes_svd[['title', 'relevant']]

#### KNN Cosine

In [None]:
recipes_knn = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in pred_knn_user])].sort_values(by='average_rating', ascending=False)
recipes_knn[['title', 'ingredients', 'average_rating']]

#### KNN Pearson

In [None]:
recipes_knn2 = top_recipes[top_recipes['new_recipe_id'].isin([x[0] for x in pred_knn_user2])].sort_values(by='average_rating', ascending=False)
recipes_knn2[['title', 'ingredients', 'average_rating']]

# NLP

# Time-Series Analysis

In [None]:
cluster_reviews.head()

In [None]:
cluster_reviews.shape

In [None]:
dates = sorted(list(cluster_reviews['last_modified_date']))
dates

In [None]:
def reviews_until_date(reviews, date):
    """Count the reviews whose ``last_modified_date`` is <= ``date``."""
    on_or_before = reviews['last_modified_date'] <= date
    return len(reviews.loc[on_or_before])

reviews_until_date(cluster_reviews, dates[1])


In [None]:
review_count = pd.DataFrame([[date, reviews_until_date(cluster_reviews, date)] for date in dates], columns=['Date', 'Count']).drop_duplicates()
review_count

In [None]:
def reviews_between_dates(reviews, initial_date, end_date):
    """Count reviews with ``initial_date < last_modified_date <= end_date``.

    The lower bound is exclusive and the upper bound inclusive.
    """
    stamps = reviews['last_modified_date']
    after_start = stamps > initial_date
    up_to_end = stamps <= end_date
    return reviews[after_start & up_to_end].shape[0]


def bin_dates(dataset, bin_size=20):
    """Split the span of ``last_modified_date`` into equally wide bins.

    Args:
        dataset: DataFrame with a ``last_modified_date`` column.
        bin_size: Number of bins (not the width of a bin).

    Returns:
        List of ``bin_size + 1`` edges. The first ``bin_size`` entries are
        ``'%Y-%m-%d'`` strings; the last entry is the column's maximum value
        exactly as stored, without reformatting.
    """
    stamps = dataset['last_modified_date']
    first_date, last_date = stamps.min(), stamps.max()

    bin_range = (pd.to_datetime(last_date) - pd.to_datetime(first_date)) / bin_size

    edges = [
        (pd.to_datetime(first_date) + bin_range * k).strftime('%Y-%m-%d')
        for k in range(bin_size)
    ]
    edges.append(last_date)

    return edges


dates = bin_dates(cluster_reviews)

binned_reviews = []
for i in range(len(dates) - 1):
    initial_date = dates[i]
    end_date = dates[i + 1]
    reviews_count = reviews_between_dates(cluster_reviews, initial_date, end_date)
    binned_reviews.append({'start_date': initial_date, 'end_date': end_date, 'review_count': reviews_count})

binned_reviews_df = pd.DataFrame(binned_reviews)
print(binned_reviews_df)

In [None]:
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Count', dpi=100):
    """Draw ``y`` against ``x`` as a red line on a new 15x4 figure and show it.

    Args:
        df: Not used by the function body.
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Axes title.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        dpi: Resolution passed to ``plt.figure``.
    """
    plt.figure(figsize=(15, 4), dpi=dpi)
    plt.plot(x, y, color='tab:red')
    plt.xticks(rotation=70)

    axes = plt.gca()
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)

    plt.show()

plot_df(binned_reviews_df, x=binned_reviews_df['start_date'], y=binned_reviews_df['review_count'], title='Temporal Analysis of Reviews')

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse

def plot_decompositions(df, label, period=10):
    """Plot the multiplicative and the additive decomposition of a column.

    Args:
        df: DataFrame holding the series.
        label: Name of the column passed to ``seasonal_decompose``.
        period: Period passed to ``seasonal_decompose`` for both models.
    """
    series = df[label]

    # Both models are fitted before anything is drawn, multiplicative first.
    fitted = [
        (seasonal_decompose(series, model=kind, period=period), heading)
        for kind, heading in (
            ('multiplicative', 'Multiplicative Decomposition'),
            ('additive', 'Additive Decomposition'),
        )
    ]

    # Plot
    plt.rcParams.update({'figure.figsize': (16,12)})
    for decomposition, heading in fitted:
        decomposition.plot().suptitle(heading, fontsize=16)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    plt.show()

In [None]:
plot_decompositions(binned_reviews_df, 'review_count', period=10)

In [None]:
plot_decompositions(review_count, 'Count', period=len(review_count) // 2)

In [None]:
def average_rating_between_dates(reviews, initial_date, end_date):
    """Mean ``rating`` of reviews with ``initial_date < date <= end_date``.

    ``date`` is the ``last_modified_date`` column. The result is NaN when no
    review falls inside the window.
    """
    stamps = reviews['last_modified_date']
    in_window = (stamps > initial_date) & (stamps <= end_date)
    return reviews.loc[in_window, 'rating'].mean()

def average_rating_until_date(reviews, date):
    """Mean ``rating`` of the reviews whose ``last_modified_date`` is <= ``date``.

    The result is NaN when no review qualifies.
    """
    on_or_before = reviews['last_modified_date'] <= date
    return reviews.loc[on_or_before, 'rating'].mean()

def bin_dates(dataset, bin_size=20):
    """Return the edges of ``bin_size`` equal-width date bins.

    The bins cover the range between the minimum and the maximum of
    ``dataset['last_modified_date']``. All edges except the last are
    formatted as ``'%Y-%m-%d'``; the last edge is the maximum value itself,
    appended without reformatting.
    """
    column = dataset['last_modified_date']
    earliest = column.min()
    latest = column.max()

    bin_range = (pd.to_datetime(latest) - pd.to_datetime(earliest)) / bin_size

    boundaries = []
    for offset in range(bin_size):
        boundary = pd.to_datetime(earliest) + bin_range * offset
        boundaries.append(boundary.strftime('%Y-%m-%d'))

    return boundaries + [latest]

dates = bin_dates(cluster_reviews, 50)

binned_ratings = []

for i in range(len(dates) - 1):
    initial_date = dates[i]
    end_date = dates[i + 1]
    average_rating = average_rating_between_dates(cluster_reviews, initial_date, end_date)
    binned_ratings.append({'start_date': initial_date, 'end_date': end_date, 'average_rating': average_rating})

binned_ratings_df = pd.DataFrame(binned_ratings)
print(binned_ratings_df)

In [None]:
binned_ratings_df['average_rating'].isna().sum()

In [None]:
plot_decompositions(binned_ratings_df, 'average_rating', period=len(binned_ratings_df) // 2)