# Pizza_recommendation_simple.ipynb file for Cart-Based Pizza Recommendation System
## Ben Pfeffer, Andrew Anctil, Bradon Wetzel
## CIS 530 - Advanced Data Mining - Professor Thomas Gyeera

### Import libraries

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from collections import Counter
from Levenshtein import distance as levenshtein_distance
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error

  from pandas_profiling import ProfileReport


### Load data (downloaded from kaggle) and perform data engineering

In [2]:
# Import datasets
od = pd.read_csv("pizza_sales/order_details.csv")
o = pd.read_csv("pizza_sales/orders.csv")
pt = pd.read_csv("pizza_sales/pizza_types.csv", encoding= 'unicode_escape') # codec decode error
p = pd.read_csv("pizza_sales/pizzas.csv")

In [3]:
# Join the datasets
ood = o.set_index('order_id').join(od.set_index('order_id')).reset_index()

In [4]:
ppt = p.set_index('pizza_type_id').join(pt.set_index('pizza_type_id')).reset_index()

In [5]:
df = ood.set_index('pizza_id').join(ppt.set_index('pizza_id')).reset_index().sort_values(by="order_details_id").reset_index(drop=True)

In [6]:
# view the data using the describe() function
df.describe()

Unnamed: 0,order_id,order_details_id,quantity,price
count,48620.0,48620.0,48620.0,48620.0
mean,10701.479761,24310.5,1.019622,16.494132
std,6180.11977,14035.529381,0.143077,3.621789
min,1.0,1.0,1.0,9.75
25%,5337.0,12155.75,1.0,12.75
50%,10682.5,24310.5,1.0,16.5
75%,16100.0,36465.25,1.0,20.25
max,21350.0,48620.0,4.0,35.95


In [7]:
# Get a profile report (pizzaData.html output)
prof = ProfileReport(df)
prof.to_file(output_file='pizzaData.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return func(*args, **kwargs)


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Group by the order and count pizzas per order
groups = df.groupby("order_id").agg({"pizza_type_id":"count"}).reset_index()
# Select where more than 1 pizza
multiOrders = groups[groups.pizza_type_id>1].order_id.values

In [9]:
# Create the multi-order dataframe
moDf = df[df.order_id.isin(multiOrders)].reset_index(drop=True)

## Ways to recommend:
### 1) User Similarity (similar prices and times)
### 2) Association (similar orders)
### 3) Item-item similarity * noted as best way from class notes
### 4) Popularity

### From class notes, best recommendation type is item to item
### In our case, this means pizzas that go with pizzas, not order

In [10]:
# Simplify the data
simple = moDf[["order_id", "quantity", "pizza_type_id"]]
copyDf = simple.copy()
# Set all quantities to 1
copyDf['quantity'] = 1

In [11]:
# Drop duplicates
copyDf = copyDf.drop_duplicates().reset_index(drop=True)

In [12]:
# Pivot the table to get the item to item dataframe that we need
itemitem = copyDf.pivot(index='pizza_type_id', columns='order_id', values='quantity').fillna(0)

### Determine the best metric to use with Nearest Neighbors

In [66]:
# Metrics found from https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html
metric_list = ['cityblock',
 'cosine',
 'euclidean',
 'l2',
 'l1',
 'manhattan',
 'nan_euclidean']

### Use training, validation and testing set

In [29]:
# Transpose data to extract list of pizzas
ii = itemitem.transpose().reset_index()

In [30]:
# Get the list of pizzas
pizza_types = itemitem.index.to_list()

In [35]:
# For all metrics:
oo_jacs_val = []
oo_cos_val = []
oo_lev_val = []
oo_rmse_val = []
oo_mae_val = []

oo_jacs = []
oo_cos = []
oo_lev = []
oo_rmse = []
oo_mae = []

# Iterate through metric list for NN
for metric in metric_list:
    print(metric)

    # 100 random splits, independent on previous splits
    plot_jacs_val = []
    plot_cos_val = []
    plot_lev_val = []
    plot_rmse_val = []
    plot_mae_val = []

    plot_jacs = []
    plot_cos = []
    plot_lev = []
    plot_rmse = []
    plot_mae = []

    # Iterate through 100 random trials of sampling
    for i in range(100):
        if(i%20==0):
            print(i)
        # Select the random sampled training set, validation set, and testing set
        train_size = 0.70
        val_size = 0.15
        ii_train = ii.sample(n=int(np.floor(len(ii)*train_size)))
        remain = ii[~ii['order_id'].isin(ii_train["order_id"])]
        ii_val = remain.sample(n=int(np.floor(len(remain)*val_size)))
        ii_test = remain[~remain['order_id'].isin(ii_val["order_id"])]

        # Reformat the data
        ii_train = ii_train.set_index('order_id').transpose()
        ii_val = ii_val.set_index('order_id').transpose()
        ii_test = ii_test.set_index('order_id').transpose()

        # Fit the nearest neighbors model using cosine similarity on the training set
        knn = NearestNeighbors(metric=metric, algorithm='brute')
        knn.fit(ii_train.values)
        distances_train, indices_train = knn.kneighbors(ii_train.values, n_neighbors=len(ii_train))


        # Make n recommendations and calculate metrics
        ns = np.arange(1,15)
        all_mins = [] # store the minimum similarities
        
        # Store the metrics for each n
        outer_jacs_val = []
        outer_cos_val = []
        outer_lev_val = []
        outer_rmse_val = []
        outer_mae_val = []
        outer_jacs = []
        outer_cos = []
        outer_lev = []
        outer_rmse = []
        outer_mae = []
        # Iterate through all ns
        for n in ns:
            all_jacs_val = [] # jaccard similarity
            all_cos_val = [] # cosine similarity
            all_lev_val = [] # lev distance = edit distance
            all_rmse_val = [] # RMSE
            all_mae_val = [] # mae
            all_jacs = [] # jaccard similarity
            all_cos = [] # cosine similarity
            all_lev = [] # lev distance = edit distance
            all_rmse = [] # RMSE
            all_mae = [] # mae
            idx = 0
            # Iterate through all pizza types
            for i in pizza_types:
                # Store the orders of current pizza as dataframe
                pizza_df_val = ii_val.transpose()[ii_val.transpose()[i]==1]
                pizza_df = ii_test.transpose()[ii_test.transpose()[i]==1]

                # Find the counts of true orders
                counts_val = []
                for i in pizza_df_val.columns:
                    counts_val.append(sum(pizza_df_val[i]))
                counts = []
                for i in pizza_df.columns:
                    counts.append(sum(pizza_df[i]))

                # Create dataframe to store this info
                result_df_val = pd.DataFrame()
                result_df_val["Pizza"] = pizza_df_val.columns
                result_df_val["Count"] = counts_val
                # Create dataframe to store this info
                result_df = pd.DataFrame()
                result_df["Pizza"] = pizza_df.columns
                result_df["Count"] = counts

                # Extract the model results
                curr = indices_train[idx]
                dist = distances_train[idx]
                idx += 1

                # Extract predicted vs true values
                recPizzas = [pizza_types[j] for j in curr[1:n+1]]
                truePizzas_val = list(result_df_val.sort_values("Count", ascending=False).Pizza.iloc[:n].values)
                truePizzas = list(result_df.sort_values("Count", ascending=False).Pizza.iloc[:n].values)

                # Get jaccard similarity
                # For val set
                correct_val = [i for i in recPizzas if i in truePizzas_val]
                combined = recPizzas.copy()
                combined.extend(truePizzas_val)
                union_val = set(combined)
                jac = len(correct_val)/len(union_val)
                all_jacs_val.append(jac)

                # For test set
                correct = [i for i in recPizzas if i in truePizzas]
                combined = recPizzas.copy()
                combined.extend(truePizzas)
                union = set(combined)
                jac = len(correct)/len(union)
                all_jacs.append(jac)


                # Get cosine similarity of results
                # Processing assistance found: https://stackoverflow.com/questions/28819272/python-how-to-calculate-the-cosine-similarity-of-two-word-lists
                # count word occurrences
                a_vals = Counter(recPizzas)
                b_vals_val = Counter(truePizzas_val)
                b_vals = Counter(truePizzas)

                # convert to word-vectors - val
                words  = list(a_vals.keys() | b_vals_val.keys())
                a_vect = [a_vals.get(word, 0) for word in words]
                b_vect = [b_vals_val.get(word, 0) for word in words]  
                cos = cosine_similarity(np.array(a_vect).reshape(1, -1), np.array(b_vect).reshape(1, -1))
                all_cos_val.append(cos)
                
                # For test set
                words  = list(a_vals.keys() | b_vals.keys())
                a_vect = [a_vals.get(word, 0) for word in words]
                b_vect = [b_vals.get(word, 0) for word in words]  
                cos = cosine_similarity(np.array(a_vect).reshape(1, -1), np.array(b_vect).reshape(1, -1))
                all_cos.append(cos)

                # Levenshtein distance
                all_lev_val.append(levenshtein_distance(truePizzas_val, recPizzas))
                all_lev.append(levenshtein_distance(truePizzas, recPizzas))

                # Ranked rmse
                # For val set
                # Get ascending counts as true ranks
                allTruePizzas = list(result_df_val.sort_values("Count", ascending=False).Pizza.values)
                # Store the union
                compareDf_val = pd.DataFrame()
                compareDf_val["Pizza"] = list(union_val)
                rec_ranks = []
                true_ranks = []
                # Iterate through the union and append ranks
                for pizza in union_val:
                    try:
                        rec_ranks.append(recPizzas.index(pizza)+1)
                    except:
                        rec_ranks.append(np.nan)
                    try:
                        true_ranks.append(allTruePizzas.index(pizza))
                    except:
                        true_ranks.append(np.nan)
                # Store the ranks
                compareDf_val["RecRank"] = rec_ranks
                compareDf_val["TrueRank"] = true_ranks
                # Drop when not applicable
                compareDf_val = compareDf_val.dropna()
                # Calculate RMSE
                rmse = sqrt(mean_squared_error(compareDf_val.TrueRank, compareDf_val.RecRank))
                all_rmse_val.append(rmse)

                # For test set
                # Get ascending counts as true ranks
                allTruePizzas = list(result_df.sort_values("Count", ascending=False).Pizza.values)
                # Store the union
                compareDf = pd.DataFrame()
                compareDf["Pizza"] = list(union)
                rec_ranks = []
                true_ranks = []
                # Iterate through the union and append ranks
                for pizza in union:
                    try:
                        rec_ranks.append(recPizzas.index(pizza)+1)
                    except:
                        rec_ranks.append(np.nan)
                    try:
                        true_ranks.append(allTruePizzas.index(pizza))
                    except:
                        true_ranks.append(np.nan)
                # Store the ranks
                compareDf["RecRank"] = rec_ranks
                compareDf["TrueRank"] = true_ranks
                # Drop when not applicable
                compareDf = compareDf.dropna()
                # Calculate RMSE
                rmse = sqrt(mean_squared_error(compareDf.TrueRank, compareDf.RecRank))
                all_rmse.append(rmse)

                # Calculate MAE score for val and test
                mae = mean_absolute_error(compareDf.TrueRank, compareDf.RecRank)
                all_mae.append(mae)
                mae = mean_absolute_error(compareDf_val.TrueRank, compareDf_val.RecRank)
                all_mae_val.append(mae)

            # Store results for each n
            outer_jacs_val.append(np.array(all_jacs_val).mean())
            outer_cos_val.append(np.array(all_cos_val).mean())
            outer_lev_val.append(np.array(all_lev_val).mean())
            outer_rmse_val.append(np.array(all_rmse_val).mean())
            outer_mae_val.append(np.array(all_mae_val).mean())
            outer_jacs.append(np.array(all_jacs).mean())
            outer_cos.append(np.array(all_cos).mean())
            outer_lev.append(np.array(all_lev).mean())
            outer_rmse.append(np.array(all_rmse).mean())
            outer_mae.append(np.array(all_mae).mean())

        # Store for every sample
        plot_jacs_val.append(np.array([np.array(outer_jacs_val)]))
        plot_cos_val.append(np.array([np.array(outer_cos_val)]))
        plot_lev_val.append(np.array([np.array(outer_lev_val)]))
        plot_rmse_val.append(np.array([np.array(outer_rmse_val)]))
        plot_mae_val.append(np.array([np.array(outer_mae_val)]))
        plot_jacs.append(np.array([np.array(outer_jacs)]))
        plot_cos.append(np.array([np.array(outer_cos)]))
        plot_lev.append(np.array([np.array(outer_lev)]))
        plot_rmse.append(np.array([np.array(outer_rmse)]))
        plot_mae.append(np.array([np.array(outer_mae)]))

    # Calculate means
    # Can perform mean of means here because we care about the inner mean value, and want the mean of it.
    output_data_jacs_val = np.array(plot_jacs_val).mean(axis=0)
    output_data_cos_val = np.array(plot_cos_val).mean(axis=0)
    output_data_lev_val = np.array(plot_lev_val).mean(axis=0)
    output_data_rmse_val = np.array(plot_rmse_val).mean(axis=0)
    output_data_mae_val = np.array(plot_mae_val).mean(axis=0)
    output_data_jacs = np.array(plot_jacs).mean(axis=0)
    output_data_cos = np.array(plot_cos).mean(axis=0)
    output_data_lev = np.array(plot_lev).mean(axis=0)
    output_data_rmse = np.array(plot_rmse).mean(axis=0)
    output_data_mae = np.array(plot_mae).mean(axis=0)

    # Store for every NN metric
    oo_jacs_val.append(output_data_jacs_val)
    oo_cos_val.append(output_data_cos_val)
    oo_lev_val.append(output_data_lev_val)
    oo_rmse_val.append(output_data_rmse_val)
    oo_mae_val.append(output_data_mae_val)
    oo_jacs.append(output_data_jacs)
    oo_cos.append(output_data_cos)
    oo_lev.append(output_data_lev)
    oo_rmse.append(output_data_rmse)
    oo_mae.append(output_data_mae)


cityblock
0
20
40
60
80
cosine
0
20
40
60
80
euclidean
0
20
40
60
80
l2
0
20
40
60
80
l1
0
20
40
60
80
manhattan
0
20
40
60
80
nan_euclidean
0
20
40
60
80


In [77]:
# Iterate through the NN metrics
metric_list_inner = [oo_jacs_val, oo_cos_val, oo_lev_val, oo_rmse_val, oo_mae_val]
print("ON VALIDATION SET, ")
print("Best Metrics for jaccard, cosine, levenshtein, rmse, and mae, respectively")

for j, metric in enumerate(metric_list_inner):
    print("______________________")
    if(j<2): # for similarities
        mm = 1# 0 = min, 1 = max
    else:
        mm = 0
    all_vals = []
    for i in range(7):
        # Show best NN distance metric for each recorded metric on the validation set
        # Create list of best values on any N for the given metric
        if(mm==1):
            all_vals.append(max(metric[i][0]))
        else:
            all_vals.append(min(metric[i][0]))
    if(mm==1):
        print(metric_list[np.array(all_vals).argmax()])
        print("Ordered best to worst:")
        print([metric_list[k] for k in np.array(all_vals).argsort()[-7:][::-1]])
    else:
        print(metric_list[np.array(all_vals).argmin()])
        print("Ordered best to worst:")
        print([metric_list[k] for k in np.array(all_vals).argsort()[-7:]])
print()
print("Cosine similarity is best in 4/5 metrics (Levenshtein only difference). So, cosine similarity is best to use for Nearest Neighbors")

ON VALIDATION SET, 
Best Metrics for jaccard, cosine, levenshtein, rmse, and mae, respectively
______________________
cosine
Ordered best to worst:
['cosine', 'manhattan', 'l2', 'nan_euclidean', 'cityblock', 'l1', 'euclidean']
______________________
cosine
Ordered best to worst:
['cosine', 'manhattan', 'l2', 'nan_euclidean', 'cityblock', 'l1', 'euclidean']
______________________
cityblock
Ordered best to worst:
['cityblock', 'cosine', 'euclidean', 'l2', 'l1', 'manhattan', 'nan_euclidean']
______________________
cosine
Ordered best to worst:
['cosine', 'cityblock', 'l2', 'manhattan', 'euclidean', 'nan_euclidean', 'l1']
______________________
cosine
Ordered best to worst:
['cosine', 'cityblock', 'manhattan', 'nan_euclidean', 'l2', 'euclidean', 'l1']

Cosine similarity is best in 4/5 metrics (Levenshtein only difference). So, cosine similarity is best to use for Nearest Neighbors


### Cosine still minimizes MAE at 5, which tells us to use cosine as a metric and 5 as a number of recommendations

In [37]:
# Create validation metric dataframe
metricDf_val = pd.DataFrame()
metricDf_val["JacSim"] = output_data_jacs_val[0]
metricDf_val["CosSim"] = output_data_cos_val[0]
metricDf_val["LevDis"] = output_data_lev_val[0]
metricDf_val["RMSE"] = output_data_rmse_val[0]
metricDf_val["MAE"] = output_data_mae_val[0]
metricDf_val["NumRec"] = metricDf_val.index + 1

In [25]:
# Store as csv
metricDf_val.to_csv("valMetrics.csv")

In [34]:
# Create test metric dataframe
metricDf = pd.DataFrame()
metricDf["JacSim"] = output_data_jacs[0]
metricDf["CosSim"] = output_data_cos[0]
metricDf["LevDis"] = output_data_lev[0]
metricDf["RMSE"] = output_data_rmse[0]
metricDf["MAE"] = output_data_mae[0]
metricDf["NumRec"] = metricDf.index + 1

In [24]:
# Store as csv
metricDf.to_csv("metrics.csv")

### Testing: Given a cart, average cosine sim for each item and display the optimal n (5)

In [25]:
# FINAL SYSTEM - USED LIVE AND INCLUDES ALL GIVEN DATA

n=5 # 5 pizza recommendations

# Initialize and fit the model using best params
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(itemitem.values)

# Store model values
distances, indices = knn.kneighbors(itemitem.values, n_neighbors=n+1) #+1 for removal of self similarity

# Get a map of the pizza
map_pizza = itemitem.index.to_list()



In [26]:
# TEST:
# Initialize a cart and number of recommendations
cart = ["bbq_ckn", "hawaiian", "five_cheese"]
n = 5
# Iterate through the cart
for i,item in enumerate(cart):
    # Select the overall index of the pizza type in the cart
    idx = pizza_types.index(item)
    # Create a dataframe of model results at this overall index
    currDf = pd.DataFrame()
    currDf["Distance"] = distances[idx]
    currDf["Indices"] = indices[idx]
    # Sort by indices
    currDf = currDf.sort_values(by="Indices")
    # Create total dataframe output
    if(i==0):
        totalDf = currDf
    else:
        totalDf = totalDf.append(currDf)

# Group the output dataframe by the indices(which map to pizza types) and get the mean distance, then sort by distance ascending
distanceDf = totalDf.groupby("Indices").agg({"Distance":"mean"}).sort_values(by="Distance").reset_index() 
# Select the pizzas (not in the cart) that are closest/most recommended
all_recs = [(pizza_types[i], 1-d) for i,d in zip(distanceDf.Indices,distanceDf.Distance) if pizza_types[i] not in cart]
# Diplay the top n pizzas and their similarity
all_recs[:n] # similarity shown

  totalDf = totalDf.append(currDf)
  totalDf = totalDf.append(currDf)


[('cali_ckn', 0.13534339737958456),
 ('thai_ckn', 0.1306199792665136),
 ('pepperoni', 0.12759280601912637),
 ('southw_ckn', 0.1273504991106329),
 ('classic_dlx', 0.12360246863054258)]