In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from Levenshtein import distance as levenshtein_distance
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error

In [2]:
od = pd.read_csv("pizza_sales/order_details.csv")
o = pd.read_csv("pizza_sales/orders.csv")
pt = pd.read_csv("pizza_sales/pizza_types.csv", encoding= 'unicode_escape') # codec decode error
p = pd.read_csv("pizza_sales/pizzas.csv")

In [3]:
ood = o.set_index('order_id').join(od.set_index('order_id')).reset_index()

In [4]:
ppt = p.set_index('pizza_type_id').join(pt.set_index('pizza_type_id')).reset_index()

In [5]:
df = ood.set_index('pizza_id').join(ppt.set_index('pizza_id')).reset_index().sort_values(by="order_details_id").reset_index(drop=True)

In [6]:
df.describe()

Unnamed: 0,order_id,order_details_id,quantity,price
count,48620.0,48620.0,48620.0,48620.0
mean,10701.479761,24310.5,1.019622,16.494132
std,6180.11977,14035.529381,0.143077,3.621789
min,1.0,1.0,1.0,9.75
25%,5337.0,12155.75,1.0,12.75
50%,10682.5,24310.5,1.0,16.5
75%,16100.0,36465.25,1.0,20.25
max,21350.0,48620.0,4.0,35.95


In [7]:
from pandas_profiling import ProfileReport
prof = ProfileReport(df)
prof.to_file(output_file='pizzaData.html')

  from pandas_profiling import ProfileReport


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return func(*args, **kwargs)


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
groups = df.groupby("order_id").agg({"pizza_type_id":"count"}).reset_index()
multiOrders = groups[groups.pizza_type_id>1].order_id.values

In [8]:
# multi-order dataframe
moDf = df[df.order_id.isin(multiOrders)].reset_index(drop=True)

## Ways to recommend:
### 1) User Similarity (similar prices and times)
### 2) Association (similar orders)
### 3) Item-item similarity * noted as best way from class notes
### 4) Popularity

### From class notes, best recommendation type is item to item
### In our case, this means pizzas that go with pizzas, not order

### First: trying simple co-occurence of pizza type using cosine similarity

In [10]:
simple = moDf[["order_id", "quantity", "pizza_type_id"]]
copyDf = simple.copy()
#copyDf = copyDf[copyDf.quantity==1]
copyDf['quantity'] = 1

In [16]:
copyDf = copyDf.drop_duplicates().reset_index(drop=True)

In [17]:
itemitem = copyDf.pivot(index='pizza_type_id', columns='order_id', values='quantity').fillna(0)

### Use training and testing set

In [18]:
ii = itemitem.transpose().reset_index()

In [19]:
map_pizza_test = itemitem.index.to_list()

In [20]:
pizza_types = itemitem.index.to_list()

In [21]:
# 100 random splits, independent on previous splits
plot_jacs = []
plot_cos = []
plot_lev = []
plot_rmse = []
plot_mae = []

for i in range(100):
    print(i)
    # Select the random sampled training set
    train_size = 0.75
    ii_train = ii.sample(n=int(np.floor(len(ii)*train_size)))
    ii_test = ii[~ii['order_id'].isin(ii_train["order_id"])]
    ii_train = ii_train.set_index('order_id').transpose()
    ii_test = ii_test.set_index('order_id').transpose()

    # Fit the nearest neighbors model using cosine similarity on the training set
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(ii_train.values)
    distances_train, indices_train = knn.kneighbors(ii_train.values, n_neighbors=len(ii_train))


    # Make n recommendations and calculate metrics
    ns = np.arange(1,15)
    all_mins = [] # store the minimum similarities
    outer_jacs = []
    outer_cos = []
    outer_lev = []
    outer_rmse = []
    outer_mae = []
    for n in ns:
        #all_precs = []
        all_jacs = [] # jaccard similarity
        all_cos = [] # cosine similarity
        all_lev = [] # lev distance = edit distance
        all_rmse = [] # RMSE
        all_mae = [] # mae
        idx = 0
        for i in pizza_types:
            bbq_ckn = ii_test.transpose()[ii_test.transpose()[i]==1]

            # Find the counts
            counts = []
            for i in bbq_ckn.columns:
                counts.append(sum(bbq_ckn[i]))

            # Create dataframe
            result_df = pd.DataFrame()
            result_df["Pizza"] = bbq_ckn.columns
            result_df["Count"] = counts

            # Iterate through different pizzas
            curr = indices_train[idx]
            dist = distances_train[idx]
            idx += 1

            # Extract predicted vs true
            recPizzas = [map_pizza_test[j] for j in curr[1:n+1]]
            truePizzas = list(result_df.sort_values("Count", ascending=False).Pizza.iloc[:n].values)

            # Get jaccard similarity
            correct = [i for i in recPizzas if i in truePizzas]
            combined = recPizzas.copy()
            combined.extend(truePizzas)
            union = set(combined)
            jac = len(correct)/len(union)
            all_jacs.append(jac)

            # Get cosine similarity of results
            # Processing assistance found: https://stackoverflow.com/questions/28819272/python-how-to-calculate-the-cosine-similarity-of-two-word-lists
            # count word occurrences
            a_vals = Counter(recPizzas)
            b_vals = Counter(truePizzas)

            # convert to word-vectors
            words  = list(a_vals.keys() | b_vals.keys())
            a_vect = [a_vals.get(word, 0) for word in words]
            b_vect = [b_vals.get(word, 0) for word in words]  
            cos = cosine_similarity(np.array(a_vect).reshape(1, -1), np.array(b_vect).reshape(1, -1))
            all_cos.append(cos)

            # Levenshtein distance
            all_lev.append(levenshtein_distance(truePizzas, recPizzas))

            # Ranked rmse
            allTruePizzas = list(result_df.sort_values("Count", ascending=False).Pizza.values)
            compareDf = pd.DataFrame()
            compareDf["Pizza"] = list(union)
            rec_ranks = []
            true_ranks = []
            for pizza in union:
                try:
                    rec_ranks.append(recPizzas.index(pizza)+1)
                except:
                    rec_ranks.append(np.nan)
                try:
                    true_ranks.append(allTruePizzas.index(pizza))
                except:
                    true_ranks.append(np.nan)
            compareDf["RecRank"] = rec_ranks
            compareDf["TrueRank"] = true_ranks
            compareDf = compareDf.dropna()
            rmse = sqrt(mean_squared_error(compareDf.TrueRank, compareDf.RecRank))
            all_rmse.append(rmse)
            
            # mae score
            mae = mean_absolute_error(compareDf.TrueRank, compareDf.RecRank)
            all_mae.append(mae)

        outer_jacs.append(np.array(all_jacs).mean())
        outer_cos.append(np.array(all_cos).mean())
        outer_lev.append(np.array(all_lev).mean())
        outer_rmse.append(np.array(all_rmse).mean())
        outer_mae.append(np.array(all_mae).mean())

    plot_jacs.append(np.array([np.array(outer_jacs)]))
    plot_cos.append(np.array([np.array(outer_cos)]))
    plot_lev.append(np.array([np.array(outer_lev)]))
    plot_rmse.append(np.array([np.array(outer_rmse)]))
    plot_mae.append(np.array([np.array(outer_mae)]))
    
# Can perform mean of means here because we care about the inner mean value, and want the mean of it.
output_data_jacs = np.array(plot_jacs).mean(axis=0)
output_data_cos = np.array(plot_cos).mean(axis=0)
output_data_lev = np.array(plot_lev).mean(axis=0)
output_data_rmse = np.array(plot_rmse).mean(axis=0)
output_data_mae = np.array(plot_mae).mean(axis=0)
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [22]:
metricDf = pd.DataFrame()
metricDf["JacSim"] = output_data_jacs[0]
metricDf["CosSim"] = output_data_cos[0]
metricDf["LevDis"] = output_data_lev[0]
metricDf["RMSE"] = output_data_rmse[0]
metricDf["MAE"] = output_data_mae[0]
metricDf["NumRec"] = metricDf.index + 1

In [23]:
metricDf

Unnamed: 0,JacSim,CosSim,LevDis,RMSE,MAE,NumRec
0,0.0,0.0,1.0,4.845625,4.845625,1
1,0.070729,0.106094,1.905625,5.292694,4.594531,2
2,0.143094,0.222708,2.772813,5.235412,4.296563,3
3,0.222232,0.338672,3.623125,5.33226,4.232031,4
4,0.291158,0.429688,4.481562,5.374591,4.20625,5
5,0.352357,0.505313,5.347188,5.483417,4.261927,6
6,0.405196,0.563973,6.207187,5.562849,4.322321,7
7,0.44975,0.609766,7.09,5.636389,4.386758,8
8,0.488093,0.64684,7.953125,5.693771,4.435729,9
9,0.524177,0.679375,8.804375,5.787543,4.509031,10


In [24]:
metricDf.to_csv("metrics.csv")

In [79]:
# Select where we don't see as drastic of a rise in error = 5 pizzas

### Given a cart, average cosine sim for each item and display the optimal n (5)

In [25]:
# FINAL SYSTEM - USED LIVE AND INCLUDES ALL GIVEN DATA

n=5 # 6 pizza recommendations

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(itemitem.values)
distances, indices = knn.kneighbors(itemitem.values, n_neighbors=n+1) #+1 for removal of self similarity

map_pizza = itemitem.index.to_list()



In [26]:
cart = ["bbq_ckn", "hawaiian", "five_cheese"]
n = 5
for i,item in enumerate(cart):
    idx = map_pizza_test.index(item)
    currDf = pd.DataFrame()
    currDf["Distance"] = distances[idx]
    currDf["Indices"] = indices[idx]
    currDf = currDf.sort_values(by="Indices")
    if(i==0):
        totalDf = currDf
    else:
        totalDf = totalDf.append(currDf)

distanceDf = totalDf.groupby("Indices").agg({"Distance":"mean"}).sort_values(by="Distance").reset_index() 
all_recs = [(map_pizza_test[i], 1-d) for i,d in zip(distanceDf.Indices,distanceDf.Distance) if map_pizza_test[i] not in cart]
all_recs[:n] # similarity shown

  totalDf = totalDf.append(currDf)
  totalDf = totalDf.append(currDf)


[('cali_ckn', 0.13534339737958456),
 ('thai_ckn', 0.1306199792665136),
 ('pepperoni', 0.12759280601912637),
 ('southw_ckn', 0.1273504991106329),
 ('classic_dlx', 0.12360246863054258)]