### imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import time
from sys import getsizeof

### objectives:
1. get frequent pairs & triples
2. calculate support, confidence, lift
3. calculate based on the ECLAT algorithm

In [2]:
# load the (order_id, product_id) rows from the mini order/products extract
op = pd.read_csv("order_products_mini.csv")

# preview the first five rows
op.head(5)

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


### Thresholds

These minimum-count thresholds are set up front and used later in the code to prune candidates. For example, when counting pairs of items commonly sold together, we skip any pair containing an item that was ordered fewer than `double_threshold` times.

In [3]:
# minimum counts used when filtering single items, pairs and triples respectively
single_threshold, double_threshold, triple_threshold = 500, 200, 100

Create a dictionary that maps each order id to the list of product ids in that order.

In [4]:
# map each order_id to the list of product_ids it contains
orders = defaultdict(list)

# Address the two columns by name instead of by position in `op.values`, so the
# grouping does not silently break if the column order changes or an extra
# column is present. The previously unused enumerate index is dropped.
for order_id, product_id in zip(op['order_id'], op['product_id']):
    orders[order_id].append(product_id)

In [5]:
# show the basket of one example order (label and list on separate lines)
print('products ordered in order_id=2', orders[2], sep='\n')

products ordered in order_id=2
[33120, 28985, 9327, 45918, 30035, 17794, 40141, 1819, 43668]


----------------------------------------
Create a dictionary that stores, for each product, the number of times it was ordered.

In [6]:
# tally how many times each product id appears across all orders
single_counts = defaultdict(int)
for basket in orders.values():
    for product in basket:
        single_counts[product] += 1

In [7]:
# show the order count of one example product (label and value on separate lines)
print('number of times product_id=28985 has been ordered', single_counts[28985], sep='\n')

number of times product_id=28985 has been ordered
400


### display products with top sales

In [8]:
# convert dict into a two-column frame (product_id, sales) sorted by sales descending
top_single_counts = (
    pd.Series(single_counts)
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'index': 'product_id', 0: 'sales'})
)

# keep only items sold at least `single_threshold` times
top_single_counts = top_single_counts[top_single_counts['sales'] >= single_threshold]

# display() returns None, so writing `display(...), shape` on one line made the
# cell's result a tuple starting with None. Render the table explicitly and
# leave the shape as the cell's last expression instead.
display(top_single_counts.head())
top_single_counts.shape

Unnamed: 0,product_id,sales
0,24852,2837
1,13176,2247
2,21137,1469
3,21903,1404
4,47209,1254


(None, (18, 2))

In [9]:
# initialize the pair counts dictionary: (item_a, item_b) sorted tuple -> co-occurrence count
pair_counts = defaultdict(int)

# loop through all orders (only the baskets are needed; the previous loop
# variable `id` shadowed the builtin of the same name in the notebook namespace)
for order in orders.values():

    # If the count of an item is below the minimum threshold, then no pair
    # containing that item can reach the threshold, so such items are dropped
    # once per order instead of being re-checked inside the inner loop.
    frequent_items = [item for item in order if single_counts[item] >= double_threshold]

    # count every unordered pair of the remaining items exactly once
    for i, first in enumerate(frequent_items):
        for second in frequent_items[i + 1:]:
            # sort the pair so (a, b) and (b, a) share one key
            pair_counts[tuple(sorted((first, second)))] += 1
    


In [10]:
# look up the co-occurrence count of one example pair; the bare last expression is the cell output
print('number of times product_ids (13176, 27966) were in the same order')
pair_counts[13176, 27966]

number of times product_ids (13176, 27966) were in the same order


244

### display most commonly sold pairs of products

In [11]:
# flatten the (item1, item2) -> count mapping into a frame sorted by count descending
pair_sales = (
    pd.Series(pair_counts)
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'level_0': 'item1', 'level_1': 'item2', 0: 'sales'})
)

pair_sales.head()

Unnamed: 0,item1,item2,sales
0,13176,47209,349
1,21137,24852,340
2,13176,21137,328
3,21903,24852,325
4,24852,47766,314


In [12]:
# initialize the triple counts dictionary: (a, b, c) sorted tuple -> co-occurrence count
triple_counts = defaultdict(int)


def _pair_support(item_a, item_b):
    """Return the recorded co-occurrence count of an unordered pair (0 if absent).

    Uses ``.get`` rather than ``pair_counts[...]``: ``pair_counts`` is a
    defaultdict, so a plain subscript would insert a zero entry for every pair
    that is merely looked up, growing the dictionary and adding zero-count pairs
    to anything computed from it afterwards.
    """
    return pair_counts.get(tuple(sorted((item_a, item_b))), 0)


# NOTE(review): pair_counts only holds pairs whose two items each reached
# `double_threshold`. If `triple_threshold` is lower than `double_threshold`,
# pairs of less frequent items read as 0 here and their triples are pruned.

# loop through all orders (only the baskets are needed; the previous loop
# variable `id` shadowed the builtin of the same name in the notebook namespace)
for order in orders.values():

    # loop through items in order
    for i in range(len(order)):

        # if the count of that item is less than the minimum threshold, then no
        # triple containing it can reach the threshold, so skip it
        if single_counts[order[i]] < triple_threshold:
            continue

        # loop through the rest of the items which are after the current i pointer
        for j in range(i + 1, len(order)):

            # if the (i, j) pair is below the threshold, no triple containing
            # that pair can reach the threshold, so skip it
            if _pair_support(order[i], order[j]) < triple_threshold:
                continue

            # loop through the rest of the items which are after the current j pointer
            for k in range(j + 1, len(order)):

                # the two remaining pairs of the triple must also reach the threshold
                if _pair_support(order[j], order[k]) < triple_threshold:
                    continue
                if _pair_support(order[i], order[k]) < triple_threshold:
                    continue

                # all three pairs are above the threshold: count the triple,
                # sorted so every ordering of the same items shares one key
                triple_counts[tuple(sorted((order[i], order[j], order[k])))] += 1
            
            

### display most commonly sold triplets of products

In [13]:
# flatten the (item1, item2, item3) -> count mapping into a frame sorted by count descending
triple_sales = (
    pd.Series(triple_counts)
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'level_0': 'item1', 'level_1': 'item2', 'level_2': 'item3', 0: 'sales'})
)

triple_sales.head()

Unnamed: 0,item1,item2,item3,sales
0,13176,27966,47209,77
1,13176,21137,47209,69
2,21137,21903,24852,60
3,13176,21137,21903,59
4,13176,21137,27966,58


## Lift
For 2 items A & B, $\text{lift}(A,B) = \dfrac{P(A,B)}{P(A)\,P(B)}$

Lift is a symmetric (two-way) metric, similar to correlation: just as corr(A,B) = corr(B,A), lift(A,B) = lift(B,A).

In [14]:
# number of distinct orders; used to scale the pair lift computed from raw counts
num_orders = len(orders.keys())

In [15]:
# calculate lift for all item pairs
pair_lifts = defaultdict(int)
for pair,count in pair_counts.items():
    pair_lifts[pair] = count / ( single_counts[pair[0]]*single_counts[pair[1]] ) * num_orders

In [16]:
# flatten the (item1, item2) -> lift mapping into a frame sorted by lift descending
lifts = (
    pd.Series(pair_lifts)
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'level_0': 'item1', 'level_1': 'item2', 0: 'lift'})
)

lifts.head()

Unnamed: 0,item1,item2,lift
0,35221,44632,11.759346
1,20114,28842,10.470597
2,15290,24184,7.915901
3,28842,44142,7.292571
4,9076,43352,7.258404


## Confidence
For 2 items A & B, $\text{confidence}(A \to B) = \dfrac{P(A,B)}{P(A)}$

Confidence is a directional (one-way) metric: in general, confidence(A → B) ≠ confidence(B → A).

In [23]:
# confidence of each directed rule, keyed by (antecedent, consequent)
pair_confidence = defaultdict(int)

# loop through each item pair; the enumerate index was never used, so it is
# dropped, and the pair is unpacked into named items for readability
for (item1, item2), count in pair_counts.items():

    # confidence(item1 -> item2) = count(item1, item2) / count(item1)
    pair_confidence[(item1, item2)] = count / single_counts[item1]

    # confidence(item2 -> item1) = count(item1, item2) / count(item2)
    pair_confidence[(item2, item1)] = count / single_counts[item2]

In [24]:
# convert dict into pandas series and sort descending
confidence = pd.Series(pair_confidence).sort_values(ascending=False) \
                                    .reset_index().rename({'level_0':'item1','level_1':'item2',
                                                           0:'confidence'},axis=1)

confidence.head()

Unnamed: 0,item1,item2,confidence
0,41787,24852,0.37931
1,28204,24852,0.37428
2,45066,24852,0.356415
3,8174,13176,0.355649
4,49683,24852,0.350174
