This

In [1]:
import pandas as pd

In [None]:
df = pd.read_excel('Online Retail.xlsx')

In [None]:
df.head()

Because our goal is to recommend products purchased together by examining the frequency by which different items are purchased together we only need information that identifies individual orders and individual products.  Because it will be more convenient for display purposes we will also use the Description.  We don't need the rest of the columns for this project.

We will keep two DataFrames for this.

One for Building the recommendation system with the following features:
- `InvoiceNo`
- `StockCode`

And one for matching the description to the `StockCode`:
- `StockCode`
- `Description`



In [89]:
# Modify StockCode to always be a string

# Prepend '_' to StockCode
df['StockCode'] = df['StockCode'].apply(lambda x: '_'+str(x))

In [91]:
# DataFrame for building the recommendation system
orders = df[['InvoiceNo', 'StockCode']]
orders.head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,_85123A
1,536365,_71053
2,536365,_84406B
3,536365,_84029G
4,536365,_84029E


In [418]:
# DataFrame for retrieving product descriptions
products = df[['StockCode', 'Description']].copy()
products.head()

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.


In [419]:
products['StockCode'] = products['StockCode'].str.upper()

In [420]:
products = products[~products.duplicated()]
products.head()

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.


In [421]:
products[products['StockCode'] == '_23236']

Unnamed: 0,StockCode,Description
218408,_23236,DOILEY STORAGE TIN
220496,_23236,DOILEY BISCUIT TIN
290770,_23236,STORAGE TIN VINTAGE DOILEY
292790,_23236,STORAGE TIN VINTAGE DOILY


In [422]:
# Drop descriptions that are not uppercase
products = products[
    products['Description'].str.upper() == products['Description']
]

In [423]:
# Keep only the first Description of each product
products = products[~products.duplicated(subset=['StockCode'])]
products

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.
...,...,...
504104,_23561,SET OF 6 RIBBONS PARTY
507867,_23609,SET 10 CARDS SNOWY ROBIN 17099
512588,_23617,SET 10 CARDS SWIRLY XMAS TREE 17104
527065,_90214U,"LETTER ""U"" BLING KEY RING"


In [424]:
# Set the index to StockCode
products = products.set_index('StockCode')

# Convert to Series for eve easier lookups
products = products['Description']

In [425]:
# Test it out
products['_21755']

'LOVE BUILDING BLOCK WORD'

In [426]:
# Number of unique products
len(products)

3796

# Number of orders

In [192]:
orders

Unnamed: 0,InvoiceNo,StockCode
0,536365,"[_85123A, _71053, _84406B, _84029G, _84029E, _..."
1,536366,"[_22633, _22632]"
2,536367,"[_84879, _22745, _22748, _22749, _22310, _8496..."
3,536368,"[_22960, _22913, _22912, _22914]"
4,536369,[_21756]
...,...,...
25895,C581484,[_23843]
25896,C581490,"[_22178, _23144]"
25897,C581499,[_M]
25898,C581568,[_21258]


In [188]:
# total number of orders
orders['InvoiceNo'].nunique()

25900

In [190]:
# orders with more than one item
num_items_in_order = orders.groupby('InvoiceNo').count()
num_items_in_order.columns = ['Count']
num_items_in_order

Unnamed: 0_level_0,Count
InvoiceNo,Unnamed: 1_level_1
536365,1
536366,1
536367,1
536368,1
536369,1
...,...
C581484,1
C581490,1
C581499,1
C581568,1


In [191]:
len(num_items_in_order[num_items_in_order['Count'] > 1])

0

There are 20k orders with more than one product.  That is about 80% of all orders.  People in this store often buy items together.  We are going to help new customers out by showing them which products are commonly purchased together.

# Restructure the data
We would like each Invoice Number to give us a list of stock codes.

In [102]:
def string_list(x):
    return [str(i) for i in x]

orders = orders.groupby('InvoiceNo')['StockCode'].apply(list).reset_index()
orders.head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,"[_85123A, _71053, _84406B, _84029G, _84029E, _..."
1,536366,"[_22633, _22632]"
2,536367,"[_84879, _22745, _22748, _22749, _22310, _8496..."
3,536368,"[_22960, _22913, _22912, _22914]"
4,536369,[_21756]


In [27]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25900 entries, 0 to 25899
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   InvoiceNo  25900 non-null  object
 1   StockCode  25900 non-null  object
dtypes: object(2)
memory usage: 404.8+ KB


In [49]:
for item in orders['StockCode'][0]:
    print(type(item))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [103]:
from mlxtend.preprocessing import TransactionEncoder

In [248]:
te = TransactionEncoder()

te.fit(orders['StockCode'])
orders_1hot = te.transform(orders['StockCode'])


In [251]:
orders_1hot = pd.DataFrame(orders_1hot, columns =te.columns_)
orders_1hot.head()

Unnamed: 0,_10002,_10080,_10120,_10123C,_10123G,_10124A,_10124G,_10125,_10133,_10134,...,_M,_PADS,_POST,_S,_gift_0001_10,_gift_0001_20,_gift_0001_30,_gift_0001_40,_gift_0001_50,_m
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Apriori

In [None]:
from mlxtend.frequent_patterns import apriori

In [196]:
%%timeit -n1 -r1

apriori(orders_1hot, min_support=0.01, use_colnames=True)

2min 52s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [277]:
is_ap = apriori(orders_1hot, min_support=0.01, use_colnames=True)

In [278]:
is_ap.head()

Unnamed: 0,support,itemsets
0,0.020193,(_15036)
1,0.012587,(_15056BL)
2,0.017876,(_15056N)
3,0.011236,(_16237)
4,0.01251,(_20675)


In [279]:
def itemset_to_ordered_string(itemset):
    return ','.join(sorted(list(itemset)))

itemset_to_ordered_string(is_ap['itemsets'][0])

'_15036'

In [295]:
ap_itemset_strings = is_ap['itemsets'].apply(itemset_to_ordered_string)
ap_itemset_strings = ap_itemset_strings.sort_values().reset_index(drop=True)
ap_itemset_strings

0         _15036
1       _15056BL
2        _15056N
3         _16237
4         _20675
          ...   
1082      _85152
1083     _85199S
1084        _DOT
1085          _M
1086       _POST
Name: itemsets, Length: 1087, dtype: object

In [None]:
from mlxtend.frequent_patterns import fpgrowth

In [229]:
%%timeit -n1 -r1

frequent_itemsets = fpgrowth(df_ary, min_support=0.01, use_colnames=True)#, max_len=2)

5.43 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [270]:
is_fp = fpgrowth(df_ary, min_support=0.01, use_colnames=True)#, max_len=2)

In [294]:
fp_itemset_strings = is_fp['itemsets'].apply(itemset_to_ordered_string)
fp_itemset_strings = fp_itemset_strings.sort_values().reset_index(drop=True)
fp_itemset_strings

0         _15036
1       _15056BL
2        _15056N
3         _16237
4         _20675
          ...   
1082      _85152
1083     _85199S
1084        _DOT
1085          _M
1086       _POST
Name: itemsets, Length: 1087, dtype: object

In [296]:
fp_itemset_strings.equals(ap_itemset_strings)

True

In [None]:
# try sorting the sets to see if they are the same.

# Calculate Association Rules

In [None]:
from mlxtend.frequent_patterns import association_rules

In [299]:
association_rules(frequent_itemsets, metric="support", min_threshold=0.01)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(_84879),(_85123A),0.056680,0.086718,0.012510,0.220708,2.545124,0.007595,1.171939
1,(_85123A),(_84879),0.086718,0.056680,0.012510,0.144256,2.545124,0.007595,1.102340
2,(_84879),(_22423),0.056680,0.083861,0.010425,0.183924,2.193197,0.005672,1.122614
3,(_22423),(_84879),0.083861,0.056680,0.010425,0.124309,2.193197,0.005672,1.077230
4,(_21755),(_21754),0.024363,0.030386,0.011467,0.470681,15.490025,0.010727,1.831815
...,...,...,...,...,...,...,...,...,...
1337,(_23293),(_23295),0.021506,0.017336,0.011853,0.551167,31.793373,0.011480,2.189376
1338,(_23296),(_23293),0.014826,0.021506,0.010077,0.679688,31.604859,0.009758,3.054811
1339,(_23293),(_23296),0.021506,0.014826,0.010077,0.468582,31.604859,0.009758,1.853857
1340,(_23355),(_22112),0.030502,0.033436,0.010463,0.343038,10.259450,0.009443,1.471263


In [300]:
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(_22745),(_22748),0.016448,0.016988,0.012124,0.737089,43.387751,0.011844,3.738955
1,(_22748),(_22745),0.016988,0.016448,0.012124,0.713636,43.387751,0.011844,3.434626
2,(_22726),(_22727),0.038726,0.041737,0.024942,0.644068,15.431412,0.023326,2.692261
3,(_22727),(_22726),0.041737,0.038726,0.024942,0.597595,15.431412,0.023326,2.388821
4,(_22728),(_22727),0.030849,0.041737,0.018417,0.596996,14.303610,0.017129,2.377801
...,...,...,...,...,...,...,...,...,...
370,(_23322),(_23321),0.022124,0.024440,0.011544,0.521815,21.350725,0.011004,2.040131
371,(_23343),(_23344),0.018842,0.027915,0.012510,0.663934,23.784096,0.011984,2.892545
372,(_23295),(_23293),0.017336,0.021506,0.011853,0.683742,31.793373,0.011480,3.093971
373,(_23293),(_23295),0.021506,0.017336,0.011853,0.551167,31.793373,0.011480,2.189376


In [358]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=10)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(_21755),(_21754),0.024363,0.030386,0.011467,0.470681,15.490025,0.010727,1.831815
1,(_21754),(_21755),0.030386,0.024363,0.011467,0.377382,15.490025,0.010727,1.566993
2,(_22745),(_22748),0.016448,0.016988,0.012124,0.737089,43.387751,0.011844,3.738955
3,(_22748),(_22745),0.016988,0.016448,0.012124,0.713636,43.387751,0.011844,3.434626
4,(_22726),(_22727),0.038726,0.041737,0.024942,0.644068,15.431412,0.023326,2.692261


In [359]:
rules.shape

(736, 9)

In [430]:
rules['antecedents'].value_counts()[10:20]

(_20724)             13
(_20725)             12
(_20726)             12
(_20723)             12
(_20719)             12
(_22384)             11
(_85099B, _21931)    10
(_22356)             10
(_21928)              9
(_22386, _85099B)     9
Name: antecedents, dtype: int64

In [360]:
# recomendations for (_84879)
recs = ar[ar['antecedents'] == {'_23293'}].consequents.apply(lambda x: next(iter(x)))
recs

731    _23295
733    _23296
Name: consequents, dtype: object

In [173]:
print('Antecedent:', products.loc['_23293'][0])

print()
print('Recommendations:')
for _, rec in recs[:6].iteritems():
    print(products.loc[rec][0])

Antecedent: SET OF 12 FAIRY CAKE BAKING CASES

Recommendations:
SET OF 12 MINI LOAF BAKING CASES
SET OF 6 TEA TIME BAKING CASES


In [398]:
def predict(antecedent, rules, max_results= 6):
    
    # get the rules for this antecedent
    preds = rules[rules['antecedents'] == antecedent]
    
    # a messy way to convert a frozen set with one element to string
    preds = preds['consequents'].apply(iter).apply(next)
    
    return preds[:max_results].reset_index(drop=True)

In [399]:
preds = predict({'_20727'}, rules)
preds

0    _20725
1    _22383
2    _22382
3    _22382
4    _22383
5    _20725
Name: consequents, dtype: object

In [387]:
# products = products.set_index('StockCode')

In [429]:
print(products['_20727'])

LUNCH BAG  BLACK SKULL.


In [428]:
for stockid in preds:  
    print(products[stockid])

LUNCH BAG RED RETROSPOT
LUNCH BAG SUKI  DESIGN 
LUNCH BAG SPACEBOY DESIGN 
LUNCH BAG SPACEBOY DESIGN 
LUNCH BAG SUKI  DESIGN 
LUNCH BAG RED RETROSPOT


Try another product.

In [431]:
print(products['_21928'])

JUMBO BAG SCANDINAVIAN PAISLEY


In [432]:
# get the predictions
preds = predict({'_21928'}, rules)

# Display the descriptions of the predictions
for stockid in preds:  
    print(products[stockid])

JUMBO SHOPPER VINTAGE RED PAISLEY
JUMBO  BAG BAROQUE BLACK WHITE
JUMBO BAG WOODLAND ANIMALS
JUMBO BAG PINK VINTAGE PAISLEY
JUMBO SHOPPER VINTAGE RED PAISLEY
JUMBO BAG PINK POLKADOT
