VzW Example: Market Basket Analysis  
sourced from VzW dataset  
pull dates: Jan 01 through Jan 07 2018  
~269k observations (269,100 data rows loaded from the sample file)

In [1]:
##############################################
# VzW Example: Market Basket Analysis        #
# Alvaro Muir, IT Analytics Data Engineering #
# Feb 15 2018                                #
##############################################

import re

import pandas as pd  # imported in this cell so a fresh kernel can run the notebook top to bottom

#
#  Configuration
#
SRC_DIR = "../data"
DATASET = "data-01-01-18"
src     = ''.join([SRC_DIR, '/', DATASET, '-01-07-18_sample-269.1k', '.csv.bz2'])
LIMIT   = 299999  # maximum number of data rows to read from the CSV

df = pd.read_csv(src, compression='bz2', nrows=LIMIT)
print("Total rows including the header: {}".format(df.shape[0] + 1))

#
#  General cleanup
#

# Drop columns that contain no data at all.
df = df.dropna(axis=1, how='all')

# Drop every column whose name ends with one of these suffixes.
# str.endswith needs a *tuple* to test several suffixes; an expression such as
# `'_dt' or '_rev' or '_step'` evaluates to just '_dt'.
DROP_SUFFIXES = ('_dt', '_rev', '_step')
df = df.drop([col for col in df.columns if col.endswith(DROP_SUFFIXES)], axis=1)
df = df.fillna(0)

# Lowercase all text columns.
# NOTE(review): .str.lower() yields NaN for non-string entries, so the 0 fill
# values written by fillna(0) above become NaN again in object columns.
# Confirm whether that is intended before reordering these two steps.
for c in df.columns:
    if df[c].dtype == 'object':
        df[c] = df[c].str.lower()

# Drop columns that hold fewer than two distinct values (NaN counts as a value).
constant_cols = [c for c in df.columns if df[c].nunique(dropna=False) < 2]
df = df.drop(constant_cols, axis=1)

# Strip a leading 'a.' / 'b.' / 'c.' alias from the column names. Only the
# prefix is removed, so names that merely contain one of these sequences
# elsewhere are left intact.
df.columns = [re.sub(r'^[abc]\.', '', col) for col in df.columns.tolist()]

# Render the ids as strings. Only a trailing '.0' (float formatting) is removed,
# so ids without a decimal part that end in zero (e.g. '1230') are preserved.
df['cust_id'] = df['cust_id'].astype('str').map(lambda x: re.sub(r'\.0+$', '', x))
df['invc_num'] = df['invc_num'].astype('str')

# Equipment descriptions excluded from the analysis.
to_remove = ['apple tablet', 'android tablet', 'windows tablet',
             'verizon wireless home phone',
             'ip virtual devices']

df = df[~df['eqp_desc'].isin(to_remove)]

Total rows including the header: 269101


In [2]:
# Keep only the columns needed for the basket analysis. The result is an
# explicit copy, so later changes to `purchases` never act on a view of `df`
# (an in-place dropna on a slice triggers SettingWithCopyWarning).
# NOTE(review): invc_num is cast to str in the load cell, so missing invoice
# numbers are probably no longer NaN here and this dropna may be a no-op --
# confirm, and filter the placeholder value instead if needed.
purchases = (df[['cust_id', 'invc_num', 'device_prod_nm', 'item_desc', 'sales_qty']]
             .dropna(axis=0, subset=['invc_num'])
             .copy())

customers = purchases.cust_id.unique()
invoices = purchases.invc_num.unique()
devices  = purchases.device_prod_nm.unique()

# Item descriptions contain irregular runs of whitespace. Collapse them and
# de-duplicate *after* normalising (first-seen order is kept), so two spellings
# of the same item are counted once. Missing descriptions are skipped.
items = list(dict.fromkeys(
    ' '.join(desc.split())
    for desc in purchases.item_desc.dropna().unique()
))

print('unique counts')
pd.DataFrame(data={'customers': [len(customers)],
                   'invoices': [len(invoices)],
                   'devices': [len(devices)],
                   'items': [len(items)]})


unique counts


Unnamed: 0,customers,devices,invoices,items
0,122682,808,154048,1759


In [3]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Build a 'basket' for inspection: one row per invoice, one column per item,
# each cell holding the summed sales_qty (0 where the invoice lacks the item).
basket = (purchases[['invc_num', 'item_desc', 'sales_qty']]
          .groupby(['invc_num', 'item_desc'])['sales_qty']
          .sum()
          .unstack()
          .fillna(0))

# One-hot encode the basket: 1 when the summed quantity is positive, else 0.
# A vectorised comparison replaces the element-wise applymap/lambda, which
# calls Python once per cell and is deprecated in recent pandas releases.
basket_sets = (basket > 0).astype('int64')

# Itemsets that occur in at least 1% of invoices.
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.019332,[blu apple airpods]
1,0.016346,[bpk replenishment of prepaid cards]
2,0.015229,[cas ob defender iphone7 black]
3,0.01025,[cas ob symmetry iphone7 black]
4,0.039637,[chg vzw 24a lightning tvl]


In [4]:
# Derive association rules from the frequent itemsets, keeping those whose
# lift is at least 1.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
rules

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(scr zagg invisibleshieldglass iphone7),(cla vzw 24a lightning vpc),0.093841,0.116837,2.199505
1,(cla vzw 24a lightning vpc),(scr zagg invisibleshieldglass iphone7),0.05312,0.206404,2.199505
2,(cla vzw usbc pd20 vpc),(scr zagg invisishield glass gal8 curved),0.06028,0.199763,7.025823
3,(scr zagg invisishield glass gal8 curved),(cla vzw usbc pd20 vpc),0.028433,0.423516,7.025823
4,(scr zagg invisibleshieldglass iphone7),(ppw mophie powerstation 6000 space grey),0.093841,0.108813,2.469419
5,(ppw mophie powerstation 6000 space grey),(scr zagg invisibleshieldglass iphone7),0.044064,0.231732,2.469419


In [5]:
# Show only rules with lift >= 1 and confidence >= 0.6.
# (these thresholds are to be revisited with a larger dataset)
rules.loc[rules['lift'].ge(1) & rules['confidence'].ge(0.6)]

Unnamed: 0,antecedants,consequents,support,confidence,lift


In [6]:
# Total summed sales_qty for this item across every invoice in the basket.
basket.loc[:, 'scr zagg invisishield glass gal8 curved'].sum()

4689.0