In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Importing Data

In [None]:
# Information about departments 
names = ['dept', 'deptdesc', 'trash']
use = names[0:2]
dept_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/deptinfo.csv', sep= ",",
                     header=None, names=names, usecols=use)

In [None]:
# Information about stores
names = ['store', 'city', 'state', 'zip', 'trash']
use = names[0:4]
store_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/strinfo.csv', sep= ",",
                     header=None, names=names, usecols=use)

In [None]:
# Information about individual SKUs 
names = ['sku', 'dept', 'classid', 'upc', 'style', 'color', 'size', 'packsize', 'vendor', 'brand', 'trash']
use = ['sku', 'dept', 'vendor', 'brand']
sku_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/skuinfo.csv', sep= ",",
                     header=None, names=names, usecols=use, dtype={'vendor': str, 'brand': str})

In [None]:
# Information about what stores have what SKUs 
names = ['sku', 'store', 'cost', 'retail', 'trash']
use = ['sku', 'store', 'cost', 'retail']
skst_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/skstinfo.csv', sep=',', 
                      header=None, names=names, usecols=use)

In [2]:
# Information about transactions
names = ['sku', 'store', 'register', 'tran', 'seq', 'date', 'stype', 'trash', 'quantity', 'unsure2', 'unsure3', 
         'interid', 'mic', 'trash2']
use = ['sku', 'store', 'register', 'tran', 'date']

transact_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/trnsact.csv', sep= ",",
                     header=None, names=names, usecols=use)

# Data Exploration

### Departments

In [None]:
dept_df

In [None]:
num_dept = dept_df.shape[0]
print(f'There are {num_dept} different departments')

### Stores

In [None]:
number_stores = store_df.shape[0]
num_states = len(set(store_df.state))
top_five = store_df.state.value_counts()[0:5]

In [None]:
print(f'There are {number_stores} stores')
print(f'They are in {num_states} different states')
print(f'The five states where there are the most Dillards are \n \nState Count \n{top_five}')

### SKUs

In [None]:
number_skus = sku_df.shape[0]
number_vendors = len(set(sku_df.vendor))
number_brands = len(set(sku_df.brand))

In [None]:
print(f'There are {number_skus} different SKUs')
print(f'They come from {number_vendors} different vendors')
print(f'Where in total there are {number_brands} different brands')

### Store SKUs 

In [None]:
# Create a column of profit margins 
skst_df['profit'] = skst_df['retail'] - skst_df['cost']

In [None]:
# Sorting by highest profit SKUs 
skst_df.groupby('sku').mean().sort_values(by='profit', ascending=False)

In [None]:
# Looking at different profit values
skst_df.groupby('sku').mean().describe()['profit']

In [None]:
# Getting average profit for SKUs across all stores they're in
avg_profit = skst_df.groupby('sku').mean()['profit'].mean().round(2)

In [None]:
print(f'The average profit margin is ${avg_profit}')

### Transactions

In [3]:
# Transactions 
# Casting datetimes as dates
transact_df.date = pd.to_datetime(transact_df.date)

In [None]:
# Getting the range of dates we're analyzing 
# Note, this command takes very long to run
earliest_date = min(transact_df['date']).date()
latest_date = max(transact_df['date']).date()

In [4]:
# Creating a new index which we can group by to get baskets 
transact_df['index'] = transact_df['store'] + transact_df['register'] + transact_df['tran'] + transact_df['date']

In [None]:
# Here, I use a hash function to convert the index to a number, allowing for faster processing times 
# objectid = pd.util.hash_pandas_object(index)
# transact_df['index'] = objectid

In [None]:
num_basket = len(transact_df.groupby('index').index)

In [None]:
num_tran = transact_df.shape[0]

In [None]:
print(f"We're looking at dates between {earliest_date} and {latest_date}")
print(f"We have {num_tran} total transactions")
print(f"We have {num_basket} total baskets")

# Subsetting Data of Interest

In [5]:
# Looking only at transactions made in december
dec_df = transact_df[transact_df['date'].dt.month == 12]

In [7]:
# Getting the 1000 most popular SKUs from those transactions
top_sku = dec_df['sku'].value_counts()[0:200].index.to_list()

In [8]:
# Looking at only those 1000 SKUs for those months
tdf = dec_df[dec_df['sku'].isin(top_sku)]

In [9]:
# Getting dummy variables for all different SKUs 
onehot = pd.get_dummies(tdf['sku'], prefix='sku')

In [10]:
# Concating the dummy variable to each line item in the transaction df
df = pd.concat([tdf, onehot], axis=1)

In [11]:
# Removing the unneccesary columns, but leave index
df.drop(['sku', 'store', 'register', 'tran', 'date'], axis=1, inplace=True)

In [12]:
df

Unnamed: 0,index,sku_173088,sku_176136,sku_208362,sku_247668,sku_264715,sku_279151,sku_297668,sku_348498,sku_426672,...,sku_9656021,sku_9666021,sku_9667426,sku_9702306,sku_9708505,sku_9722306,sku_9752306,sku_9786649,sku_9817723,sku_9896223
2105437,2004-12-16 00:00:00.000001312,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2105438,2004-12-24 00:00:00.000002512,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2105439,2004-12-24 00:00:00.000002612,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2105440,2004-12-23 00:00:00.000004312,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2105463,2004-12-26 00:00:00.000001222,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119709661,2004-12-05 00:00:00.000013519,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
119709663,2004-12-05 00:00:00.000012139,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
119709664,2004-12-05 00:00:00.000012939,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
119709665,2004-12-11 00:00:00.000015439,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Important - here I sum over all indices so I can create the baskets. A basket will have a 0 for all items it 
# doesn't have and a 1 for all items it does 
finaldf = df.groupby(['index']).sum()

In [16]:
# Convert to booleans for faster loading times 
finaldf = finaldf.astype(dtype=bool)

# Creating Association Rules

In [46]:
# After testing, a minsup of 0.005 allows us to see many rules without crashing the kernel 
frequentItemsets = apriori(finaldf, min_support=0.005, use_colnames=True)

In [47]:
frequentItemsets

Unnamed: 0,support,itemsets
0,0.036046,(sku_173088)
1,0.011351,(sku_176136)
2,0.013843,(sku_208362)
3,0.028288,(sku_247668)
4,0.023679,(sku_264715)
...,...,...
251,0.012471,"(sku_6656135, sku_7596135)"
252,0.005330,"(sku_6656135, sku_7636135)"
253,0.005006,"(sku_7596135, sku_6706135)"
254,0.005146,"(sku_8166822, sku_8156822)"


In [48]:
rules = association_rules(frequentItemsets, metric="lift")

In [49]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(sku_803921),(sku_3524026),0.038214,0.075473,0.005910,0.154651,2.049092,0.003026,1.093663
1,(sku_3524026),(sku_803921),0.075473,0.038214,0.005910,0.078303,2.049092,0.003026,1.043495
2,(sku_803921),(sku_3978011),0.038214,0.052774,0.005244,0.137216,2.600083,0.003227,1.097872
3,(sku_3978011),(sku_803921),0.052774,0.038214,0.005244,0.099359,2.600083,0.003227,1.067890
4,(sku_803921),(sku_4108011),0.038214,0.072495,0.005298,0.138630,1.912278,0.002527,1.076779
...,...,...,...,...,...,...,...,...,...
113,"(sku_3524026, sku_3898011)",(sku_3978011),0.010411,0.052774,0.006252,0.600484,11.378468,0.005702,2.370936
114,"(sku_3978011, sku_3898011)",(sku_3524026),0.010808,0.075473,0.006252,0.578474,7.664661,0.005436,2.193285
115,(sku_3524026),"(sku_3978011, sku_3898011)",0.075473,0.010808,0.006252,0.082836,7.664661,0.005436,1.078534
116,(sku_3978011),"(sku_3524026, sku_3898011)",0.052774,0.010411,0.006252,0.118466,11.378468,0.005702,1.122576
