In [1]:
import pandas as pd
import numpy as np

# Importing Data

In [2]:
# Information about departments 
names = ['dept', 'deptdesc', 'trash']
use = names[0:2]
dept_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/deptinfo.csv', sep= ",",
                     header=None, names=names, usecols=use)

In [3]:
# Information about stores
names = ['store', 'city', 'state', 'zip', 'trash']
use = names[0:4]
store_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/strinfo.csv', sep= ",",
                     header=None, names=names, usecols=use)

In [4]:
# Information about individual SKUs 
names = ['sku', 'dept', 'classid', 'upc', 'style', 'color', 'size', 'packsize', 'vendor', 'brand', 'trash']
use = ['sku', 'dept', 'vendor', 'brand']
sku_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/skuinfo.csv', sep= ",",
                     header=None, names=names, usecols=use, dtype={'vendor': str, 'brand': str})

In [5]:
# Information about what stores have what SKUs 
names = ['sku', 'store', 'cost', 'retail', 'trash']
use = ['sku', 'store', 'cost', 'retail']
skst_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/skstinfo.csv', sep=',', 
                      header=None, names=names, usecols=use)

In [6]:
# Information about transactions
names = ['sku', 'store', 'register', 'tran', 'seq', 'date', 'stype', 'trash', 'quantity', 'unsure2', 'unsure3', 
         'interid', 'mic', 'trash2']
use = ['sku', 'store', 'register', 'tran', 'date']

transact_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/dillards/DillardsPOS/trnsact.csv', sep= ",",
                     header=None, names=names, usecols=use)

# Data Exploration

### Departments

In [7]:
dept_df

Unnamed: 0,dept,deptdesc
0,800,CLINIQUE
1,801,LESLIE
2,1100,GARY F
3,1107,JACQUES
4,1202,CABERN
5,1301,BE2
6,1704,R LAUREN
7,1905,R & Y
8,2102,CAB
9,2105,R TAYLOR


In [8]:
num_dept = dept_df.shape[0]
print(f'Therea are {num_dept} different departments')

Therea are 60 different departments


### Stores

In [9]:
number_stores = store_df.shape[0]
num_states = len(set(store_df.state))
top_five = store_df.state.value_counts()[0:5]

In [10]:
print(f'There are {number_stores} stores')
print(f'They are in {num_states} different states')
print(f'The five states where there are the most Dillards are \n \nState Count \n{top_five}')

There are 453 stores
They are in 31 different states
The five states where there are the most Dillards are 
 
State Count 
TX    79
FL    48
AR    27
AZ    26
OH    25
Name: state, dtype: int64


### SKUs

In [11]:
number_skus = sku_df.shape[0]
number_vendors = len(set(sku_df.vendor))
number_brands = len(set(sku_df.brand))

In [12]:
print(f'There are {number_skus} different SKUs')
print(f'They come from {number_vendors} different vendors')
print(f'Where in total there are {number_brands} different brands')

There are 1564178 different SKUs
They come from 2393 different vendors
Where in total there are 1960 different brands


### Store SKUs 

In [13]:
# Create a column of profit margins 
skst_df['profit'] = skst_df['retail'] - skst_df['cost']

In [14]:
# Sorting by highest profit SKUs 
skst_df.groupby('sku').mean().sort_values(by='profit', ascending=False)

Unnamed: 0_level_0,store,cost,retail,profit
sku,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1811281,3354.833333,1872.54,5850.00,3977.46
1220149,1607.000000,1462.56,5280.00,3817.44
6200173,1607.000000,2700.00,6017.00,3317.00
5480674,1607.000000,1536.72,4776.00,3239.28
5240674,1607.000000,1536.72,4776.00,3239.28
...,...,...,...,...
9738777,5604.000000,660.00,191.76,-468.24
8759889,8409.000000,912.00,395.70,-516.30
2809844,8555.153846,900.00,269.82,-630.18
1749971,8402.000000,912.00,239.76,-672.24


In [15]:
# Looking at different profit values
skst_df.groupby('sku').mean().describe()['profit']

count    760212.000000
mean         20.019637
std          77.545588
min        -710.320000
25%          -3.756502
50%           4.294452
75%          22.785820
max        3977.460000
Name: profit, dtype: float64

In [16]:
# Getting average profit for SKUs across all stores they're in
avg_profit = skst_df.groupby('sku').mean()['profit'].mean().round(2)

In [17]:
print(f'The average profit margin is ${avg_profit}')

The average profit margin is $20.02


### Transactions

In [18]:
# Transactions 
# Casting datetimes as dates
transact_df.date = pd.to_datetime(transact_df.date)

In [19]:
# Getting the range of dates we're analyzing 
# Note, this command takes very long to run
earliest_date = min(transact_df['date']).date()
latest_date = max(transact_df['date']).date()

In [20]:
# Creating a new index which we can group by to get baskets 
index = transact_df['store'] + transact_df['register'] + transact_df['tran'] + transact_df['date']
transact_df['index'] = index

In [21]:
num_basket = len(transact_df.groupby('index').index)

In [22]:
num_tran = transact_df.shape[0]

In [23]:
print(f"We're looking at dates between {earliest_date} and {latest_date}")
print(f"We have {num_tran} total transactions")
print(f"We have {num_basket} total baskets")

We're looking at dates between 2004-08-01 and 2005-08-27
We have 120916896 total transactions
We have 5028079 total baskets


# Subsetting Data of Interest

In [24]:
# Looking only at transactions made in december
dec_df = transact_df[transact_df['date'].dt.month == 12]

In [25]:
# Getting the 1000 most popular SKUs from those transactions
top_sku = dec_df['sku'].value_counts()[0:1000].index.to_list()

In [26]:
# Looking at only those 1000 SKUs
tdf = dec_df[dec_df['sku'].isin(top_sku)]

In [27]:
# Getting dummy variables for all different SKUs 
onehot = pd.get_dummies(tdf['sku'], prefix='sku')

In [28]:
# Concating the dummy variable to each line item in the transaction df
df = pd.concat([tdf, onehot], axis=1)

In [33]:
df

Unnamed: 0,index,sku_7915,sku_9633,sku_19633,sku_29633,sku_39633,sku_47738,sku_59633,sku_86774,sku_106343,...,sku_9883749,sku_9888506,sku_9896223,sku_9911900,sku_9964307,sku_9978362,sku_9992306,sku_9997198,sku_9999170,sku_9999950
65005,2004-12-30 00:00:00.000000533,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65009,2004-12-31 00:00:00.000001833,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65011,2004-12-30 00:00:00.000002733,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65023,2004-12-29 00:00:00.000003023,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65032,2004-12-18 00:00:00.000003163,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120916659,2004-12-09 00:00:00.000013849,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
120916704,2004-12-05 00:00:00.000013324,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
120916713,2004-12-15 00:00:00.000015639,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
120916714,2004-12-15 00:00:00.000015639,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [30]:
# Removing the unneccesary columns
df.drop(['sku', 'store', 'register', 'tran', 'date'], axis=1, inplace=True)

In [31]:
# Important - here I sum over all indices so I can create the baskets. A basket will have a 0 for all items it 
# doesn't have and a 1 for all items it does 
finaldf = df.groupby(['index']).sum()

In [32]:
# Reset the index
finaldf.reset_index()

Unnamed: 0,index,sku_7915,sku_9633,sku_19633,sku_29633,sku_39633,sku_47738,sku_59633,sku_86774,sku_106343,...,sku_9883749,sku_9888506,sku_9896223,sku_9911900,sku_9964307,sku_9978362,sku_9992306,sku_9997198,sku_9999170,sku_9999950
0,2004-12-01 00:00:00.000000333,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2004-12-01 00:00:00.000000373,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2004-12-01 00:00:00.000000410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2004-12-01 00:00:00.000000420,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2004-12-01 00:00:00.000000464,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334156,2004-12-31 00:00:00.000077680,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
334157,2004-12-31 00:00:00.000092746,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
334158,2004-12-31 00:00:00.000099600,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
334159,2004-12-31 00:00:00.000099926,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
