### Change this stuff

In [1]:
# choose the last day from which you want historical data
date_end = '2019-05-31'

# do you want to run the script AND write the CSV? or just run the script?
# the script will run regardless of what you choose here.
# choose "yes" or "no"
excel_write = 'no'

### Libraries

In [2]:
import os
tilde = os.path.expanduser('~')

import sys
sys.path.insert(0, tilde + '/Scripts/Fake Folder/Python Libraries')

from jb_libraries import *
%matplotlib inline

### Script Settings

In [3]:
six_months_ago = str((pd.to_datetime(date_end) - pd.DateOffset(months = 5)).date())
six_months_ago = six_months_ago[:7] + '-01'

three_months_ago = str((pd.to_datetime(date_end) - pd.DateOffset(months = 2)).date())
three_months_ago = three_months_ago[:7] + '-01'

csv_path = tilde + '/Scripts/Fake Folder/Fab/Projects/War Chest/CSVs/'

statuses = ['working', 'deprecated', 'pending', 'sample']

### Get Fab and Kitting skus

In [4]:
skus_main = pd.read_sql(
'''
SELECT
sku_date_modified,
sku_id,
k.part_id,
pd.products_name,
bom_type,
p.products_dont_sell_alone,
p.products_carrot_only,
p.products_meta_type,
p.products_coming_soon,
p.discontinue_status,
p.products_status,
p.products_stripes,
p.products_discontinued,
sku_outsourced_assembly
FROM skus k
JOIN products_description pd ON k.part_id = pd.part_id
JOIN parts p ON k.part_id = p.part_id
AND sku_status IN ''' + str(tuple(statuses)) + '''
''', db)

col_fix(skus_main)

for col in skus_main.columns:
    if skus_main[col].dtype == 'O':
        skus_main[col] = skus_main[col].str.lower()
        
# turn "resale - rapid prep" and "resale - no labor" into just "retail"
skus_main['bom type'] = skus_main['bom type'].str.split(' ', expand = True)[0]

### Flag outsourced, not, or both

In [5]:
a = skus_main.groupby('part id')[['sku outsourced assembly']].sum()

def outsourced(x):
    
    val = x['sku outsourced assembly']
    
    if val == 0:
        ret = 'not outsourced'
    elif val == 1:
        ret = 'outsourced'
    else:
        ret = 'both'
    return ret

a['outsourced'] = a.apply(outsourced, axis = 1)

print('proportions')
vc = a['outsourced'].value_counts()
print(vc/vc.sum())

dict_ = dict(a['outsourced'])

skus_main['outsourced'] = skus_main['part id'].map(dict_)

skus_main.drop('sku outsourced assembly', 1, inplace = True)

proportions
not outsourced    0.881746
outsourced        0.112592
both              0.005663
Name: outsourced, dtype: float64


### Shift data from sku level to part level

In [6]:
# if there is more than one sku id per part id, then put them on one line
df1 = pd.DataFrame(skus_main.groupby('part id')['sku id'].apply(lambda x: ', '.join(x.map(str))))
skus_main['all sku ids'] = skus_main['part id'].map(dict(zip(df1.index.to_series(), df1['sku id'])))

# if there is more than one bom type per part id, then put them on one line
df2 = pd.DataFrame(skus_main.drop_duplicates(['part id','bom type']).groupby('part id')['bom type'].apply(lambda x: ', '.join(x.map(str))))
skus_main['all bom types'] = skus_main['part id'].map(dict(zip(df2.index.to_series(), df2['bom type'])))

skus_main.sort_values(['part id','sku date modified'], inplace = True)
skus_main.drop_duplicates('part id', keep = 'last', inplace = True)

skus_main.drop(['sku id','bom type'],1,inplace = True)

### Modify BOMs
Some parts have multiple BOMs, for simplicity let's create a single BOM per part

In [39]:
# in order of importance we'll do pnp, kitting, then resale

def bom_fix(df):
    if 'pnp' in df['all bom types']:
        return 'pnp'
    elif 'kitted' in df['all bom types']:
        return 'kitted'
    elif 'resale' in df['all bom types']:
        return 'resale'
    else:
        return df['all bom types']
    
skus_main['all bom types'] = skus_main.apply(bom_fix, axis = 1)
skus_main.rename(columns = {'all bom types':'bom type'}, inplace = True)

### Get all sales data

In [7]:
pd.read_sql(
'''
SELECT
*
FROM orders_status
ORDER BY orders_status_id
''', db)

Unnamed: 0,orders_status_id,language_id,orders_status_name
0,1,1,Pending
1,2,1,Processing
2,3,1,Shipped
3,4,1,Update
4,5,1,Printed
5,6,1,Billed
6,7,1,Payment Received
7,8,1,Fraud - Pending
8,9,1,Fraud - Confirmed
9,10,1,Return


In [8]:
s = dt.datetime.now()

sales_super_main = pd.read_sql(
'''
SELECT
DATE(o.date_purchased) AS date_purchased,
DATE_FORMAT(o.date_purchased, '%Y-%m') AS year_and_month,
op.orders_id,
op.part_id,
op.products_quantity AS qty_total, 
op.products_price
FROM orders_products op
JOIN orders o ON op.orders_id = o.orders_id
# fraud - pending, fraud - confirmed, return, replaced defective, refunded defective, voided, fraud - void
AND o.orders_status NOT IN (8,9,10,11,12,14,15)
AND DATE(o.date_purchased) <= ' '''+ date_end +''' '
WHERE op.part_id IN '''+ str(tuple(skus_main['part id'].tolist())) +''' # get all valid skus
OR op.part_id IN (SELECT part_id FROM products_to_stuff WHERE part_id > 0) # get all combos
''', db)

col_fix(sales_super_main)

# change to datetime
sales_super_main['date purchased'] = pd.to_datetime(sales_super_main['date purchased'])

d1 = sales_super_main['date purchased'].min().date()
d2 = sales_super_main['date purchased'].max().date()

print('data is from {} to {}\n'.format(d1,d2))

e = dt.datetime.now()
print(e-s)

data is from 2005-10-21 to 2019-05-31

0:05:16.416761


### Make a copy

In [40]:
sales_main = sales_super_main.copy()

### Get the combos that some parts are contained in
We want to forecast the number of parts sold. If Part A is sold alone and there are x2 units in Combo Z, and if we sell x1 unit of Part A and x1 unit of Combo Z, then we are really selling x3 units of Part A. We need to make this adjustment to our data.

In [41]:
pts_main = pd.read_sql(
'''
SELECT
part_id AS combo_part_id,
contains_part_id AS part_id,
pts_quantity
FROM products_to_stuff
# get combos only
WHERE part_id > 0
''', db)

col_fix(pts_main)

# flag "parts" or "combos"
sales_main['part type'] = np.where(sales_main['part id'].isin(pts_main['combo part id'].tolist()), 'combo','part')

### Seperate out and remove combos from sales data

In [42]:
combos = sales_main[sales_main['part type'] == 'combo'].copy()

combos.rename(columns = {'part id':'combo part id',
                         'qty total':'combo qty total'}, inplace = True)

### "Unpack" combos into their parts

In [43]:
# merge products_to_stuff, which contains the combo_part_id and all of the parts that that combo is made of
combos = pd.merge(combos, pts_main, how = 'left', on = 'combo part id')

# continuing the logic mentioned a few sections above:
    # if a combo contains x2 units of Part A, and we sell x2 units of this combo, then we have really sold x4 units of Part A.
    # make this adjustment here
combos['qty total'] = combos['combo qty total'] * combos['pts quantity']

# the price as it stands now in the combos dataframe is the price of the combo
# but we just unpacked each combo into it's parts
# so we need to overwrite the price of the combo with the price of each part
price = pd.read_sql(
'''
SELECT
part_id,
products_price
FROM parts
''', db)

col_fix(price)

combos['products price'] = combos['part id'].map(dict(zip(price['part id'], price['products price'])))

# check nulls
n = combos[combos.isnull().any(1)]
if n.empty:
    print('no nulls')
else:
    print('you have %i nulls' % len(n))
    display(n.head())
    raise 0

no nulls


### Check out this example
Valid as of 06/21/2019. Use the random OID below.

In [44]:
oid = 2000914

This order consists of x1 unique combo and x3 unique parts

In [45]:
sales_main[sales_main['orders id'] == oid]

Unnamed: 0,date purchased,year and month,orders id,part id,qty total,products price,part type
6787981,2019-03-10,2019-03,2000914,1405,100.0,24.47,combo
6787982,2019-03-10,2019-03,2000914,1247,19.0,11.96,part
6787983,2019-03-10,2019-03,2000914,1126,100.0,2.77,part
6787984,2019-03-10,2019-03,2000914,1170,100.0,1.37,part


Zoom into the combo, which is PN 1405. This combo conists of many parts. Take note below of the changes we made, specifically the products_price (which is now NOT the price of the combo, but the price of each part that this combo is made of), and the qty_total (which is combo_qty_total * pts_quantity).

Going further with the qty_total, zoom into PN 617 below. Combo PN 1405 contains x2 units of PN 617. This person bought x100 units of Combo PN 1407, which means they bought 100 * 2 = 200 units of PN 617.

In [46]:
combos[combos['orders id'] == oid]

Unnamed: 0,date purchased,year and month,orders id,combo part id,combo qty total,products price,part type,part id,pts quantity,qty total
809853,2019-03-10,2019-03,2000914,1405,100.0,14.95,combo,659,1.0,100.0
809854,2019-03-10,2019-03,2000914,1405,100.0,7.95,combo,1260,1.0,100.0
809855,2019-03-10,2019-03,2000914,1405,100.0,1.95,combo,727,1.0,100.0
809856,2019-03-10,2019-03,2000914,1405,100.0,9.95,combo,641,1.0,100.0
809857,2019-03-10,2019-03,2000914,1405,100.0,1.95,combo,615,1.0,100.0
809858,2019-03-10,2019-03,2000914,1405,100.0,1.95,combo,1131,1.0,100.0
809859,2019-03-10,2019-03,2000914,1405,100.0,3.95,combo,1008,1.0,100.0
809860,2019-03-10,2019-03,2000914,1405,100.0,2.95,combo,592,1.0,100.0
809861,2019-03-10,2019-03,2000914,1405,100.0,0.95,combo,617,2.0,200.0


### Merge combos dataframe with sales dataframe

In [47]:
# drop combos from sales_main
sales_main.drop(sales_main[sales_main['part type'] == 'combo'].index, inplace = True)

# structure combos to match sales_main
s1 = set(combos.columns)
s2 = set(sales_main.columns)
s3 = list(s1-s2)

# drop any columns from combos which are not in sales_main
combos.drop(s3, 1, inplace = True)

# concat combos and sales_main
sales_main = pd.concat([sales_main, combos], sort = False)
sales_main.reset_index(inplace = True, drop = True)

# drop this column
sales_main.drop('part type', 1, inplace = True)

### Exclude some parts
Because they possess at least one of the attributes below

In [54]:
def exclude_flag(x):
    if (x['products dont sell alone'] == 1
        or x['products carrot only'] == 1
        or x['products meta type'] == 'base product'
        or x['products coming soon'] == 1
        or x['discontinue status'] == 'discontinued'
        or (x['products status'] == 1 and x['products stripes'] != 0)
        or x['bom type'] == 'unspecified'):
        res = 'yes'
    else:
        res = 'no'
        
    return res
        
skus_main['exclude'] = skus_main.apply(exclude_flag, axis = 1)

ls = skus_main[skus_main['exclude'] == 'yes']['part id'].tolist()
b = len(skus_main)

print('we will exclude {:,.0f} skus out of {:,.0f}, which is {:,.2f}%'.format(len(ls), b, len(ls)/b*100))

sales_main.drop(sales_main[sales_main['part id'].isin(ls)].index, inplace = True)
sales_main.reset_index(inplace = True, drop = True)

we will exclude 94 skus out of 3,002, which is 3.13%


### Map some sku data to sales data

In [55]:
ls = ['products name',
      'outsourced',
      'all sku ids',
      'bom type']

for col in ls:
    sales_main[col] = sales_main['part id'].map(dict(zip(skus_main['part id'], skus_main[col])))

### Check BOM proportions

In [56]:
vc = sales_main['bom type'].value_counts()
vc/vc.sum()

resale    0.611293
pnp       0.212029
kitted    0.176677
Name: bom type, dtype: float64

### Determine popularity tiers
Use the latest three months worth of data to determine this. Use the data before we unpacked the combos.

In [58]:
# get the latest three months of data
a = sales_super_main[sales_super_main['date purchased'] >= three_months_ago].copy()

# drop combos
a.drop(a[a['part id'].isin(list(set(pts_main['combo part id'])))].index, inplace = True)
a.reset_index(inplace = True, drop = True)

# map bom types
a['bom type'] = a['part id'].map(dict(zip(sales_main['part id'], sales_main['bom type'])))

# get revenue
a['revenue'] = a['qty total'] * a['products price']

# get totals by bom and part_id
b = a.groupby(['bom type','part id'])[['qty total','revenue']].sum()

In [82]:
alpha = 0.15

for bom in set(b.index.get_level_values(0)):
    
    # isolate the BOM
    df = b[b.index.get_level_values(0) == bom].copy()
    df.reset_index(inplace = True)
    
    # get top revenue and top qty
    q1 = df['qty total'].quantile(1-alpha)
    q2 = df['revenue'].quantile(1-alpha)
    
    # get key parts which are parts we sell the most of AND make the most money on
    key = df[(df['qty total'] >= q1) & (df['revenue'] >= q2)]
    
    # remove the key parts
    df.drop(df[df['part id'].isin(key['part id'].tolist())].index, inplace = True)
    df.reset_index(drop = True, inplace = True)
    
    # now get the high (above 75%), med (between 50% and 75%), and low (below 50%)
    ls1 = [0.75, 0.50]
    ls2 = []
    for p in ls1:
        val = df['qty total'].quantile(p)
        ls2.append(val)
        
    high = df[df['qty total'] >= ls2[0]]
    med = df[np.logical_and(df['qty total'] < ls2[0], df['qty total'] >= ls2[1])]
    low = df[df['qty total'] < ls2[1]]
    
    for x in [key, high, med, low]:
        

In [133]:
np.random.seed(1)
ls1 = np.random.randint(1,20,7)
ls2 = np.random.randint(1,10,7)

In [138]:
ix = ['mon','tue','wed','thu','fri','sat','sun']
df = pd.DataFrame([ls1,ls2]).T
df.index = [ix]
df.columns = ['credit','debit']
df['net'] = df['credit'] - df['debit']

In [141]:
fmt = ['m0','m0','m0']
df.format_(fmt)

Unnamed: 0,Credit,Debit,Net
mon,$6,$1,$5
tue,$12,$1,$11
wed,$13,$2,$11
thu,$9,$8,$1
fri,$10,$7,$3
sat,$12,$3,$9
sun,$6,$5,$1


In [146]:
s1 = df['credit'].sum()
s2 = df['debit'].sum()

s2-s1

-41

In [None]:

# get totals by part id
b = a.groupby('part id')[['qty total','revenue']].sum()

# get top revenue and top qty
q1, q2 = b.quantile(0.90)

# get key parts, parts that we make the most on AND sell the most of
key = b[(b['revenue'] >= q1) & (b['qty total'] >= q2)].index.tolist()

# drop key parts
a.drop(a[a['part id'].isin(key)].index, inplace = True)
a.reset_index(drop = True, inplace = True)

# groupby part_id and sum qty
c = a.groupby('part id')[['qty total']].sum()

# get the cutoffs for high (75% and above), med (between 50% and 75%), and low (below 50%) parts
ls = [0.75, 0.50]
ls2 = []
for p in ls:
    ls2.append(c.quantile(p).item())

In [137]:
b[(b['revenue'] >= q1) & (b['qty total'] >= q2)]

Unnamed: 0_level_0,qty total,revenue
part id,Unnamed: 1_level_1,Unnamed: 2_level_1
161,17377.0,12162.56
909,9224.0,30240.74
1201,14063.0,20809.33
1609,14990.0,66380.19
1675,12647.0,24921.26
1995,9546.0,64211.25
3333,9834.0,213818.11


In [138]:
sales_main[sales_main['part id'].isin(key)].drop_duplicates('part id')['all bom types'].value_counts()

resale            4
kitted            1
resale, kitted    1
pnp               1
Name: all bom types, dtype: int64

In [133]:
key

[161, 1609]

### Determine popularity tiers
Use umbrella part id and mod qty total

In [None]:
print('popularity tiers are based on data from %s to %s' % (three_months_ago, date_end))

n = int(np.round((pd.to_datetime(date_end) - pd.to_datetime(three_months_ago)).days/30))

print('which is about {} months'.format(n))

In [None]:
# first get all sales data for the last 3 months
a = sales_clean[sales_clean['date purchased'].between(three_months_ago, date_end)]

# get the top percentile of revenue for the key parts
b = a.groupby(['bom type','umbrella part id'])[['revenue','qty total']].sum().reset_index()
key1 = b.groupby(['bom type'])[['revenue']].quantile(0.85)
key2 = b.groupby(['bom type'])[['qty total']].quantile(0.85)

dict_ = {}
ls = []
for index, row in b.iterrows():
    if row['revenue'] >= key1.loc[row['bom type']]['revenue'] and row['qty total'] >= key2.loc[row['bom type']]['qty total']:
        dict_[row['umbrella part id']] = '01 - key'
        
# now from the main df drop all key parts
c = a.drop(a[a['umbrella part id'].isin(list(dict_.keys()))].index)

# for the rest of the popularity tiers we'll use the mod qty total
d = c.groupby(['bom type','umbrella part id'])[['mod qty total']].sum().reset_index()
qty = d.groupby('bom type')[['mod qty total']].describe()
qty.columns = qty.columns.droplevel(0)

# flag all other popularity tiers
for index, row in d.iterrows():
    if row['mod qty total'] >= qty.loc[row['bom type']]['75%']:
        ret = '02 - high'
    elif row['mod qty total'] >= qty.loc[row['bom type']]['50%'] and row['mod qty total'] < qty.loc[row['bom type']]['75%']:
        ret = '03 - med'
    else:
        ret = '04 - low'
    
    dict_[row['umbrella part id']] = ret
    
# map popularity tiers to main df    
sales_clean['popularity tier'] = sales_clean['umbrella part id'].map(dict_)    

# any part id not sold in the last three months, label with "not enough data"
sales_clean['popularity tier'].fillna('05 - not enough data', inplace = True)

In [None]:
check = sales_clean.drop_duplicates('umbrella part id').groupby(['popularity tier','bom type'])[['popularity tier']].count().unstack(1)
check['row total'] = check.sum(1)
check.loc['column total'] = check.sum()

In [None]:
check

### Write Excluded parts to Excel, for R script

In [None]:
ex = skus_main[skus_main['exclude'] == 'yes'].copy()
ls = ['sku date modified',
      'sku id',
      'exclude']
ex.drop(ls, 1, inplace = True)
ex.sort_values(['bom type','part id'], inplace = True)

if excel_write == 'yes':
    writer = pd.ExcelWriter(csv_path + 'Excluded Parts.xlsx', engine = 'xlsxwriter')
    ex.to_excel(writer, 'parts excluded', index = False)
    writer.save()

### Prepare data for forecasting

In [None]:
for_forecast = pd.DataFrame()

for pn in list(set(sales_clean['umbrella part id'])):
    
    # get qty per month
    df1 = sales_clean[sales_clean['umbrella part id'] == pn].groupby('year and month')[['mod qty total']].sum()

    # fill in months where no one bought anything
    # make the final year and month the date_end of this script
    d1 = df1.index.min()
    d2 = date_end[:7]
    
    dates = pd.date_range(d1, d2, freq = 'MS')
    
    # make it so that every PN has at least 6 months of data
    n = 6
    l = len(dates)
    if l < n:
        new_d1 = pd.to_datetime(d1) - pd.DateOffset(months = (n-l))
        new_d1 = str(new_d1)[:7]
        dates = pd.date_range(new_d1, d2, freq = 'MS')
    
    dates = pd.DataFrame(dates, columns = ['year and month']).set_index('year and month')
    # join them
    df2 = dates.join(df1).fillna(0) 
    df2['umbrella part id'] = pn
    df2.reset_index(inplace = True)
    
    # create one big dataframe, for the forecast
    for_forecast = for_forecast.append(df2, ignore_index = True)

# map popularity tier and bom
for col in ['bom type','popularity tier']:
    for_forecast[col] = for_forecast['umbrella part id'].map(dict(zip(sales_clean['umbrella part id'], sales_clean[col])))
    
# rename these for a layman to understand
for_forecast.rename(columns = {'umbrella part id':'part id',
                              'mod qty total':'qty total'}, inplace = True)

### Get sku ids and part names

In [None]:
s = pd.read_sql(
'''
SELECT
k.sku_id,
k.part_id,
pd.products_name
FROM skus k
JOIN products_description pd ON k.part_id = pd.part_id
WHERE k.sku_status IN ''' + str(tuple(statuses)) + '''
AND k.part_id IN '''+ str(tuple(for_forecast['part id'].tolist())) +'''
''', db)

col_fix(s)

In [None]:
for col in ['sku id','products name']:
    for_forecast[col] = for_forecast['part id'].map(dict(zip(s['part id'], s[col])))

### Check nulls

In [None]:
n = np.sum(for_forecast.isnull().any(1))
if n == 0:
    pass
else:
    raise 0

### Check count of data points per PN

In [None]:
c = np.sum(for_forecast.groupby('part id')[['year and month']].count() < n).values[0]
if c == 0:
    pass
else:
    raise 0

### Check unique PN count

In [None]:
v1 = len(set(for_forecast['part id']))
v2 = len(set(sales_clean['umbrella part id']))
if v1 == v2:
    print('match')
    print('the total unique umbrella part id count is {}'.format(v1))
else:
    raise 0

### Check quantities

In [None]:
q1 = for_forecast['qty total'].sum()
q2 = sales_clean['mod qty total'].sum()

if np.abs(q1 - q2) < 0.01:
    pass
else:
    raise 0
    
# fill zeros with 0.1, to ease forecasting
for_forecast['qty total'] = np.where(for_forecast['qty total'] == 0, 0.1, for_forecast['qty total'])        

### Map sku and part data

In [None]:
cols = skus_main.columns.tolist()
for x in ['part id','products name']:
    cols.remove(x)

for c in cols:
    for_forecast[c] = for_forecast['part id'].map(dict(zip(skus_main['part id'], skus_main[c])))

### To Excel, for R

In [None]:
if excel_write == 'yes':
    
    writer = pd.ExcelWriter(csv_path + 'War Chest Part sales for R.xlsx', engine = 'xlsxwriter')
    for_forecast.to_excel(writer, 'data', index = False)
    writer.save()

In [None]:
print('done')