### Change this stuff

In [None]:
# choose the last day from which you want historical data
# (a 'YYYY-MM-DD' string: it is parsed with pd.to_datetime and also pasted
# into the sales query as a date literal)
date_end = '2019-05-31'

# do you want to run the script AND write the Excel workbook? or just run the script?
# the script will run regardless of what you choose here.
# choose "yes" or "no"
excel_write = 'no'

### Libraries

In [None]:
import os
tilde = os.path.expanduser('~')

import sys
sys.path.insert(0, tilde + '/Scripts/Fake Folder/Python Libraries')

from jb_libraries import *
%matplotlib inline

### Script Settings

In [None]:
# first day of the month that lies 5 (resp. 2) months before date_end, as a
# 'YYYY-MM-01' string; together with date_end's own month these windows span
# six and three calendar months
six_months_ago, three_months_ago = (
    (pd.to_datetime(date_end) - pd.DateOffset(months = back)).strftime('%Y-%m') + '-01'
    for back in (5, 2)
)

# folder that the output workbook is written to
csv_path = tilde + '/Scripts/Fake Folder/Fab/Projects/War Chest/CSVs/'

# sku statuses that are considered throughout the script
statuses = ['working', 'deprecated', 'pending', 'sample']

### Get Fab and Kitting skus

In [None]:
# one row per sku whose status is in `statuses`, with the product name and
# the part attributes that are later used to decide exclusions
skus_query = '''
SELECT
sku_date_modified,
sku_id,
k.part_id,
pd.products_name,
bom_type,
p.products_dont_sell_alone,
p.products_carrot_only,
p.products_meta_type,
p.products_coming_soon,
p.discontinue_status,
p.products_status,
p.products_stripes,
p.products_discontinued,
sku_outsourced_assembly,
k.sku_status
FROM skus k
LEFT JOIN products_description pd ON k.part_id = pd.part_id
LEFT JOIN parts p ON k.part_id = p.part_id
WHERE sku_status IN ''' + str(tuple(statuses)) + '''
'''

skus_main = pd.read_sql(skus_query, db)

# col_fix comes from jb_libraries; judging by the column names used below it
# presumably swaps underscores for spaces in the headers - confirm
col_fix(skus_main)

# lower-case every object-dtype (text) column so later comparisons are case-insensitive
text_cols = [name for name in skus_main.columns if skus_main[name].dtype == 'O']
for name in text_cols:
    skus_main[name] = skus_main[name].str.lower()

# keep only the first word of the bom type, so that
# "resale - rapid prep" and "resale - no labor" both become just "resale"
skus_main['bom type'] = skus_main['bom type'].str.split(' ', expand = True)[0]

### Flag outsourced, not, or both

In [None]:
# share of each part's skus that are flagged as outsourced assembly.
# assumes 'sku outsourced assembly' is a 0/1 flag - TODO confirm.
# the mean is exactly 0 when no sku is outsourced, exactly 1 when every sku is,
# and strictly in between when the part has both kinds. a plain sum cannot tell
# these apart once a part has more than one sku (two outsourced skus would sum
# to 2, and one outsourced sku out of three would sum to 1).
a = skus_main.groupby('part id')[['sku outsourced assembly']].mean()

def outsourced(x):
    """Label one part by its aggregated 'sku outsourced assembly' value.

    x is a row (anything indexable by column name). Returns 'not outsourced'
    when the value equals 0, 'outsourced' when it equals 1, and 'both' for
    any other value.
    """
    flag_value = x['sku outsourced assembly']

    if flag_value == 0:
        return 'not outsourced'
    if flag_value == 1:
        return 'outsourced'
    return 'both'

# label every part id
a['outsourced'] = a.apply(outsourced, axis = 1)

# show how the parts split across the labels
print('proportions')
vc = a['outsourced'].value_counts()
print(vc/vc.sum())

# `a` is indexed by part id, so mapping with the Series looks each part id up directly
skus_main['outsourced'] = skus_main['part id'].map(a['outsourced'])

# the raw flag is no longer needed; name the axis explicitly because a
# positional axis argument to drop() is rejected by pandas 2.x
skus_main.drop(columns = 'sku outsourced assembly', inplace = True)

### Shift data from sku level to part level

In [None]:
# if there is more than one sku id per part id, then put them on one line
# if there is more than one sku id per part id, then put them on one line
sku_ids_by_part = skus_main.groupby('part id')['sku id'].apply(lambda ids: ', '.join(ids.map(str)))
skus_main['all sku ids'] = skus_main['part id'].map(sku_ids_by_part)

# if there is more than one bom type per part id, then put them on one line
# (duplicates are dropped first so each bom type is listed once per part)
bom_types_by_part = (
    skus_main.drop_duplicates(['part id', 'bom type'])
    .groupby('part id')['bom type']
    .apply(lambda boms: ', '.join(boms.map(str)))
)
skus_main['all bom types'] = skus_main['part id'].map(bom_types_by_part)

# keep one row per part: the sku that was modified most recently
skus_main.sort_values(['part id', 'sku date modified'], inplace = True)
skus_main.drop_duplicates('part id', keep = 'last', inplace = True)

# the per-sku columns are replaced by the "all ..." columns above; the axis is
# named explicitly because pandas 2.x rejects a positional axis argument
skus_main.drop(columns = ['sku id', 'bom type'], inplace = True)

### Modify BOMs
Some parts have multiple BOMs; for simplicity, let's create a single BOM per part.

In [None]:
# in order of importance we'll do pnp, kitting, then resale

def bom_fix(df):
    """Collapse a part's combined bom types into a single bom type.

    df is a row (anything indexable by column name). The first of 'pnp',
    'kitted', 'resale' that occurs in df['all bom types'] is returned; when
    none of them occurs the value is returned unchanged.
    """
    combined = df['all bom types']

    # listed in priority order
    for preferred in ('pnp', 'kitted', 'resale'):
        if preferred in combined:
            return preferred

    return combined
    
# reduce each part's combined bom types to one value, then give the column
# its final name (renaming in place keeps the column where it is)
skus_main['all bom types'] = skus_main.apply(bom_fix, axis = 'columns')
skus_main.rename(columns = {'all bom types': 'bom type'}, inplace = True)

### Get all sales data

In [None]:
# reference only: list every row of orders_status, ordered by id. the result
# is not stored; it is just shown as the cell output. the sales query further
# down filters on o.orders_status ids.
pd.read_sql(
'''
SELECT
*
FROM orders_status
ORDER BY orders_status_id
''', db)

In [None]:
# time the query
s = dt.datetime.now()

# NOTE(review): str(tuple(...)) renders a one-element tuple as "(x,)" and an
# empty one as "()", neither of which is valid SQL - fine as long as
# skus_main holds at least two parts.
valid_part_ids = str(tuple(skus_main['part id'].tolist()))

# one row per order line, up to and including date_end, for every valid sku
# part and every combo. the parenthesis that groups the two "part_id IN"
# conditions is closed BEFORE the trailing "#" comment: MySQL treats
# everything after "#" as a comment, so a ")" placed after it is never seen.
sales_query = """
SELECT
DATE(o.date_purchased) AS date_purchased,
DATE_FORMAT(o.date_purchased, '%Y-%m') AS year_and_month,
op.orders_id,
op.part_id,
op.products_quantity AS qty_total,
op.products_price
FROM orders_products op
JOIN orders o ON op.orders_id = o.orders_id
# fraud - pending, fraud - confirmed, return, replaced defective, refunded defective, voided, fraud - void
AND o.orders_status NOT IN (8,9,10,11,12,14,15)
AND DATE(o.date_purchased) <= '""" + date_end + """'
WHERE (op.part_id IN """ + valid_part_ids + """ # get all valid skus
OR op.part_id IN (SELECT part_id FROM products_to_stuff WHERE part_id > 0)) # get all combos
AND op.part_id != 0
"""

sales_super_main = pd.read_sql(sales_query, db)

col_fix(sales_super_main)

# change to datetime
sales_super_main['date purchased'] = pd.to_datetime(sales_super_main['date purchased'])

# report the date range that actually came back
d1 = sales_super_main['date purchased'].min().date()
d2 = sales_super_main['date purchased'].max().date()

print('data is from {} to {}\n'.format(d1,d2))

# report how long the query took
e = dt.datetime.now()
print(e-s)

### Make a copy

In [None]:
# work on a copy so the raw query result in sales_super_main stays untouched
sales_main = sales_super_main.copy()

### Get the combos that some parts are contained in
We want to forecast the number of parts sold. If Part A is sold alone and there are x2 units in Combo Z, and if we sell x1 unit of Part A and x1 unit of Combo Z, then we are really selling x3 units of Part A. We need to make this adjustment to our data.

In [None]:
# every combo together with the parts it contains and the quantity of each
combo_query = '''
SELECT
part_id AS combo_part_id,
contains_part_id AS part_id,
pts_quantity
FROM products_to_stuff
# get combos only
WHERE part_id > 0
'''

pts_main = pd.read_sql(combo_query, db)

col_fix(pts_main)

# a sales line is a "combo" when its part id appears as a combo part id,
# otherwise it is a plain "part"
is_combo = sales_main['part id'].isin(pts_main['combo part id'].tolist())
sales_main['part type'] = np.where(is_combo, 'combo', 'part')

### Separate out and remove combos from sales data

In [None]:
# pull the combo lines into their own frame; the part id and quantity columns
# are renamed because after unpacking they describe the combo, not the part
combo_mask = sales_main['part type'] == 'combo'
combos = sales_main[combo_mask].rename(columns = {'part id':'combo part id',
                                                  'qty total':'combo qty total'})

### "Unpack" combos into their parts

In [None]:
# merge products_to_stuff, which contains the combo_part_id and all of the parts that that combo is made of
# (one output row per combo sales line per contained part)
combos = pd.merge(combos, pts_main, how = 'left', on = 'combo part id')

# continuing the logic mentioned a few sections above:
    # if a combo contains x2 units of Part A, and we sell x2 units of this combo, then we have really sold x4 units of Part A.
    # make this adjustment here
combos['qty total'] = combos['combo qty total'] * combos['pts quantity']

# the price as it stands now in the combos dataframe is the price of the combo
# but we just unpacked each combo into its parts
# so we need to overwrite the price of the combo with the price of each part
price = pd.read_sql(
'''
SELECT
part_id,
products_price
FROM parts
''', db)

col_fix(price)

combos['products price'] = combos['part id'].map(dict(zip(price['part id'], price['products price'])))

# check nulls: a null here means a combo without contained parts or a part
# without a price. the axis is passed by keyword because pandas 2.x no longer
# accepts it positionally.
null_rows = combos[combos.isnull().any(axis = 1)]
if null_rows.empty:
    print('no nulls')
else:
    print('you have %i nulls' % len(null_rows))
    display(null_rows.head())
    # stop here with a real exception ("raise 0" only fails with an unrelated TypeError)
    raise ValueError('combos has %i rows with nulls after unpacking' % len(null_rows))

### Check out this example
Valid as of 06/21/2019. Use the random OID below.

In [None]:
# orders id used for the walkthrough in the next cells
oid = 2000914

This order consists of x1 unique combo and x3 unique parts

In [None]:
# the example order as it appears in the sales data (combo lines are still in sales_main at this point)
sales_main[sales_main['orders id'] == oid]

Zoom into the combo, which is PN 1405. This combo consists of many parts. Take note below of the changes we made, specifically the products_price (which is now NOT the price of the combo, but the price of each part that this combo is made of), and the qty_total (which is combo_qty_total * pts_quantity).

Going further with the qty_total, zoom into PN 617 below. Combo PN 1405 contains x2 units of PN 617. This person bought x100 units of Combo PN 1405, which means they bought 100 * 2 = 200 units of PN 617.

In [None]:
# the combo lines of the example order after unpacking (one row per contained part)
combos[combos['orders id'] == oid]

### Merge combos dataframe with sales dataframe

In [None]:
# drop combos from sales_main
sales_main.drop(sales_main[sales_main['part type'] == 'combo'].index, inplace = True)

# structure combos to match sales_main
s1 = set(combos.columns)
s2 = set(sales_main.columns)
s3 = list(s1-s2)

# drop any columns from combos which are not in sales_main
combos.drop(s3, 1, inplace = True)

# concat combos and sales_main
sales_main = pd.concat([sales_main, combos], sort = False)
sales_main.reset_index(inplace = True, drop = True)

# drop this column
sales_main.drop('part type', 1, inplace = True)

### Exclude some parts
Because they possess at least one of the attributes below

In [None]:
# remind the reader which sku statuses are in scope
print('the sku statues we are considering are %s' % ', '.join(statuses))

def exclude_flag(x):
    """Return 'yes' when a part row has any attribute that excludes it, else 'no'.

    x is a row (anything indexable by column name). The attributes are checked
    in the order below and the first hit decides, so later columns are only
    read when every earlier check failed.
    """
    if x['products dont sell alone'] == 1:
        return 'yes'
    if x['products carrot only'] == 1:
        return 'yes'
    if x['products meta type'] == 'base product':
        return 'yes'
    if x['products coming soon'] == 1:
        return 'yes'
    if x['discontinue status'] == 'discontinued':
        return 'yes'
    # status 1 combined with a non-zero stripes value
    if x['products status'] == 1 and x['products stripes'] != 0:
        return 'yes'
    if x['bom type'] == 'unspecified':
        return 'yes'
    # `statuses` is the notebook-level list of sku statuses in scope
    if x['sku status'] not in statuses:
        return 'yes'

    return 'no'
        
skus_main['exclude'] = skus_main.apply(exclude_flag, axis = 1)

# get parts to exclude based on conditions above
flagged_parts = skus_main.loc[skus_main['exclude'] == 'yes', 'part id'].tolist()

# any part that is in the sales data BUT NOT IN our skus data should be removed
# recall that the skus data contains all valid parts
unknown_mask = ~sales_main['part id'].isin(skus_main['part id'])
unknown_parts = sales_main.loc[unknown_mask, 'part id'].unique().tolist()

# put them together
parts_to_exclude = flagged_parts + unknown_parts

# count the parts BEFORE anything is removed, so that "out of" in the message
# below really is the total and not what is left after the exclusion
total_parts = sales_main['part id'].nunique()

# get the parts we will exclude from the sales data
exclude_mask = sales_main['part id'].isin(parts_to_exclude)
x = sales_main.loc[exclude_mask, 'part id'].unique().tolist()

# exclude them
sales_main.drop(sales_main[exclude_mask].index, inplace = True)
sales_main.reset_index(inplace = True, drop = True)

# guard against an empty sales frame instead of dividing by zero
pct_excluded = len(x) / total_parts * 100 if total_parts else 0.0
print('\nwe will exclude {:,.0f} parts out of {:,.0f}, which is {:,.2f}% of the total count.'.format(len(x), total_parts, pct_excluded))

### Map some sku data to sales data

In [None]:
# sku-level columns to copy onto every sales line, looked up by part id
ls = ['products name', 'outsourced', 'all sku ids', 'bom type']

for sku_col in ls:
    part_lookup = dict(zip(skus_main['part id'], skus_main[sku_col]))
    sales_main[sku_col] = sales_main['part id'].map(part_lookup)

### Check BOM proportions

In [None]:
# share of sales lines per bom type (shown as the cell output)
vc = sales_main['bom type'].value_counts()
vc.div(vc.sum())

### Determine popularity tiers
Use the qty sold and avg price, and use the latest three months worth of data. Note that we are using the data AFTER we have unpacked the combos. To determine the high, med, and low tiers we will use the qty sold only, but to determine the key tier we will use the revenue and qty sold of the UNPACKED COMBOS.

In [None]:
# get the latest three months of data
a = sales_main[sales_main['date purchased'] >= three_months_ago].copy()

# map bom types
a['bom type'] = a['part id'].map(dict(zip(sales_main['part id'], sales_main['bom type'])))

# get revenue
a['revenue'] = a['qty total'] * a['products price']

# get totals by bom and part_id
b = a.groupby(['bom type','part id'])[['qty total','revenue']].sum()

In [None]:
# determine popularity tiers for each BOM
alpha = 0.15
popularity_tiers = pd.DataFrame()

for bom in set(b.index.get_level_values(0)):
    
    # isolate the BOM
    df = b[b.index.get_level_values(0) == bom].copy()
    df.reset_index(inplace = True)
    
    # any part with fewer than three months of data gets a tier of "not enough data"
    g = a[a['bom type'] == bom].groupby('part id')[['year and month']].nunique()
    p = g[(g < 3).any(1)].index.tolist()
    not_enough_data = df[df['part id'].isin(p)].copy()
    
    # remove these "not enough data parts" from data
    df.drop(df[df['part id'].isin(not_enough_data)].index, inplace = True)
    df.reset_index(drop = True, inplace = True)
    
    # get top revenue and top qty
    q1 = df['qty total'].quantile(1-alpha)
    q2 = df['revenue'].quantile(1-alpha)
    
    # get key parts which are parts we sell the most of AND make the most money on
    key = df[(df['qty total'] >= q1) & (df['revenue'] >= q2)].copy()
    
    # remove the key parts
    df.drop(df[df['part id'].isin(key['part id'].tolist())].index, inplace = True)
    df.reset_index(drop = True, inplace = True)
    
    # now get the high (above 75%), med (between 50% and 75%), and low (below 50%)
    ls1 = [0.75, 0.50]
    ls2 = []
    for p in ls1:
        val = df['qty total'].quantile(p)
        ls2.append(val)
        
    high = df[df['qty total'] >= ls2[0]].copy()
    med = df[np.logical_and(df['qty total'] < ls2[0], df['qty total'] >= ls2[1])].copy()
    low = df[df['qty total'] < ls2[1]].copy()
    
    # store the tiers
    tiers = {'key':key,
             'high':high,
             'med':med,
             'low':low,
             'not enough data':not_enough_data}
    
    # add this column to each tier
    for k in tiers.keys():
        tiers[k]['popularity tier'] = k
        popularity_tiers = pd.concat([popularity_tiers, tiers[k]], ignore_index = True)

In [None]:
# merge popularity tier data with sales data
sales_main = pd.merge(sales_main,
                      popularity_tiers[['bom type','part id','popularity tier']],
                      on = ['bom type','part id'],
                      how = 'left')

# get the popularity tier nulls
# a true popularity tier null means that we do not have sales data within the last three months 
x = list(set(sales_main[sales_main['popularity tier'].isnull()]['part id']))

# check to make sure these nulls really mean that we don't have data within the last three months
# nulls which should not be nulls will appear here
df = sales_main[sales_main['part id'].isin(x)].groupby('part id')[['date purchased']].max()

# if all nulls are really a result of us not having data within the last three months, fill these nulls with "low"
# if we don't have sales data from the last three months, then no one bought part in the last three months, and this warrants a low popularity tier
if np.sum(df['date purchased'] >= three_months_ago) == 0:
    sales_main['popularity tier'].fillna('low', inplace = True)
else:
    raise ValueError('you have popularity tier nulls which should not be nulls.\ntfix this before moving on.')

In [None]:
print('count of unique parts within each BOM and popularity tier')

# rows = bom type, columns = popularity tier, values = number of distinct parts
check = sales_main.groupby(['bom type','popularity tier'])[['part id']].nunique().unstack(1)
check.columns = check.columns.droplevel(0)

# fixed column order; reindex (instead of check[[...]]) keeps a tier that has
# no parts at all as an empty column rather than raising a KeyError
check = check.reindex(columns = ['key','high','med','low','not enough data'])
check

### Run part forecasts

In [None]:
# load the forecast library
# forecasting library (only referenced by the disabled ARIMA search further down)
import statsmodels.api as sm

# one row per (bom type, popularity tier, part id), one column per year-month,
# values = units sold, months without sales filled with 0
monthly_qty = sales_main.groupby(['bom type','popularity tier','part id','year and month'])[['qty total']].sum()
df = monthly_qty.unstack(3).fillna(0)

# the first 80% of the month columns are for training, the rest for testing
n_months = len(df.columns)
cut_off = int(np.round(n_months * 0.80))
train, test = df.iloc[:, :cut_off], df.iloc[:, cut_off:]

# sanity check: the two slices must cover every column exactly once
if len(train.columns) + len(test.columns) != n_months:
    raise ValueError('the sum of the train and test columns do not match the number of columns in the original dataframe')

In [None]:
# auto_arima for python
#https://github.com/tgsmith61591/pmdarima/blob/master/examples/quick_start_example.ipynb

In [None]:
# DISABLED: the whole cell is a bare string literal, so none of it runs.
# it sketches a grid search over SARIMAX (order, seasonal_order) pairs on the
# training rows; note that it swallows every fitting error and keeps no result
# other than a counter.
'''
# determine parameters for ARIMA models
ls1 = [0,1,2,3]
ls2 = [0,1]
ls3 = []
ls4 = []
for i in ls1:
    for j in ls2:
        ls3.append(tuple([i,j,i]))
        ls4.append(tuple([i,j,i,12]))

params = [(x,y) for x in ls3 for y in ls4]

# for each part, run the forecasts
#for i in range(len(df)):
for i in range(5):
    counter = 0
    values = train.iloc[i,:].values
    for p in params:
        try:
            model = sm.tsa.statespace.SARIMAX(values,
                                              order = p[0],
                                              seasonal_order = p[1])
            fit = model.fit()
            counter += 1
        except:
            pass
'''

### Prepare data for forecasting

In [None]:
# NOTE(review): `sales_clean` (with its 'umbrella part id' and 'mod qty total'
# columns) is not created anywhere in the visible part of this notebook -
# confirm where it comes from before running this cell.

# make it so that every PN has at least n months of data
n = 6

# make the final year and month the date_end of this script
d2 = date_end[:7]

# one frame per PN is collected here and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied everything on every call)
monthly_frames = []

for pn in list(set(sales_clean['umbrella part id'])):

    # get qty per month
    df1 = sales_clean[sales_clean['umbrella part id'] == pn].groupby('year and month')[['mod qty total']].sum()

    # fill in months where no one bought anything,
    # starting at the first month in which this PN was sold
    d1 = df1.index.min()
    dates = pd.date_range(d1, d2, freq = 'MS')

    # too few months: move the start back far enough to reach n months
    shortfall = n - len(dates)
    if shortfall > 0:
        new_d1 = pd.to_datetime(d1) - pd.DateOffset(months = shortfall)
        new_d1 = str(new_d1)[:7]
        dates = pd.date_range(new_d1, d2, freq = 'MS')

    dates = pd.DataFrame(dates, columns = ['year and month']).set_index('year and month')

    # join them
    # NOTE(review): `dates` has a datetime index; if 'year and month' in
    # sales_clean is stored as text, confirm that this join still lines the
    # months up (the "Check quantities" cell below would catch a mismatch)
    df2 = dates.join(df1).fillna(0)
    df2['umbrella part id'] = pn
    df2.reset_index(inplace = True)

    monthly_frames.append(df2)

# create one big dataframe, for the forecast
for_forecast = pd.concat(monthly_frames, ignore_index = True) if monthly_frames else pd.DataFrame()

# map popularity tier and bom
for col in ['bom type','popularity tier']:
    for_forecast[col] = for_forecast['umbrella part id'].map(dict(zip(sales_clean['umbrella part id'], sales_clean[col])))

# rename these for a layman to understand
for_forecast.rename(columns = {'umbrella part id':'part id',
                              'mod qty total':'qty total'}, inplace = True)

### Get sku ids and part names

In [None]:
# for_forecast holds one row per part per month, so collapse the part ids to
# the distinct ones before building the IN (...) list; otherwise every id is
# repeated once per month in the query text
# NOTE(review): str(tuple(...)) renders a single id as "(x,)", which is not valid SQL
forecast_part_ids = str(tuple(for_forecast['part id'].unique().tolist()))

# sku id and product name for every part that made it into the forecast data
s = pd.read_sql(
'''
SELECT
k.sku_id,
k.part_id,
pd.products_name
FROM skus k
JOIN products_description pd ON k.part_id = pd.part_id
WHERE k.sku_status IN ''' + str(tuple(statuses)) + '''
AND k.part_id IN '''+ forecast_part_ids +'''
''', db)

col_fix(s)

In [None]:
# copy sku id and product name onto the forecast rows, looked up by part id
# (when a part has several skus the last one returned by the query wins)
for col in ('sku id', 'products name'):
    by_part = dict(zip(s['part id'], s[col]))
    for_forecast[col] = for_forecast['part id'].map(by_part)

### Check nulls

In [None]:
# number of forecast rows that contain at least one null.
# kept in its own name so the minimum-months constant `n` is not overwritten;
# the axis is passed by keyword because pandas 2.x no longer accepts it positionally.
null_row_count = int(for_forecast.isnull().any(axis = 1).sum())
if null_row_count != 0:
    # a real exception with a message ("raise 0" only fails with an unrelated TypeError)
    raise ValueError('for_forecast has {} rows with nulls'.format(null_row_count))

### Check count of data points per PN

In [None]:
# every part must have at least this many monthly data points (the same 6
# that the forecast-prep loop pads to). an explicit constant is used instead
# of the shared name `n`, which other cells reuse for different values.
min_points = 6

points_per_part = for_forecast.groupby('part id')['year and month'].count()
short_parts = int((points_per_part < min_points).sum())
if short_parts != 0:
    # a real exception with a message ("raise 0" only fails with an unrelated TypeError)
    raise ValueError('{} parts have fewer than {} data points'.format(short_parts, min_points))

### Check unique PN count

In [None]:
# the forecast data must contain exactly the parts that are in sales_clean
v1 = len(set(for_forecast['part id']))
v2 = len(set(sales_clean['umbrella part id']))
if v1 == v2:
    print('match')
    print('the total unique umbrella part id count is {}'.format(v1))
else:
    # a real exception with a message ("raise 0" only fails with an unrelated TypeError)
    raise ValueError('unique part count mismatch: {} in for_forecast vs {} in sales_clean'.format(v1, v2))

### Check quantities

In [None]:
# total units must survive the reshaping into monthly rows
q1 = for_forecast['qty total'].sum()
q2 = sales_clean['mod qty total'].sum()

# written as "not (<)" so that a NaN total also counts as a failure
if not (np.abs(q1 - q2) < 0.01):
    # a real exception with a message ("raise 0" only fails with an unrelated TypeError)
    raise ValueError('quantity mismatch: {} in for_forecast vs {} in sales_clean'.format(q1, q2))

# fill zeros with 0.1, to ease forecasting
for_forecast['qty total'] = np.where(for_forecast['qty total'] == 0, 0.1, for_forecast['qty total'])

### Map sku and part data

In [None]:
# every skus_main column except the key ('part id') and 'products name',
# which the forecast data already carries
cols = [name for name in skus_main.columns.tolist() if name not in ('part id', 'products name')]

# copy each of them onto the forecast rows, looked up by part id
for c in cols:
    by_part = dict(zip(skus_main['part id'], skus_main[c]))
    for_forecast[c] = for_forecast['part id'].map(by_part)

### To Excel, for R

In [None]:
# write the forecast data to an Excel workbook (sheet "data") when asked to
if excel_write == 'yes':

    # the with-block saves and closes the file even if the export fails;
    # ExcelWriter.save() no longer exists in pandas 2.x
    with pd.ExcelWriter(csv_path + 'War Chest Part sales for R.xlsx', engine = 'xlsxwriter') as writer:
        for_forecast.to_excel(writer, sheet_name = 'data', index = False)

In [None]:
# marks the end of the notebook run
print('done')