## Inits

+ Standard modules:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
from time import time

+ Modules for LP/IP:

In [2]:
import cplex
from cplex import Cplex
from cplex.exceptions import CplexError

+ My own modules:

In [3]:
import input_load
from input_load import *

import split
from split import DataSplitter

import demand_pred
from demand_pred import DemandPredictor

import trigger
from trigger import *

import date_util
from date_util import *

+ Helpers:

In [4]:
def rand_int_in(low, high): # include both low & high
    return np.random.randint(low, high + 1, 1)[0]

## Load global vars

In [5]:
CLEAN_DIR = '~/data/venture=Zalora/sg/clean/'
RES_DIR = '~/projects/daas-markdown/res/'
group_dir = CLEAN_DIR + 'groups/'
feat_dir = group_dir + 'feats/'
# TODAY = pd.datetime.today().date()

## IP formulation

### Input, Output

Given a date $d$, input & output of the `price optimizer` module are follows:

__Input__:

+ the set of SKU configs whose prices on $d$ need to be optimized
+ the set of possible prices for the configs
+ inventory constraints on $d$, i.e. the number of remaining units in inventory of each config on $d$

__Output__: the list of optimal prices for the configs

### Math formulation of the problem

\begin{align}
	\label{IP_s}
	\max & \sum_{i \in I_o} \sum_{j \in J} p_j D_{i, j, s} x_{i, j} \\
	s.t. & \sum_{j \in J} x_{i, j} = 1, ~\forall i \in I_o  \\
	 	 & \sum_{i \in I_o} \sum_{j \in J} p_j x_{i, j} = s  \\
		 & x_{i, j} \in \{0, 1 \} \\
		 & D_{i, j, s} \le n_i 
\end{align}

### Components of the IP

+ objective function
+ single-price constraints
+ sum constraints
+ binary type constraints
+ inventory constraints

# Helpers to populate LP

## To set variable type

In [6]:
def set_var_type(prob, binary=True):
    var_names = prob.variables.get_names()
    if binary:
        types = [prob.variables.type.binary]*len(var_names)
    else:
        types = [prob.variables.type.continuous]*len(var_names)
        
    prob.variables.set_types(list(zip(var_names, types)))
    print('+ set variable types as {}'.format(prob.variables.get_types()))
    return prob

## To add constraints

In [7]:
def mk_var_names(configs, prices):
    
    n_config, n_price = len(configs), len(prices)
    var_names = []
    config_idx, price_idx = np.arange(n_config) + 1, np.arange(n_price) + 1
    
    for i in config_idx:
        prefix = 'x_{}'.format(i)
        row = [prefix + str(j) for j in price_idx]
        var_names += [row]
        
    return np.array(var_names)

def mk_opc_lhs(configs, prices, var_names):
    lhs = []
    coefs = np.ones(len(prices))
    
    for i, cf in enumerate(configs):
        count = i+1
        if count % 100 == 0:
            print('\t created one-price lhs for {} config ...'.format(count))
        row = cplex.SparsePair(ind=var_names[i], val=coefs)
        lhs += [row]
        
    return lhs

In [8]:
def add_one_price_constraints(prob, configs, prices, var_names): # one per config
    
    n_config, n_price = len(configs), len(prices)
    n_var = n_config * n_price
    lhs = mk_opc_lhs(configs, prices, var_names)
    
    my_sense = ['E'] * n_config
    my_rhs = np.ones(n_config)
    my_rownames = ['opc_config_{}'.format(i) for i in range(n_config)]
    
    prob.linear_constraints.add(lin_expr=lhs, senses=my_sense, rhs=my_rhs, names=my_rownames)
    
    print('+ added {} one-price constraints for {} configs'.format(n_config, n_config))
    return prob

In [9]:
def add_sum_constraints(prob, s, prices, configs, var_names):
    
    coefs = np.array([prices] * len(configs))
    lhs = cplex.SparsePair(ind=var_names.ravel(), val=coefs.ravel())
    
    prob.linear_constraints.add(lin_expr=[lhs], senses=['E'], rhs=[s], names=['sum_constraint'])
    print('+ added sum constraint')
    return prob

__Note:__ inventory constraints are non-linear, stochastic ones. Thus, they will be added in later version.

In [10]:
def add_inventory_constraints(prob, predicted_demands, inv_amount):
    print('+ added inventory constraints')
    pass

In [11]:
def add_bounds(prob, lb, ub):
    var_names = prob.variables.get_names()
    n_var = len(var_names)
    lower_bounds, upper_bounds = [lb]*n_var, [ub]*n_var
    prob.variables.set_lower_bounds(list(zip(var_names, lower_bounds)))
    prob.variables.set_upper_bounds(list(zip(var_names, upper_bounds)))
    print('+ added bounds')
    return prob

## To add objective function

In [12]:
def cal_feats(configs, dd, group_feats):
    pass

def predict_demand(configs, dd, group):
    group_feats = form_feat_mat(group)
    feat_mat = cal_feats(configs, dd, group_feats)
    predicted_demands = demand_predictor.predict(feat_mat)
    
    return predicted_demands

In [13]:
def cal_obj_coefs(prices, predicted_demands):
    concat_prices = np.array([prices] * len(configs)).ravel() # same formula as add_sum_constraints(), only diff is to mult with demands
    return np.multiply(concat_prices, predicted_demands)

In [14]:
def add_obj(prob, configs, prices, dd, predicted_demands=None):
    '''
    add obj functions & binary vars declaration
    '''
#     predicted_demands = predict_demand(configs, dd)
    obj_coefs = cal_obj_coefs(prices, predicted_demands)
    var_names = prob.variables.get_names()
    prob.objective.set_linear(list(zip(var_names, obj_coefs)))
    print('+ added objective function')
    return prob

## To get components

In [15]:
def get_constraints(prob):
    
    def pretty_lhs(spair):
        '''
        Given lhs of a constraint in `cplex.SparsePair` form, convert it to human readable format
        '''
        idx = spair.ind
        variables = prob.variables.get_names(idx)
        coefs = spair.val
        terms = [''.join([str(coefs[i]), v]) for i, v in enumerate(variables)]
        return ' + '.join(terms)
    
    lin_con = prob.linear_constraints
    names = lin_con.get_names()
    sparse_pairs = lin_con.get_rows()
    lhs = [pretty_lhs(spair) for spair in sparse_pairs]
    
    senses = lin_con.get_senses()
    rhs = lin_con.get_rhs()
    
    constraints = pd.DataFrame({'name': names, 'lhs': lhs, 'sense': senses, 'rhs': rhs})
    cols = ['name', 'lhs', 'sense', 'rhs']
    return constraints[cols]

# Collect inputs

## Configs to be optimized

Given a group of SKU configs, we want to detect which SKU configs we need to optimize prices. Those are configs which are either selling __too slow__ or __too fast__. Thus, we can detect those by comparing target sale velocity with recent sale velocity (in two recent weeks). This is implemented in our `trigger` module.

## Possible prices 

We now calculate possible prices for each config $cf$. Those are prices discounted from the black price of $cf$, starting from black price minus $5\%$, each time we discount $5\%$. Thus, the possible prices will have the following form:

$$
black\_price(cf) - k * unit\_discount, k \in 1,\dots, 16
$$
where
$$
unit\_discount = 0.05 * black\_price(cf)
$$

In [16]:
def get_correct_price(cf, tmp_rrp):
    #     in case of several black prices, use the latest one (from mtg on 10 Nov 2017)
    sub_df = tmp_rrp[tmp_rrp.sku_config == cf]
    sub_df = sub_df.drop_duplicates('black_price').sort_values('snapshot_date', ascending=False)
    
    if sub_df.shape[0] > 1: # for debug
        print(sub_df)
    
    latest_price = sub_df['black_price'].iat[0]
    return latest_price

def query_black_price(configs, group_df):
    
    print('Querying black prices...')
    print('Below are SKUs with changing black price, we are using the latest one for black price')
    
    cols = ['sku_config', 'snapshot_date', 'price']
    tmp_rrp = group_df[group_df.sku_config.isin(configs)][cols].drop_duplicates()
    tmp_rrp = tmp_rrp.rename(columns={'price': 'black_price'})
    
    correct_prices = [get_correct_price(cf, tmp_rrp) for cf in configs]
    return pd.DataFrame({'sku_config': configs, 'black_price': correct_prices})

__Notes:__ 

+ There are SKUs with too large black prices. These must be errors from data input.
+ some SKUs have more than $1$ black price. How to handle these cases? Should we just take the smallest price?

Use latest black price (mtg on 10 Nov 2017)

__Note__: for prototype purpose, I try only 3 possible values for each config. Later will try a full range.

In [17]:
ks = np.arange(0, 3) + 1
def cal_prices(cf, rrp_df, count=1, n_digit=2): # round to 2 decimals
    if (count % 100) == 0:
        print('{} configs and counting...'.format(count))
    
    black_price = rrp_df.query('sku_config == "{}"'.format(cf))['black_price'].iat[0]
    unit_discount = 0.05*black_price
    return np.round(black_price - ks*unit_discount, decimals=n_digit)

# Find global optimum prices

For each value of $s$, we solve the corresponding ($IP_s$) to find the local optimum prices $P^*_s$. Then we determine the global optimum prices $P^*$ as the one that provides maximum revenue among these $P^*_s$.

## Build optimization problems $IP_s$

We are using the Python API of CPLEX library. In this library, to build an IP, we need to populate its components (i.e. objective function and constraints). The mentioned components of each ($IP_s$) is the following:

+ objective function & variable type declaration (binary)
+ one-price constraints (OPC)
+ sum constraints (SC)
+ inventory constraints (IC)

In [18]:
def get_type(prob):
    type_code = prob.get_problem_type()
    return prob.problem_type[type_code]

In [19]:
def populatebyrow(prob, total_price, dd, configs, prices, inv_amount):
    '''
    build an IP/LP problem by populating its components
    '''
    ## tell the solver that this is a maximize problem
    prob.objective.set_sense(prob.objective.sense.maximize)
    
    var_names = mk_var_names(configs, prices) # var names in 2D array
    
    prob = add_one_price_constraints(prob, configs, prices, var_names)
    prob = add_sum_constraints(prob, total_price, prices, configs, var_names)
    
    n_var = len(configs) * len(prices)
    predicted_demands = np.random.randint(1, 10, size=n_var) # for testing only, will replace by actual predicted demand

    #     prob = add_inventory_constraints(prob, predicted_demands, inv_amount)
    prob = add_obj(prob, configs, prices, dd, predicted_demands)
    
    return prob

In [20]:
def populateLP(total_price, dd, configs, prices, inv_amount, binary=True): # wrapper to populate (IPs)
    '''
    dd: given date
    configs: SKU configs whose prices on the date need to be optimized
    prices: possible prices for the configs
    inv_amount: inventory constraints on the given date
    '''
    print(r'Populating $LP_{}$...'.format(total_price))
    prob = Cplex()
    
    var_names = mk_var_names(configs, prices)
    prob.variables.add(names=var_names.ravel())
    
    prob = set_var_type(prob, binary)
    
    if not binary:
        prob = add_bounds(prob, lb=0, ub=1)
    
    prob = populatebyrow(prob, total_price, dd, configs, prices, inv_amount)

    prob.set_problem_type(Cplex.problem_type.LP)
    print('Type of problem: {}'.format(get_type(prob)))
    
    return prob

## Solve $IP_s$ for local optimum

For each possible value of $s$, I follow the steps in algorithm 1 in the [work](http://www.hbs.edu/faculty/Publication%20Files/kris%20Analytics%20for%20an%20Online%20Retailer_6ef5f3e6-48e7-4923-a2d4-607d3a3d943c.pdf):

+ solve relaxed $LP_s$ of $IP_s$ (e.g. integer constraints are removed)
+ calculate the lower bound for objective value of $IP_s$ via objective value of $LP_s$

### Solve relaxed $LP_s$

In [None]:
prob = populateLP(s, today, configs, prices, inv_amount, binary=False)
print()
print('Started solver')
prob.solve()

#### Examine solution

In [None]:
def show_sol_info(prob):
    numrows = prob.linear_constraints.get_num()
    numcols = prob.variables.get_num()
    print()
    # solution.get_status() returns an integer code
    print("Solution status = ", prob.solution.get_status(), ":", end=' ')
    # which is used to print the corresponding string
    print(prob.solution.status[prob.solution.get_status()])

    print("Solution value  = ", prob.solution.get_objective_value())
    slack = prob.solution.get_linear_slacks()
    pi = prob.solution.get_dual_values()
    x = prob.solution.get_values()
    dj = prob.solution.get_reduced_costs()
    for i in range(numrows):
        print("Row %d:  Slack = %10f  Pi = %10f" % (i, slack[i], pi[i]))
    for j in range(numcols):
        print("Column %d:  Value = %10f Reduced cost = %10f" %
              (j, x[j], dj[j]))

### Calculate lower bound for $IP_s$

## Find global optimum

# Tests

## Toy test case

In [None]:
# set up test
configs, prices = ['cf1', 'cf2'], range(5, 20, 5)
n_config, n_price = len(configs), len(prices)
inv_amount = np.random.randint(1, 10, n_config)
today = pd.datetime.today()

min_s, max_s = n_config * min(prices), n_config * max(prices)
s = rand_int_in(min_s, max_s)
today = pd.datetime.today()

In [None]:
prob = populateLP(s, today, configs, prices, inv_amount, binary=False)
print()
print('Started solver')
prob.solve()
show_sol_info(prob) 

## Real data

In [22]:
# only needed when there are changes in code base
reload(input_load)
from input_load import *

reload(trigger)
from trigger import *

reload(date_util)
from date_util import *

reload(split)
from split import DataSplitter

In [None]:
input_loader = InputLoader(DataPrep())
my_trigger = Trigger(input_loader)
data_splitter = DataSplitter(split_ratio=0.8)

In [23]:
global_X = input_loader.load_global_feat(yr=2017)

Loading global feats in yr 2017 from /Users/gfg/data/venture=zalora/sg/clean/2017/glob_feat.csv...


### Find configs to be optimized
Via `trigger`

In [24]:
gid = ('dresses', 'female', 'autumn-winter')
gname = '_'.join(list(gid))

In [25]:
# load group sales history
group_sales = my_trigger.query_group_sales(gid)

fd, ld = date_range(group_sales)
dd = ld - timedelta(days=30)

Querying sale history of group dresses...


In [None]:
#NOT run, this has some bug
slow_sku_df = my_trigger.find_slow_configs(gid, group_sales, dd)
print(slow_sku_df.head())

In [56]:
# slow_configs = slow_sku_df['sku_config'].unique()
slow_sku_df = pd.read_csv(RES_DIR + 'slow_sku.csv')
slow_configs = sorted(slow_sku_df['sku_config'].unique())
print('# slow configs: {}'.format(len(slow_configs)))

# slow configs: 254


In [57]:
configs = np.random.choice(slow_configs, size=10)

### Set possible prices

In [67]:
def load_group_snapshots(gname, group_dir):
    date_cols = ['snapshot_date', 
                 'special_from_date', 'special_to_date', 
                 'permanent_markdown_from_date']

    return pd.read_csv(group_dir + gname + '.csv', parse_dates=date_cols)

In [112]:
def set_possible_prices(configs, group_df):
    # get correct black price
    rrp_df = query_black_price(configs, group_df)

    # set possible prices
    possible_prices = [ cal_prices(cf, rrp_df, count=i+1) for i, cf in enumerate(configs)]

    price_df = rrp_df.copy()
    price_df['possible_prices'] = possible_prices
    return price_df

In [114]:
def get_possible_prices(cf, df):
    return df[df.sku_config == cf]['possible_prices'].iloc[0]

In [None]:
group_df = load_group_snapshots(gname, group_dir)

In [113]:
price_df = set_possible_prices(configs, group_df)

SKU with changing black price, use the latest price
              sku_config snapshot_date  black_price
828183  79BF4AAFB6584DGS    2017-07-21         24.9
134674  79BF4AAFB6584DGS    2016-12-30         39.9


__Note:__ Round possible prices to 2 decimals (mtg on 10 Nov 2017)

### Query inventory amounts

In [71]:
def query_inventory(configs, dd, group_stock):
    stock_df = group_stock[group_stock['date'] == dd]
    cols = ['sku_config', 'date', 'stock']
    return stock_df[stock_df.sku_config.isin(configs)][cols].drop_duplicates()

In [62]:
group_stock = my_trigger.query_stock_data(gid)

In [72]:
stock_df = query_inventory(configs, dd, group_stock)

### Merge to a single DF

In [115]:
slow_df = pd.merge(price_df, stock_df)
slow_df.sort_values('black_price')

Unnamed: 0,black_price,sku_config,possible_prices,date,stock
6,24.9,79BF4AAFB6584DGS,"[23.65, 22.41, 21.16]",2017-05-28,72
1,34.9,9EB52AA5DA4D3DGS,"[33.16, 31.41, 29.66]",2017-05-28,32
3,34.9,D4094AA895E68CGS,"[33.16, 31.41, 29.66]",2017-05-28,148
5,34.9,1F22DAA71C149FGS,"[33.16, 31.41, 29.66]",2017-05-28,70
7,34.9,FB590AA19EAE04GS,"[33.16, 31.41, 29.66]",2017-05-28,125
9,34.9,86D69AA83B2A30GS,"[33.16, 31.41, 29.66]",2017-05-28,59
2,39.9,1EF9EAA17AB709GS,"[37.9, 35.91, 33.92]",2017-05-28,157
4,44.9,0AE69AACA1018FGS,"[42.66, 40.41, 38.16]",2017-05-28,113
0,79.9,1376FAAE0DBBF1GS,"[75.9, 71.91, 67.92]",2017-05-28,49
8,79.9,D803EAA23402BFGS,"[75.9, 71.91, 67.92]",2017-05-28,49


### Prepare a sales predictor

In [219]:
reload(demand_pred)
from demand_pred import *

In [220]:
demand_predictor = DemandPredictor()

In [32]:
feat_mat = input_loader.form_feat_mat(group=gname, global_X=global_X)
ready_df = input_loader.attach_response(feat_mat, my_trigger.sale_hist)
train_df, test_df = data_splitter.split_train_test(ready_df)

Loading features of group dresses_female_autumn-winter from /Users/gfg/data/venture=zalora/sg/clean/groups/feats/dresses_female_autumn-winter.csv...
	 dropped NA configs
Obtained feature matrix of group dresses_female_autumn-winter
	 info on the matrix
<class 'pandas.core.frame.DataFrame'>
Int64Index: 160080 entries, 0 to 160079
Data columns (total 17 columns):
sku_config           160080 non-null object
current_price        160080 non-null float64
discount_from_rrp    160080 non-null float64
n_competitor         160080 non-null int64
rel_price            160080 non-null float64
product_name         160080 non-null object
color                160080 non-null object
color_pop            160080 non-null float64
sub_cat              160080 non-null object
brand_name           160080 non-null object
brand_type           160080 non-null object
date                 160080 non-null object
total_views          160080 non-null int64
total_impressions    160080 non-null int64
mean_views         

In [33]:
import gc

In [34]:
# del global feat matrix to free mem for training
del global_X
gc.collect()

433

In [221]:
best_xgb = demand_predictor.fit_xgb(train_df)

Fitting XGBoost to train data...
()
Performing CV for xgboost...
Fitting 3 folds for each of 40 candidates, totalling 120 fits
-0.2060728528
{'n_estimators': 20, 'max_depth': 3}


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   40.6s finished


### Build feature matrix for each price setting

Given possible price settings of the slow configs and their stocks on a date `dd`, we now build feature matrices for the price settings s.t. we can predict sales associated with each setting. For this, we need to update `current_price` of the configs and relevant features:

+ `discount_from_rrp`
+ `relative_price`

#### Helpers

In [96]:
import feat_eng
from feat_eng import FeatureEngineer

In [176]:
fe = FeatureEngineer(CLEAN_DIR)

In [198]:
def update_price(df, configs, price_setting):
    idx = df.sku_config.isin(configs)
    df.loc[idx, 'current_price'] = price_setting

def update_discount(df, configs):
    idx = df.sku_config.isin(configs)
    df.loc[idx, 'discount_from_rrp']= 100*(df.loc[idx, 'black_price'] - df.loc[idx, 'current_price'])/df.loc[idx, 'black_price']

def update_rel_price(df):
    m = np.mean(df['current_price'])
    df['rel_price'] = df['current_price']/m

def build_feat_mat(fm, configs, price_setting, dd):
    on_date_feat = fm[fm['date'] == dd].copy()
    update_price(on_date_feat, configs, price_setting)
    update_discount(on_date_feat, configs)
    update_rel_price(on_date_feat)
    del on_date_feat['black_price'] # black_price currently is not a feature
    
    return on_date_feat

In [None]:
def prev_predict_sales(estimator, dd, price_setting, count=1): # predict sales on given date and for given price setting
    print('price setting {}'.format(count))
#     print(price_setting)
    new_df = build_feat_df(feat_mat, test_configs, price_setting, dd)

    res = new_df.copy()
    res['predict_sales'] = estimator.predict(new_df[feats])
    res['price_setting'] = count
    res['sum'] = sum(price_setting)
    
    cols = ['sku_config', 'current_price', 'predict_sales', 'price_setting', 'sum']
    return res[res.sku_config.isin(test_configs)][cols]

#### Pick a few configs for testing

In [116]:
test_configs = list(slow_df.query('black_price == 34.9')['sku_config'].head(3))
test_prices = [33.16, 31.41, 29.66]

s = sum(test_prices)

# choose a date for testing, need to have all test configs
tmp = feat_mat[feat_mat.sku_config.isin(test_configs)].sort_values('date', ascending=False)
tmp.date.value_counts()

dd = pd.to_datetime('2017-03-12').date()

#### Test first price setting

+ Test building feature matrix:

In [235]:
new_feat_mat = build_feat_mat(feat_mat, configs=test_configs, price_setting=test_prices, dd=dd)

cols = ['sku_config', 'date', 'current_price', 'discount_from_rrp', 'rel_price']
new_feat_mat[new_feat_mat.sku_config.isin(test_configs)][cols].sort_values('sku_config')

Unnamed: 0,sku_config,date,current_price,discount_from_rrp,rel_price
88715,1F22DAA71C149FGS,2017-03-12,31.41,10.0,0.568533
88572,9EB52AA5DA4D3DGS,2017-03-12,33.16,4.985673,0.600208
96431,D4094AA895E68CGS,2017-03-12,29.66,15.014327,0.536857


In [193]:
old_feat_mat = tmp[tmp.date == dd]
old_feat_mat.sort_values('sku_config')[cols]

Unnamed: 0,sku_config,date,current_price,discount_from_rrp,rel_price
52616,1F22DAA71C149FGS,2017-03-12,19.9,42.979943,0.365446
52615,9EB52AA5DA4D3DGS,2017-03-12,19.9,42.979943,0.365446
52661,D4094AA895E68CGS,2017-03-12,20.0,42.69341,0.367283


+ Test predicting demands associated with the price setting:

In [212]:
type(best_xgb)
best_xgb.best_estimator_

sklearn.model_selection._search.GridSearchCV

In [226]:
test_df.date.iloc[0]

datetime.date(2017, 5, 21)

In [230]:
train_df[train_df.date == dd]

Unnamed: 0,sku_config,current_price,discount_from_rrp,n_competitor,rel_price,product_name,color,color_pop,sub_cat,brand_name,brand_type,date,total_views,total_impressions,mean_views,mean_impressions,brand_pop,ordered_date,n_sold
6071,DO816AA07XSIMY,39.9,55.617353,1007,0.732729,Dungaree Pinny Dress,black,24.872127,dresses,Dorothy Perkins,regional_branded,2017-03-12,6695,212479,11.849558,376.069027,6695,2017-03-12,1
6072,MA193AA06RQJMY,15.9,73.455760,1007,0.291990,Flared Skirt Dress,black,24.872127,dresses,Mango,regional_branded,2017-03-12,15721,394383,18.940964,475.160241,15721,2017-03-12,1
6073,MA193AA66ZLTMY,15.9,68.136273,1007,0.291990,Polka-Dot Textured Dress,dark orange,0.129414,dresses,Mango,regional_branded,2017-03-12,15721,394383,18.940964,475.160241,15721,2017-03-12,1
6074,MA193AA65ZLUMY,15.9,68.136273,1007,0.291990,Polka-Dot Textured Dress,dark navy,0.123251,dresses,Mango,regional_branded,2017-03-12,15721,394383,18.940964,475.160241,15721,2017-03-12,1
6075,DO816AA55EAAMY,59.9,40.040040,1007,1.100012,Blush Chiffon Maxi Dress,beige,1.158563,dresses,Dorothy Perkins,regional_branded,2017-03-12,6695,212479,11.849558,376.069027,6695,2017-03-12,0
6076,ATSDRAA0000092GS,15.0,49.832776,1007,0.275462,Floral Shift Dress,black/grey floral,0.283478,dresses,Something Borrowed,regional_private_label,2017-03-12,50946,876925,30.653430,527.632371,50946,2017-03-12,1
6077,ATPWTAA0000024GS,14.9,50.167224,1007,0.273626,Essential Sleeveless A Line Dress,black,24.872127,dresses,ZALORA,regional_private_label,2017-03-12,68950,1148998,28.717201,478.549771,68950,2017-03-12,1
6078,MI665AA16KSPMY,59.9,51.300813,1007,1.100012,Blue Plisse Maxi Dress,blue,12.534664,dresses,Miss Selfridge,regional_branded,2017-03-12,3308,88732,9.560694,256.450867,3308,2017-03-12,1
6079,MI665AA02KTDMY,39.9,52.443385,1007,0.732729,Mono Print Tee Dress,black,24.872127,dresses,Miss Selfridge,regional_branded,2017-03-12,3308,88732,9.560694,256.450867,3308,2017-03-12,1
6080,AUTVGAA0000083GS,17.9,40.133779,1007,0.328718,Cold Shoulder Dress,powder blue,0.154064,dresses,Something Borrowed,regional_private_label,2017-03-12,50946,876925,30.653430,527.632371,50946,2017-03-12,1


In [231]:
predict_1 = demand_predictor.predict_by_xgb(best_xgb.best_estimator_, test_df=train_df[train_df.date == dd])

In [234]:
predict_1.head()

Unnamed: 0,sku_config,product_name,brand_name,date,current_price,discount_from_rrp,n_competitor,rel_price,color_pop,brand_pop,predict_xgb
6071,DO816AA07XSIMY,Dungaree Pinny Dress,Dorothy Perkins,2017-03-12,39.9,55.617353,1007,0.732729,24.872127,6695,0.937899
6072,MA193AA06RQJMY,Flared Skirt Dress,Mango,2017-03-12,15.9,73.45576,1007,0.29199,24.872127,15721,1.370589
6073,MA193AA66ZLTMY,Polka-Dot Textured Dress,Mango,2017-03-12,15.9,68.136273,1007,0.29199,0.129414,15721,1.230723
6074,MA193AA65ZLUMY,Polka-Dot Textured Dress,Mango,2017-03-12,15.9,68.136273,1007,0.29199,0.123251,15721,1.230723
6075,DO816AA55EAAMY,Blush Chiffon Maxi Dress,Dorothy Perkins,2017-03-12,59.9,40.04004,1007,1.100012,1.158563,6695,0.908533


In [233]:
predict_1[predict_1.sku_config.isin(test_configs)]

Unnamed: 0,sku_config,product_name,brand_name,date,current_price,discount_from_rrp,n_competitor,rel_price,color_pop,brand_pop,predict_xgb


### All price settings

# Side Notes

+ There are SKUs with too large black prices. These must be errors from data input.

In [None]:
print(sum(rrp_df.black_price > 1000))
large_price_skus = rrp_df[rrp_df.black_price > 1000]

In [None]:
cols = ['sku_config', 'product_name', 'brand_name']
large_price_skus.merge(my_trigger.prod_df[cols]).drop_duplicates()

## Check SKUs with weird prices

In [None]:
# cf = AD9ABAA31B83EFGS
sub_df = group_df.query('sku_config == "AD9ABAA31B83EFGS" and is_visible == 1') .drop_duplicates()
sub_df.sort_values('price', ascending=False, inplace=True)

In [None]:
sub_df.info()

In [None]:
cols = ['sku_config', 'price', 'online_aging', 'is_visible', 'snapshot_date']
sub_df[cols].head()

In [None]:
sub_df.query('price > 10**5').snapshot_date.describe()

In [None]:
sub_df.price.describe()

## Attach black price to feature matrix

We also need black price data, thus we query them and attach to current feature matrix.

In [160]:
all_configs = group_df.sku_config.dropna().unique()
black_prices = query_black_price(configs=all_configs, group_df=group_df)

SKU with changing black price, use the latest price
          sku_config snapshot_date  black_price
5106  MA193AA24MPFMY    2016-07-11         55.9
2649  MA193AA24MPFMY    2016-05-26         59.9
SKU with changing black price, use the latest price
          sku_config snapshot_date  black_price
5108  MA193AA13MPQMY    2016-07-11        189.0
2651  MA193AA13MPQMY    2016-05-26        179.0
SKU with changing black price, use the latest price
           sku_config snapshot_date  black_price
29772  DO816AA14RUFMY    2016-09-30         93.9
2807   DO816AA14RUFMY    2016-05-31         89.9
SKU with changing black price, use the latest price
            sku_config snapshot_date  black_price
9133  APAKTAA0000080GS    2016-08-04         39.9
3216  APAKTAA0000080GS    2016-06-10         44.9
SKU with changing black price, use the latest price
            sku_config snapshot_date  black_price
536076  MI511AA28IXRMY    2017-06-17        115.0
524769  MI511AA28IXRMY    2017-06-16       1399.0
3785 

In [166]:
feat_mat.dropna(subset=['sku_config'], inplace=True)
feat_mat = feat_mat.merge(black_prices)

In [194]:
fname = feat_dir + gname + '.csv'
feat_mat.to_csv(fname, index=False)

## Old codes

In [42]:
predictions = demand_predictor.predict_by_xgb(best_xgb, test_df)

In [None]:
# use all models
# predictions = demand_predictor.predict(best_models, test_df)
# print(demand_predictor.sort_models_by_test_rmse())