In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols 

In [2]:
df = pd.read_csv('train.csv', sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86916 entries, 0 to 86915
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Store_id       86916 non-null  int64  
 1   SKU_id         86916 non-null  int64  
 2   Date           86916 non-null  object 
 3   Promo          15349 non-null  float64
 4   Demand         86916 non-null  int64  
 5   Regular_Price  86916 non-null  float64
 6   Promo_Price    15349 non-null  float64
dtypes: float64(3), int64(3), object(1)
memory usage: 4.6+ MB


In [3]:
df.head(n=5)

Unnamed: 0,Store_id,SKU_id,Date,Promo,Demand,Regular_Price,Promo_Price
0,1,1,01.01.2015,,22,163.78,
1,1,1,02.01.2015,,41,163.78,
2,1,1,03.01.2015,,35,163.78,
3,1,1,04.01.2015,,72,163.78,
4,1,1,05.01.2015,,25,163.78,


In [4]:
df.Store_id.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 104, 105, 106])

In [5]:
df.SKU_id.unique()

array([1, 2])

In [6]:
def transform(x, ttype):
    '''
    Accepts vector and performs transformation on it based on ttype
    '''
    if ttype == 'log':
        return np.log1p(x)
    
    return np.array(x)

In [7]:
def elasticity(X, y, model, model_type):
    '''
    Computes elasticity of model based on X, y, model itself and model's type
    '''
    if model_type == 'lin-lin':
        return model.params[1] * np.mean(X) / np.mean(y)
    elif model_type == 'log-lin':
        return model.params[1] * np.mean(X)
    elif model_type == 'log-log':
        return model.params[1]

In [8]:
def PEDmodel(X, y, model_type='lin-lin'):
    '''
    Makes PED model based on input data X, y and model type
    '''
    _types = model_type.split('-')
    
    data = pd.DataFrame()
    data['Act_Price'] = transform(X, _types[0])
    data['Act_Demand'] = transform(y, _types[1])
    
    return ols('Act_Demand ~ Act_Price', data=data).fit()

In [9]:
def get_model_stats(X, y, model, model_type):
    '''
    returns model's statistics
        regression coefficient,
        R-squared,
        T-test p-value,
        elasticity
    '''
    
    regr_coeff = model.params[1]
    r2 = model.rsquared
    t_test_p_value = model.pvalues[1]
    elas = elasticity(X, y, model, model_type)
    
    return regr_coeff, r2, t_test_p_value, elas

In [11]:
all_model_types = ['lin-lin', 'log-lin', 'log-log']
sku_id, store_id, coeff, rsquared, ttpvalue, elas, mtype = [], [], [], [], [], [], []

In [12]:
for shop_id in df.Store_id.unique():
    for good_id in df.SKU_id.unique():
        data = df[(df.Store_id == shop_id) & (df.SKU_id == good_id)]
        for model_type in all_model_types:
            model = PEDmodel(data.Regular_Price, data.Demand, model_type)
            
            c, r2, tp, e = get_model_stats(data.Regular_Price, data.Demand, model, model_type)
            
            sku_id.append(good_id)
            store_id.append(shop_id)
            coeff.append(c)
            rsquared.append(r2)
            ttpvalue.append(tp)
            elas.append(e)
            mtype.append(model_type)

In [13]:
stats = pd.DataFrame({
    'SKU': sku_id,
    'Store': store_id,
    'Coefficient': coeff,
    'R-squared': rsquared,
    'T-test-p-value': ttpvalue,
    'Model-type': mtype,
    'Elasticity': elas
})
stats

Unnamed: 0,SKU,Store,Coefficient,R-squared,T-test-p-value,Model-type,Elasticity
0,1,1,-4.603284,0.017688,2.668840e-03,lin-lin,-2.415556e+00
1,1,1,-641.602207,0.016695,3.530984e-03,log-lin,-8.771532e+04
2,1,1,-2.817601,0.052010,2.034887e-07,log-log,-2.817601e+00
3,2,1,-0.815663,0.036251,1.557856e-05,lin-lin,-7.901477e+00
4,2,1,-118.130766,0.036146,1.603605e-05,log-lin,-1.685219e+04
...,...,...,...,...,...,...,...
613,1,106,-1842.012870,0.098066,1.991407e-02,log-lin,-2.485746e+05
614,1,106,-11.743319,0.155823,2.859918e-03,log-log,-1.174332e+01
615,2,106,103.896104,0.072553,4.674140e-02,lin-lin,9.254603e+02
616,2,106,14486.232549,0.072553,4.674140e-02,log-lin,2.005938e+06


In [53]:
'''TODO:Calcilate a statistics based on PED models
amount of elements the following attributes:
ATTRIBUTE 1 values: 1) 0<=x, 2) -10<=x<-1, 3) x<-10
ATTRIBUTE 2: 1) p-value < 0.05, 2) p-value >= 0.05

E.g. 
for each pair attribute1 x  attribute2 provide (6 pairs in total)
   -share of samples (PED models) 
   with this attribute value.
   
Example of output table

ModelType   ATTR1=1&ATTR2=1  ATTR1=1&ATTR2=2  ATTR1=2&ATTR2=1  ATTR1=2&ATTR2=2  ATTR1=3&ATTR2=1  ATTR1=3&ATTR2=2
LIn-Lin          1%                2%              10%                 59%              20%             8%
Log-Lin          1%                2%              10%                 59%              20%             8%
Log-Log          1%                2%              10%                 59%              20%             8%
'''


# (attr value, (attr min threshold, attr max threshold))
firrst_attr_v = [
    (1, (0, float('+inf'))),
    (2, (-1, 0)),
    (3, (-10, -1)),
    (4, (float('-inf'), -10))
]

second_attr_v = [
    (1, (float('-inf'), 0.05)),
    (2, (0.05, float('+inf')))
]

In [54]:
stats_attr = pd.DataFrame()

for f_val, (e_min, e_max) in firrst_attr_v:
    for s_val, (p_min, p_max) in second_attr_v:
        percents = []
        for model_type in all_model_types:
            stats_model_type = stats[stats['Model-type'] == model_type]
            
            stats_attrs = stats[(stats.Elasticity >= e_min) & (stats.Elasticity <= e_max) & (stats['Model-type'] == model_type)]
            stats_attrs = stats_attrs[(stats_attrs['T-test-p-value'] >= p_min) & (stats_attrs['T-test-p-value'] <= p_max)]
            
            percents.append(round(stats_attrs.shape[0] / stats_model_type.shape[0], 2))
        
        percents = [f'{int(p * 100)}%' for p in percents]
        col_name = 'ATTR1={}&ATTR2={}'.format(f_val, s_val)
        stats_attr[col_name] = percents

stats_attr['ModelType'] = all_model_types

In [55]:
stats_attr

Unnamed: 0,ATTR1=1&ATTR2=1,ATTR1=1&ATTR2=2,ATTR1=2&ATTR2=1,ATTR1=2&ATTR2=2,ATTR1=3&ATTR2=1,ATTR1=3&ATTR2=2,ATTR1=4&ATTR2=1,ATTR1=4&ATTR2=2,ModelType
0,4%,8%,0%,2%,71%,9%,5%,1%,lin-lin
1,4%,7%,0%,0%,0%,0%,75%,14%,log-lin
2,5%,6%,0%,5%,74%,5%,4%,1%,log-log
