In [66]:
import pandas as pd
import os
import json
import numpy as np
import joblib
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / dtype warnings raised by later cells.
warnings.filterwarnings('ignore')

In [96]:
# data/ and models/ are resolved relative to the parent of the current working directory
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'data')
model_dir = os.path.join(project_root, 'models')

# File names of the artefacts used by this notebook
file_name = 'df_final_sku_store_quarter_v2.csv'
top_k_brands_file = 'top_k_brands.json'
model_name = 'xgboost.joblib'

# Fail fast when the input dataset is not where it is expected
data_path = os.path.join(data_dir, file_name)
assert os.path.exists(data_path), f"{file_name} does not exist"

## Model Loading 

- **Final Model Used**
    - XGBoost Regressor
- **Hyperparameters**
    - 'colsample_bytree': 1.0
    - 'learning_rate': 0.3
    - 'max_depth': 9
    - 'n_estimators': 200
    - 'subsample': 1.0

In [3]:
# Loading the final Model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model artefacts from a trusted source.
model_path = os.path.join(model_dir, model_name)
model = joblib.load(model_path)

## Data Loading

- Load the entire feature-engineered dataframe into memory
- Create two copies: one used for prediction and one for ROI analysis
- One-hot encode the categorical variables and log-transform the cost-related variables
- Drop the columns that are not needed for prediction

In [4]:
# Read the full dataset from data_path and preview the first rows
df_orig = pd.read_csv(data_path)
df_orig.head()

Unnamed: 0,STORE,COUNT_ITEMS,MenRatio,White,Asian,Income,Poverty,Professional,Construction,Unemployment,...,VENDOR,BRAND,CLASSID,CITY,STATE,ZIP,COST,RETAIL,PROF_MARG,DISCOUNTED
0,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,5016699,TIMBERLA,913,TAMPA,FL,33607,38.0,79.0,0.481013,True
1,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,6713105,INTERNAT,203,TAMPA,FL,33607,2.0,3.0,0.666667,True
2,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,9520439,BALI/HEN,873,TAMPA,FL,33607,8.0,18.0,0.444444,False
3,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,5010255,ENZO ANG,214,TAMPA,FL,33607,28.5,60.0,0.475,True
4,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,5010255,ENZO ANG,214,TAMPA,FL,33607,28.5,60.0,0.475,True


In [5]:
# Creating Dataframe for ROI Analysis
# Only the identifier columns and revenue are kept, so they are selected up front
# instead of deep-copying the whole frame and discarding most of it afterwards.
original_keep_cols = ['index','STORE','SKU', 'year', 'quarter', 'revenue']
df_save = (
    df_orig[['STORE', 'SKU', 'year', 'quarter']]
    # revenue = average selling price x units sold, computed on df_orig's own index
    .assign(revenue=df_orig['averagesellingprice'] * df_orig['totalunitssold'])
    # reset_index exposes the row label as an 'index' column
    .reset_index()
    .loc[:, original_keep_cols]
)
df_save.head()

Unnamed: 0,index,STORE,SKU,year,quarter,revenue
0,0,102,387,2004,Q4,131.93
1,1,102,450,2005,Q3,10.08
2,2,102,788,2004,Q3,36.0
3,3,102,1634,2004,Q3,66.0
4,4,102,1634,2004,Q4,168.0


In [59]:
# Placeholder for the model output. NaN (float) instead of integer 0 so that the column
# already has the dtype of the predictions written into it later, and so that any row
# left unfilled shows up as missing rather than as a genuine prediction of 0.
df_save['pred_log_revenue'] = np.nan

In [6]:
# Working copy for feature engineering; df_orig itself is not modified by the cells below
df = df_orig.copy()

In [7]:
# Preview the working copy
df.head()

Unnamed: 0,STORE,COUNT_ITEMS,MenRatio,White,Asian,Income,Poverty,Professional,Construction,Unemployment,...,VENDOR,BRAND,CLASSID,CITY,STATE,ZIP,COST,RETAIL,PROF_MARG,DISCOUNTED
0,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,5016699,TIMBERLA,913,TAMPA,FL,33607,38.0,79.0,0.481013,True
1,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,6713105,INTERNAT,203,TAMPA,FL,33607,2.0,3.0,0.666667,True
2,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,9520439,BALI/HEN,873,TAMPA,FL,33607,8.0,18.0,0.444444,False
3,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,5010255,ENZO ANG,214,TAMPA,FL,33607,28.5,60.0,0.475,True
4,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,5010255,ENZO ANG,214,TAMPA,FL,33607,28.5,60.0,0.475,True


In [8]:
# Creating Dataframe for Prediction
# Identifier-like columns are cast to strings so they are handled as categories, not numbers
for id_col in ('STORE', 'DEPT', 'VENDOR'):
    df[id_col] = df[id_col].astype('str')

# Report the number of distinct values of every object-typed column
object_cols = [name for name in df.columns if df[name].dtype == 'O']
for col in object_cols:
    print(col, len(df[col].unique()))

STORE 325
quarter 4
DEPT 60
VENDOR 1360
BRAND 1133
CLASSID 756
CITY 264
STATE 29


In [9]:
# Loading top 20 brands used in model training for prediction
# The context manager closes the file handle, which a bare json.load(open(...)) leaves open.
with open(os.path.join(model_dir, top_k_brands_file)) as brands_file:
    top_k_brands = json.load(brands_file)
print(top_k_brands)

['CLINIQUE', 'CABERNET', 'POLO FAS', 'ROUNDTRE', 'LIZ CLAI', 'LANCOME', 'EMMA JAM', 'BROWN SH', 'CALVIN K', 'TOMMY HI', 'BYER CAL', 'NOBLE EX', 'LEVI STR', 'CHANEL I', 'KIDS HEA', 'DAX CORP', 'KORET OF', 'VANITY F', 'NOBILITY', 'WESTPOIN']


In [10]:
# Converting Revenue on Log Scale
# revenue = average selling price x units sold
df['revenue'] = df['averagesellingprice'] * df['totalunitssold']
# the +1 shift keeps a revenue of zero finite on the log scale
df['log_revenue'] = np.log(df['revenue'] + 1)
# The two source columns are not needed once revenue exists
df = df.drop(columns=['averagesellingprice', 'totalunitssold'])

In [11]:
# Preview the frame with the new revenue and log_revenue columns
df.head()

Unnamed: 0,STORE,COUNT_ITEMS,MenRatio,White,Asian,Income,Poverty,Professional,Construction,Unemployment,...,CLASSID,CITY,STATE,ZIP,COST,RETAIL,PROF_MARG,DISCOUNTED,revenue,log_revenue
0,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,913,TAMPA,FL,33607,38.0,79.0,0.481013,True,131.93,4.889823
1,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,203,TAMPA,FL,33607,2.0,3.0,0.666667,True,10.08,2.405142
2,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,873,TAMPA,FL,33607,8.0,18.0,0.444444,False,36.0,3.610918
3,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,214,TAMPA,FL,33607,28.5,60.0,0.475,True,66.0,4.204693
4,102,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,...,214,TAMPA,FL,33607,28.5,60.0,0.475,True,168.0,5.129899


In [12]:
# One Hot Encoding for Brand Names and Dropping the unnecessary columns not used for prediction
for brand in top_k_brands:
    # 1 where the row's BRAND equals this brand, 0 otherwise
    df[brand] = df['BRAND'].eq(brand).astype(int)

drop_cols = [
    'SKU', 'year', 'purchase_count', 'return_count',
    'VENDOR', 'CITY', 'BRAND', 'PROF_MARG', 'ZIP',
]

df = df.drop(columns=drop_cols)

In [13]:
# Converting Cost, retail and original price on log scale
for price_col in ('COST', 'RETAIL', 'averageoriginalprice'):
    # log(x + 1), the same shift used for revenue
    df[price_col] = np.log(df[price_col] + 1)

In [14]:
# Encoding one hot for categorical variables
# drop_first=True omits the first level of every variable. The dummy columns produced depend
# on the categories present in this data, so they can differ from the columns used in training.
cat_variables = ['STORE', 'quarter', 'DEPT', 'CLASSID', 'STATE']
df_final = pd.get_dummies(df, columns=cat_variables, drop_first=True)
df_final.head()

Unnamed: 0,COUNT_ITEMS,MenRatio,White,Asian,Income,Poverty,Professional,Construction,Unemployment,averageoriginalprice,...,STATE_NM,STATE_NV,STATE_OH,STATE_OK,STATE_SC,STATE_TN,STATE_TX,STATE_UT,STATE_VA,STATE_WY
0,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,4.382027,...,False,False,False,False,False,False,False,False,False,False
1,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,1.94591,...,False,False,False,False,False,False,False,False,False,False
2,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,2.944439,...,False,False,False,False,False,False,False,False,False,False
3,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,4.110874,...,False,False,False,False,False,False,False,False,False,False
4,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,4.110874,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Inspect the encoded column set
df_final.columns

Index(['COUNT_ITEMS', 'MenRatio', 'White', 'Asian', 'Income', 'Poverty',
       'Professional', 'Construction', 'Unemployment', 'averageoriginalprice',
       ...
       'STATE_NM', 'STATE_NV', 'STATE_OH', 'STATE_OK', 'STATE_SC', 'STATE_TN',
       'STATE_TX', 'STATE_UT', 'STATE_VA', 'STATE_WY'],
      dtype='object', length=1205)

In [16]:
# Column list saved alongside the model (presumably the training-time layout — confirm).
# The context manager closes the file handle, which a bare json.load(open(...)) leaves open.
with open(os.path.join(model_dir, 'columns.json')) as columns_file:
    train_columns = json.load(columns_file)

# Print every saved column that the freshly encoded frame does not have
for col in train_columns:
    if col not in df_final.columns:
        print(col)

CLASSID_717
CLASSID_864


In [17]:
# Add the dummy columns that exist in train_columns but not in this data, instead of
# hard-coding their names: a category absent from the data simply means "never set".
dummy_prefixes = tuple(f'{name}_' for name in cat_variables)
# 'revenue' is listed in train_columns but is not a feature, so it is never back-filled
missing_cols = [
    col for col in train_columns
    if col not in df_final.columns and col != 'revenue'
]

# A missing column that is not a one-hot dummy cannot be filled with False safely
unexpected_cols = [col for col in missing_cols if not col.startswith(dummy_prefixes)]
if unexpected_cols:
    raise ValueError(f"Non-dummy training columns missing from df_final: {unexpected_cols}")

for col in missing_cols:
    df_final[col] = False

In [18]:
# Remove the target columns so that only model inputs remain
df_final = df_final.drop(columns=['log_revenue', 'revenue'])

In [19]:
# Preview the feature matrix after the target columns were removed
df_final.head()

Unnamed: 0,COUNT_ITEMS,MenRatio,White,Asian,Income,Poverty,Professional,Construction,Unemployment,averageoriginalprice,...,STATE_OH,STATE_OK,STATE_SC,STATE_TN,STATE_TX,STATE_UT,STATE_VA,STATE_WY,CLASSID_717,CLASSID_864
0,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,4.382027,...,False,False,False,False,False,False,False,False,False,False
1,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,1.94591,...,False,False,False,False,False,False,False,False,False,False
2,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,2.944439,...,False,False,False,False,False,False,False,False,False,False
3,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,4.110874,...,False,False,False,False,False,False,False,False,False,False
4,96312,0.514077,68.120896,1.543284,47144.328358,17.150746,31.208955,12.074627,7.765672,4.110874,...,False,False,False,False,False,False,False,False,False,False


In [20]:
# Release the raw frame; df_save and df were both built from copies of it
del df_orig

In [22]:
# Release the intermediate frame; df_final holds the encoded features
del df

In [55]:
# 'revenue' is not a model input and df_final no longer contains it. Filtering instead of
# list.remove keeps this cell re-runnable: list.remove raises ValueError once the entry is gone.
train_columns = [col for col in train_columns if col != 'revenue']
# Select and order the columns exactly as listed in train_columns
df_final = df_final[train_columns]

## Model Prediction

In [67]:
# Batch Prediction for Xgboost, since the system will run out of memory for 6million row data
batch_size = 50000
total_size = df_final.shape[0]

# Predictions are collected in one float array and attached to df_save in a single step,
# avoiding per-batch list conversions and float writes into a non-float column.
pred_log_revenue = np.empty(total_size, dtype='float64')

for start_idx in range(0, total_size, batch_size):
    print(f"Start Index : {start_idx}")
    stop_idx = min(start_idx + batch_size, total_size)
    X_test = df_final.iloc[start_idx:stop_idx]
    y_pred = model.predict(X_test)
    pred_log_revenue[start_idx:stop_idx] = y_pred

# Align on df_final's row labels, the same labels the per-batch .loc writes targeted
df_save['pred_log_revenue'] = pd.Series(pred_log_revenue, index=df_final.index)

# Write once after all batches: saving inside the loop rewrote the complete CSV per batch
df_save.to_csv(os.path.join(model_dir, 'df_roi_pred.csv'), index=False)

Start Index : 0
Start Index : 50000
Start Index : 100000
Start Index : 150000
Start Index : 200000
Start Index : 250000
Start Index : 300000
Start Index : 350000
Start Index : 400000
Start Index : 450000
Start Index : 500000
Start Index : 550000
Start Index : 600000
Start Index : 650000
Start Index : 700000
Start Index : 750000
Start Index : 800000
Start Index : 850000
Start Index : 900000
Start Index : 950000
Start Index : 1000000
Start Index : 1050000
Start Index : 1100000
Start Index : 1150000
Start Index : 1200000
Start Index : 1250000
Start Index : 1300000
Start Index : 1350000
Start Index : 1400000
Start Index : 1450000
Start Index : 1500000
Start Index : 1550000
Start Index : 1600000
Start Index : 1650000
Start Index : 1700000
Start Index : 1750000
Start Index : 1800000
Start Index : 1850000
Start Index : 1900000
Start Index : 1950000
Start Index : 2000000
Start Index : 2050000
Start Index : 2100000
Start Index : 2150000
Start Index : 2200000
Start Index : 2250000
Start Index : 

In [69]:
# Save the ROI frame, including pred_log_revenue, to the models directory
df_save.to_csv(os.path.join(model_dir, 'df_roi_pred.csv'), index=False)

In [70]:
# Row and column count of the ROI frame
df_save.shape

(6105994, 7)

In [72]:
# Undo the log(x + 1) transform to express the prediction in revenue units
predicted_log = df_save['pred_log_revenue']
df_save['pred_revenue'] = np.exp(predicted_log) - 1
df_save.head()

Unnamed: 0,index,STORE,SKU,year,quarter,revenue,pred_log_revenue,pred_revenue
0,0,102,387,2004,Q4,131.93,5.3031,199.95884
1,1,102,450,2005,Q3,10.08,2.225951,8.262289
2,2,102,788,2004,Q3,36.0,3.946944,50.776907
3,3,102,1634,2004,Q3,66.0,4.895599,132.700118
4,4,102,1634,2004,Q4,168.0,5.033957,152.539441


## Baseline Calculation

- Logic Used
    - The baseline for a SKU at a particular store is the revenue of that SKU at the same store in the previous quarter; it is left empty when no previous-quarter record exists

In [82]:
def previous_quarter(quarter, year):
    """Return the (quarter label, year) pair immediately preceding the given period.

    Args:
        quarter: Quarter label whose last character is the quarter digit, e.g. 'Q3'.
        year: Year the quarter belongs to.

    Returns:
        A tuple ``(label, year)``: 'Q4' of the prior year when the input is a first
        quarter, otherwise the preceding quarter of the same year.
    """
    quarter_number = int(quarter[-1])
    rolls_back_a_year = quarter_number == 1
    label = 'Q4' if rolls_back_a_year else f'Q{quarter_number - 1}'
    return label, (year - 1 if rolls_back_a_year else year)

In [90]:
# Resolve the preceding (quarter, year) once per distinct period instead of once per row:
# a row-wise DataFrame.apply over the whole frame is very slow, and there are far fewer
# distinct periods than rows.
periods = df_save[['quarter', 'year']].drop_duplicates()
period_lookup = pd.DataFrame(
    [
        (quarter, year, *previous_quarter(quarter, year))
        for quarter, year in zip(periods['quarter'], periods['year'])
    ],
    columns=['quarter', 'year', 'prev_quarter', 'prev_year'],
)
df_save = df_save.merge(period_lookup, how='left', on=['quarter', 'year'])

# Self-join: every (STORE, SKU, quarter, year) row is a candidate "previous period" record
prev_revenue_df = df_save[['STORE', 'SKU', 'quarter', 'year', 'revenue']].rename(
    columns={'quarter': 'prev_quarter', 'year': 'prev_year', 'revenue': 'prev_revenue'}
)
df_save = df_save.merge(
    prev_revenue_df,
    how='left',
    on=['STORE', 'SKU', 'prev_quarter', 'prev_year'],
    # raise instead of silently duplicating rows if a store/SKU/period key repeats
    validate='many_to_one',
)

df_save = df_save.drop(columns=['prev_quarter', 'prev_year'])
# Rows in Q1 2004 never get a baseline, even if a matching earlier record was found
df_save.loc[(df_save['quarter'] == 'Q1') & (df_save['year'] == 2004), 'prev_revenue'] = np.nan
df_save = df_save.rename(columns={'prev_revenue': 'base_line'})


In [91]:
# Preview the ROI frame with the new base_line column
df_save.head()

Unnamed: 0,index,STORE,SKU,year,quarter,revenue,pred_log_revenue,pred_revenue,base_line
0,0,102,387,2004,Q4,131.93,5.3031,199.95884,
1,1,102,450,2005,Q3,10.08,2.225951,8.262289,
2,2,102,788,2004,Q3,36.0,3.946944,50.776907,
3,3,102,1634,2004,Q3,66.0,4.895599,132.700118,
4,4,102,1634,2004,Q4,168.0,5.033957,152.539441,66.0


In [93]:
# Rows for which a previous-quarter baseline was found
df_save[df_save['base_line'].notna()]

Unnamed: 0,index,STORE,SKU,year,quarter,revenue,pred_log_revenue,pred_revenue,base_line
4,4,102,1634,2004,Q4,168.00,5.033957,152.539441,66.00
25,25,102,7915,2004,Q4,97.50,5.462652,234.721779,39.00
26,26,102,7915,2005,Q1,156.00,5.034359,152.601099,97.50
27,27,102,7915,2005,Q2,156.00,5.239471,187.570404,156.00
31,31,102,8224,2005,Q3,32.00,3.787999,43.167928,32.00
...,...,...,...,...,...,...,...,...,...
6105960,6105960,9909,5168252,2005,Q3,20.00,3.301930,26.165028,20.00
6105962,6105962,9909,5168362,2005,Q3,22.91,3.308795,26.352146,11.69
6105966,6105966,9909,5170159,2005,Q3,58.00,4.329112,74.876882,58.00
6105969,6105969,9909,5170885,2005,Q1,198.00,4.271519,70.630373,176.00


In [94]:
# Spot check: all rows of one store/SKU pair
store_match = df_save['STORE'] == 9909
sku_match = df_save['SKU'] == 5179773
df_save[store_match & sku_match]

Unnamed: 0,index,STORE,SKU,year,quarter,revenue,pred_log_revenue,pred_revenue,base_line
6105991,6105991,9909,5179773,2005,Q2,57.99,3.731722,40.750926,
6105992,6105992,9909,5179773,2005,Q3,19.0,3.434469,30.014931,57.99


In [95]:
# Save the ROI frame with predictions and base_line to the models directory
df_save.to_csv(os.path.join(model_dir,'final_roi_analysis.csv'), index=False)

In [99]:
# Reload the dataset and the saved ROI table from disk.
# pandas and os come from the import cell at the top of the notebook; the duplicate
# mid-notebook import was dropped because this cell already needs that cell for os.
data_dir = os.path.join(os.path.dirname(os.getcwd()),'data')
model_dir = os.path.join(os.path.dirname(os.getcwd()),'models')
file_name = 'df_final_sku_store_quarter_v2.csv'

df = pd.read_csv(os.path.join(data_dir, file_name))
df_roi = pd.read_csv(os.path.join(model_dir, 'final_roi_analysis.csv'))

In [101]:
# List the columns available in the reloaded dataset
df.columns

Index(['STORE', 'COUNT_ITEMS', 'MenRatio', 'White', 'Asian', 'Income',
       'Poverty', 'Professional', 'Construction', 'Unemployment', 'SKU',
       'year', 'quarter', 'totalunitssold', 'averageoriginalprice',
       'averagesellingprice', 'purchase_count', 'return_count', 'DEPT',
       'PACKSIZE', 'VENDOR', 'BRAND', 'CLASSID', 'CITY', 'STATE', 'ZIP',
       'COST', 'RETAIL', 'PROF_MARG', 'DISCOUNTED'],
      dtype='object')

In [102]:
# Keep the merge keys plus the unit and price columns to attach to the ROI table
df = df[['STORE','SKU','totalunitssold', 'quarter', 'year', 'averageoriginalprice','averagesellingprice']]

In [104]:
# Attach units and prices to each ROI row; the two shapes are shown side by side
# so the row counts before and after the join can be compared
merge_keys = ['STORE', 'SKU', 'quarter', 'year']
df_final = df_roi.merge(df, how='inner', on=merge_keys)
df_final.shape, df_roi.shape

((6105994, 12), (6105994, 9))

In [106]:
# Save the merged ROI table to the models directory
df_final.to_csv(os.path.join(model_dir, 'final_roi_analysis_v2.csv'), index=False)