In [1]:
# Run data_loading.py as a separate process through the IPython shell escape.
# Whatever it produces is defined in that script, not in this notebook.
!python data_loading.py

In [2]:
# Run model.py as a script in a separate process. The same module is also
# imported further down for load_df / preprocess_df / impute_missing.
!python model.py

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given), so deprecation and pandas chained-assignment warnings
# raised by later cells are not shown.
warnings.filterwarnings('ignore')

In [5]:
from model import load_df, preprocess_df, impute_missing

In [6]:
# Load the modelling frame through the project helper in model.py.
# NOTE(review): model_t_plus=1 presumably means "predict one period ahead"
# (the target used later is 'Revenue Sum + 1') — confirm in load_df.
all_data_df = load_df(model_t_plus=1)

In [7]:
# Replace the loaded frame with the output of impute_missing.
# The same name is rebound, so re-running only this cell feeds the already
# processed frame back into the helper.
all_data_df = impute_missing(all_data_df)

In [8]:
# preprocess_df returns three values: the frame used for training below, the
# rows to forecast (to_pred) and a third object bound to gc_time.
all_data_df, to_pred, gc_time = preprocess_df(
    all_data_df,
    months_to_predict=['6.0_2022.0'],
    model_t_plus=1,
)

In [9]:
# Growth cells whose forecasts are shown in the final table.
scope_gcs = [
    "ESV - PLS",
    "ESV - BCM",
    "DDV - IT",
    "FS - Banking and Capital Markets",
    "DDV - Operations",
    "Finance - Enterprise Performance Management",
]

## Models

In [10]:
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestRegressor

#model = RandomForestRegressor(n_estimators=125, max_depth = 3)

# Baseline gradient-boosted regressor: 50 trees of depth 4, histogram tree method.
model_xgb = XGBRegressor(
    n_estimators=50,
    max_depth=4,
    tree_method="hist",
    enable_categorical=True,
)
#model_xgb = XGBRegressor(n_estimators = 50, max_depth = 4)

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

In [11]:
# Hand-picked feature columns for the baseline model.
keep_columns = [
    'Amount added in pipe',
    'Entries added in pipe',
    'Unique accounts added in pipe',
    'Amount won',
    'Projects won',
    'Unique accounts won',
    'EM% mean',
    'business_days',
    'Fracn of team as As',
    'Fracn of teams as Ms',
    'As underutilized by',
    'Ds underutilized by',
    'log of team size',
]


In [12]:
# Inspect the columns available after preprocessing.
all_data_df.columns

Index(['Unique accounts added in pipe', 'Amount added in pipe',
       'Entries added in pipe', 'Unique accounts won', 'Amount won',
       'Projects won', 'Senior Associate_Util', 'Associate_Util',
       'Director_Util', 'Senior Manager_Util', 'Manager_Util',
       'Num Ds Above Target', 'Num As Above Target', 'Num SMs above target',
       'Num SAs above target', 'Num Ms above target', 'Num Ms below target',
       'Num As Below Target', 'Num Ds Below Target', 'Num SAs below target',
       'Num SMs below target', 'Ds overutilized by', 'SMs overutilized by',
       'SAs overutilized by', 'As overutilized by', 'Ms overutilized by',
       'As underutilized by', 'Ms underutilized by', 'SMs underutilized by',
       'Ds underutilized by', 'SAs underutilized by', 'Fracn of teams as SMs',
       'Fracn of team as As', 'Fracn of team as SAs', 'Fracn of teams as Ms',
       'Fracn of teams as Ds', 'Revenue Sum', 'EM% mean', 'Revenue Sum + 1',
       'EM% mean + 1', 'Growth Cell', 'busines

In [13]:
# Regression target.
# NOTE(review): the name y_log and the later `exp(pred) - 1` inversion suggest
# this column already holds log(1 + revenue) — confirm in preprocess_df.
y_log = all_data_df['Revenue Sum + 1']

In [14]:
# Baseline design matrix: only the hand-picked columns in keep_columns.
X = all_data_df[keep_columns]

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.30,
    random_state=42,
)

In [16]:
# Fit the baseline regressor on the training split; the fitted estimator is
# the cell's displayed value.
model_xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=True,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [17]:
# Score on the training split (R^2 for scikit-learn style regressors).
model_xgb.score(X_train, y_train)

0.984259880857257

In [18]:
# Score on the held-out split; compare with the training score above to
# gauge how much the baseline overfits.
model_xgb.score(X_test, y_test)

0.8401189093160855

## Including Growth Cell as One Hot Encoding

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
# One-hot encoder for the 'Growth Cell' label, returning a dense array.
# Categories not seen during fit are encoded as all zeros (handle_unknown='ignore').
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and later removed the
# old keyword, so try the new name first and fall back for older installs.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
enc.fit(all_data_df[['Growth Cell']])

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [21]:
# Preview the one-hot encoded 'Growth Cell' columns, aligned on the original
# index. The same expression is used below to build `data`.
pd.DataFrame(enc.transform(all_data_df[['Growth Cell']]), columns=enc.get_feature_names_out(),index=all_data_df.index)

Unnamed: 0,Growth Cell_Cannot Find,Growth Cell_Customer - Analytics,Growth Cell_Customer - Customer Strategy & Experience,Growth Cell_Customer - Marketing,Growth Cell_Customer - Pricing & Profitability,Growth Cell_Customer - Sales,Growth Cell_Customer - Service,Growth Cell_DDV - Finance,Growth Cell_DDV - GTM,"Growth Cell_DDV - ISD (Integration, Separation, Divestitures)",...,Growth Cell_Health - R&D Future of Clinical Trials,Growth Cell_Ops - Capital Project Excellence (CPE),Growth Cell_Ops - Connected Supply Chain (CSC),Growth Cell_Ops - Org Strategy Strategy & Solutions (OSS),Growth Cell_Ops - Product Development & Manufacturing (PDM),Growth Cell_Ops - Sourcing & Procurement (S&P),Growth Cell_Workforce - Deals,Growth Cell_Workforce - HRT,Growth Cell_Workforce - Rewards & Well Being,"Growth Cell_Workforce - Talent, Change & Behaviors"
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Swap the 'Growth Cell' label column for its one-hot encoded columns.
_growth_cell_onehot = pd.DataFrame(
    enc.transform(all_data_df[['Growth Cell']]),
    columns=enc.get_feature_names_out(),
    index=all_data_df.index,
)
data = pd.concat(
    [all_data_df.drop("Growth Cell", axis=1), _growth_cell_onehot],
    axis=1,
)

In [23]:
from sklearn.feature_selection import SequentialFeatureSelector
from joblib import dump, load

In [24]:
# Candidate features: every column of the one-hot frame except the target.
# NOTE(review): other '... + 1' columns (e.g. 'EM% mean + 1') stay in the
# candidate list. If '+ 1' means "next period", as it appears to for the
# target, they may not be known at prediction time — confirm.
c=list(data.columns)
c.remove('Revenue Sum + 1')

In [25]:
# Fit a forward and a backward SequentialFeatureSelector for every subset size
# 1 .. len(c) - 1 on the one-hot ("growth") feature set and cache each fitted
# selector on disk, so a re-run only fits what is missing.
sfs_dir = 'features_and_models_growth/'
os.makedirs(sfs_dir, exist_ok=True)  # dump() does not create the folder itself
for i in range(1, len(c)):
    for direction in ('forward', 'backward'):
        sfs_path = sfs_dir + str(i) + '_sfs_' + direction + '.joblib'
        # Check each file on its own: an interrupted run can leave the forward
        # selector saved without its backward twin, and the scoring cell
        # loads both.
        if os.path.exists(sfs_path):
            continue
        xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True,random_state=42,n_jobs=-1)
        sfs = SequentialFeatureSelector(xgb,n_features_to_select=i,direction=direction,n_jobs=-1)
        sfs.fit(data[c], data['Revenue Sum + 1'])
        dump(sfs, sfs_path)

In [26]:
# Accumulates one entry per fitted model:
# [feature names, train score, test score, growth tag, direction tag].
# The scoring cells only append, so re-run this cell before re-running them
# to avoid duplicate entries.
stats = []

In [27]:
def get_feats(sfs, columns=None):
    """Return the names of the features kept by a fitted selector.

    Parameters
    ----------
    sfs : object
        Fitted selector exposing ``get_support()``, which yields one boolean
        per candidate column.
    columns : sequence of str, optional
        Column names in the order the selector was fitted on. When omitted,
        the notebook-level list ``c`` is used, which is the original behaviour.

    Returns
    -------
    list of str
        The entries of ``columns`` whose support flag is true, in order.

    Raises
    ------
    ValueError
        If the support mask and ``columns`` differ in length, which means the
        column list does not belong to this selector (for example ``c`` was
        rebuilt for a different feature set).
    """
    if columns is None:
        columns = c
    mask = sfs.get_support()
    if len(mask) != len(columns):
        raise ValueError(
            "selector support has %d entries but %d column names were given"
            % (len(mask), len(columns))
        )
    return [name for name, keep in zip(columns, mask) if keep]

In [28]:
# For every subset size and selection direction on the growth feature set:
# load the cached selector, reduce the frame to its chosen columns, then load
# the matching XGBoost model (or fit and save it) and record its scores.
for i in range(1, len(c)):
    for direction in ('_forward', '_backward'):
        prefix = 'features_and_models_growth/' + str(i)
        sfs = load(prefix + '_sfs' + direction + '.joblib')
        X = sfs.transform(data[c])
        X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.30, random_state=42)
        xgb_path = prefix + '_xgb' + direction + '.joblib'
        if not os.path.exists(xgb_path):
            xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True,n_jobs=-1)
            xgb.fit(X_train, y_train)
            dump(xgb, xgb_path)
        else:
            xgb = load(xgb_path)
        train_score = xgb.score(X_train, y_train)
        test_score = xgb.score(X_test, y_test)
        stats.append([get_feats(sfs), train_score, test_score, '_growth', direction])

## Excluding Growth Cell

In [29]:
# Candidate features for the run without growth-cell information: every
# column of all_data_df except the target and the 'Growth Cell' label.
# This rebinds the global `c` that get_feats reads, so from here on it no
# longer matches the selectors fitted on the one-hot frame.
c=list(all_data_df.columns)
c.remove('Revenue Sum + 1')
c.remove('Growth Cell')

In [30]:
# Fit a forward and a backward SequentialFeatureSelector for every subset size
# 1 .. len(c) - 1 on the feature set without 'Growth Cell' and cache each
# fitted selector on disk, so a re-run only fits what is missing.
sfs_dir = 'features_and_models/'
os.makedirs(sfs_dir, exist_ok=True)  # dump() does not create the folder itself
for i in range(1, len(c)):
    for direction in ('forward', 'backward'):
        sfs_path = sfs_dir + str(i) + '_sfs_' + direction + '.joblib'
        # Check each file on its own: an interrupted run can leave the forward
        # selector saved without its backward twin, and the scoring cell
        # loads both.
        if os.path.exists(sfs_path):
            continue
        xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True,random_state=42,n_jobs=-1)
        sfs = SequentialFeatureSelector(xgb,n_features_to_select=i,direction=direction,n_jobs=-1)
        sfs.fit(all_data_df[c], all_data_df['Revenue Sum + 1'])
        dump(sfs, sfs_path)

In [31]:
# For every subset size and selection direction on the feature set without
# 'Growth Cell': load the cached selector, reduce the frame to its chosen
# columns, then load the matching XGBoost model (or fit and save it) and
# record its scores.
for i in range(1, len(c)):
    for direction in ('_forward', '_backward'):
        prefix = 'features_and_models/' + str(i)
        sfs = load(prefix + '_sfs' + direction + '.joblib')
        X = sfs.transform(all_data_df[c])
        X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.30, random_state=42)
        xgb_path = prefix + '_xgb' + direction + '.joblib'
        if not os.path.exists(xgb_path):
            xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True,n_jobs=-1)
            xgb.fit(X_train, y_train)
            dump(xgb, xgb_path)
        else:
            xgb = load(xgb_path)
        train_score = xgb.score(X_train, y_train)
        test_score = xgb.score(X_test, y_test)
        stats.append([get_feats(sfs), train_score, test_score, '', direction])

## Finding the model that gives the best performance

In [32]:
# Rank all recorded models by their held-out score (index 2), best first.
# The winner is picked using that same held-out split, so its reported test
# score is an optimistic estimate of performance on new data.
stats.sort(key=lambda x: x[2], reverse=True)

In [33]:
# Best entry: [feature names, train score, test score, growth tag, direction tag].
stats[0]

[['Revenue Sum', 'EM% mean + 1'],
 0.9875885304750589,
 0.9554403034948116,
 '_growth',
 '_backward']

In [34]:
# Reload the top-ranked model from disk and re-check its scores on the same
# 70/30 split used during the search.
best_feats, _best_train, _best_test, growth_tag, direction_tag = stats[0]
final_features = list(best_feats)
# The growth tag says which frame the winning features come from.
source_frame = data if growth_tag == '_growth' else all_data_df
X = source_frame[final_features]
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.30, random_state=42)
# Saved models are keyed by subset size and direction inside the tagged folder.
final_model_path = 'features_and_models' + growth_tag + '/' + str(len(best_feats)) + '_xgb' + direction_tag + '.joblib'
final_model = load(final_model_path)
print(final_model.score(X_train, y_train))
print(final_model.score(X_test, y_test))

0.9875885304750589
0.9554403034948116


In [35]:
# Rebind keep_columns (the hand-picked baseline list defined earlier) to the
# features of the selected model; the prediction cells below read this name.
keep_columns = final_features

# Predict for Future

In [36]:
# Keep the identifying columns of the rows to forecast. The explicit copy
# makes this an independent frame, so adding the prediction column later
# does not write into a selection of to_pred.
to_pred_gc_time = to_pred[['Growth Cell', 'Time']].copy()

In [38]:
def get_features_with_one_hot(df):
    """Return ``df`` with 'Growth Cell' replaced by its one-hot columns.

    Uses the notebook-level fitted encoder ``enc``. The new columns are named
    by ``enc.get_feature_names_out()`` and share ``df``'s index.
    """
    remaining = df.drop("Growth Cell", axis=1)
    encoded = enc.transform(df[['Growth Cell']])
    one_hot = pd.DataFrame(encoded, columns=enc.get_feature_names_out(), index=df.index)
    return pd.concat([remaining, one_hot], axis=1)

In [39]:
# Build the prediction matrix: one-hot encode 'Growth Cell' on the rows to
# forecast, then keep only the columns the selected model was trained on.
#X_copy = to_pred_copy.drop(['Identifier', 'Growth Cell', 'Time'], axis = 1)
X_copy = get_features_with_one_hot(to_pred)[keep_columns]

In [40]:
# Map the model output back to revenue. The original `exp(x) - 1` shows the
# target was modelled as log(1 + revenue); expm1 computes the same inverse in
# one numerically stable step.
pred_rev = np.expm1(final_model.predict(X_copy))

In [41]:
# Attach the predicted revenue next to the Growth Cell / Time identifiers.
to_pred_gc_time['Revenue Sum'] = pred_rev

In [42]:
# Show forecasts only for the growth cells listed in scope_gcs
# (vectorised membership test instead of a per-row lambda).
to_pred_gc_time[to_pred_gc_time['Growth Cell'].isin(scope_gcs)]

Unnamed: 0,Growth Cell,Time,Revenue Sum
130,DDV - IT,6.0_2022.0,649231.875
142,DDV - Operations,6.0_2022.0,100759.445312
166,ESV - BCM,6.0_2022.0,147384.5625
238,ESV - PLS,6.0_2022.0,53616.476562
274,FS - Banking and Capital Markets,6.0_2022.0,942099.3125
310,Finance - Enterprise Performance Management,6.0_2022.0,209491.328125
