In [24]:
import numpy as np
import os

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from model import load_df, preprocess_df, impute_missing

In [4]:
# Load the modelling frame through the project helper.
# NOTE(review): `model_t_plus=1` is presumably the forecast horizon (predict t+1) — confirm in model.py.
all_data_df = load_df(model_t_plus=1)

In [5]:
# Fill missing values with the project helper.
# This rebinds `all_data_df`, so re-running the cell feeds the already-imputed frame back in.
all_data_df = impute_missing(all_data_df)

In [6]:
# Run the project preprocessing step. It returns the frame used for training below,
# the rows that are forecast at the end of the notebook (`to_pred`), and `gc_time`.
all_data_df, to_pred, gc_time = preprocess_df(
    all_data_df,
    months_to_predict=['6.0_2022.0'],
    model_t_plus=1,
)

In [7]:
# Growth cells that are shown in the final forecast table.
scope_gcs = [
    "ESV - PLS",
    "ESV - BCM",
    "DDV - IT",
    "FS - Banking and Capital Markets",
    "DDV - Operations",
    "Finance - Enterprise Performance Management",
]

## Models

In [8]:
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestRegressor

# Baseline gradient-boosted regressor used for the first fit below.
# Alternatives that were left commented out here:
#   - RandomForestRegressor(n_estimators=125, max_depth=3)
#   - XGBRegressor(n_estimators=50, max_depth=4) without tree_method / enable_categorical
model_xgb = XGBRegressor(
    n_estimators=50,
    max_depth=4,
    tree_method="hist",
    enable_categorical=True,
)

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

In [9]:
# Hand-picked feature columns for the baseline model.
keep_columns = [
    # pipeline activity
    'Amount added in pipe',
    'Entries added in pipe',
    'Unique accounts added in pipe',
    # wins
    'Amount won',
    'Projects won',
    'Unique accounts won',
    # margin and calendar
    'EM% mean',
    'business_days',
    # team composition and utilisation
    'Fracn of team as As',
    'Fracn of teams as Ms',
    'As underutilized by',
    'Ds underutilized by',
    'log of team size',
]


In [10]:
# Display the available column names so they can be checked against `keep_columns`.
all_data_df.columns

Index(['Amount added in pipe', 'Unique accounts added in pipe',
       'Entries added in pipe', 'Amount won', 'Unique accounts won',
       'Projects won', 'Senior Manager_Util', 'Manager_Util', 'Associate_Util',
       'Senior Associate_Util', 'Director_Util', 'Num Ms above target',
       'Num SAs above target', 'Num Ds Above Target', 'Num SMs above target',
       'Num As Above Target', 'Num As Below Target', 'Num SAs below target',
       'Num Ds Below Target', 'Num SMs below target', 'Num Ms below target',
       'SAs overutilized by', 'SMs overutilized by', 'As overutilized by',
       'Ds overutilized by', 'Ms overutilized by', 'SAs underutilized by',
       'SMs underutilized by', 'Ds underutilized by', 'Ms underutilized by',
       'As underutilized by', 'Fracn of team as As', 'Fracn of team as SAs',
       'Fracn of teams as SMs', 'Fracn of teams as Ds', 'Fracn of teams as Ms',
       'Revenue Sum', 'EM% mean', 'Revenue Sum + 1', 'EM% mean + 1',
       'Growth Cell', 'busines

In [11]:
# Regression target: the 'Revenue Sum + 1' column.
# NOTE(review): the name `y_log` and the exp(...) - 1 back-transform applied to predictions
# later suggest this column is already on a log(1 + x) scale — confirm in preprocess_df.
y_log = all_data_df['Revenue Sum + 1']

In [12]:
# Baseline design matrix restricted to the hand-picked `keep_columns`.
X = all_data_df[keep_columns]

In [13]:
# 70/30 hold-out split; the fixed seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.30,
    random_state=42,
)

In [14]:
# Fit the baseline model on the training split (the cell output is the estimator repr).
model_xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=True,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [15]:
# R^2 on the training split (regressor `.score` returns the coefficient of determination).
model_xgb.score(X_train, y_train)

0.984259880857257

In [16]:
# R^2 on the held-out split; compare with the training score above to gauge overfitting.
model_xgb.score(X_test, y_test)

0.8401189093160855

In [17]:
from sklearn.feature_selection import SequentialFeatureSelector

In [18]:
# Candidate feature columns offered to the sequential feature selector.
# The target 'Revenue Sum + 1' and the 'Growth Cell' label are not part of this list.
# NOTE(review): 'EM% mean + 1' carries the same "+ 1" suffix as the target; if it is a
# next-period value it may leak future information — confirm in preprocess_df.
c = [
    # pipeline activity
    'Entries added in pipe',
    'Unique accounts added in pipe',
    'Amount added in pipe',
    # wins
    'Projects won',
    'Unique accounts won',
    'Amount won',
    # utilisation by grade
    'Senior Associate_Util',
    'Manager_Util',
    'Associate_Util',
    'Senior Manager_Util',
    'Director_Util',
    # head counts above target
    'Num SMs above target',
    'Num Ds Above Target',
    'Num As Above Target',
    'Num Ms above target',
    'Num SAs above target',
    # head counts below target
    'Num As Below Target',
    'Num Ms below target',
    'Num SAs below target',
    'Num Ds Below Target',
    'Num SMs below target',
    # over-utilisation amounts
    'SAs overutilized by',
    'As overutilized by',
    'Ms overutilized by',
    'SMs overutilized by',
    'Ds overutilized by',
    # under-utilisation amounts
    'Ds underutilized by',
    'SAs underutilized by',
    'As underutilized by',
    'Ms underutilized by',
    'SMs underutilized by',
    # team composition
    'Fracn of team as SAs',
    'Fracn of teams as Ms',
    'Fracn of team as As',
    'Fracn of teams as SMs',
    'Fracn of teams as Ds',
    # revenue, margin, calendar, size
    'Revenue Sum',
    'EM% mean',
    'EM% mean + 1',
    'business_days',
    'log of team size',
]

In [19]:
from joblib import dump, load

In [None]:
# Fit one sequential feature selector per subset size and persist each one to disk.
#
# The subset sizes are derived from `c`, the columns actually passed to `fit`.
# SequentialFeatureSelector requires n_features_to_select < number of input features,
# and `all_data_df` has more columns than `c` (target, 'Growth Cell', ...), so bounding
# the loop by len(all_data_df.columns) would raise after all valid sizes had been fitted.
os.makedirs('features_and_models', exist_ok=True)  # dump() does not create the directory
for i in range(1, len(c)):
    xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True)
    sfs = SequentialFeatureSelector(xgb, n_features_to_select=i)
    # NOTE(review): the selector sees every row, including those later used as the hold-out set.
    sfs.fit(all_data_df[c], all_data_df['Revenue Sum + 1'])
    dump(sfs, 'features_and_models/' + str(i) + '_sfs.joblib')

In [31]:
# Accumulator for the evaluation loop: one [feature_names, train_r2, test_r2] entry per subset size.
stats = []

In [32]:
# For every persisted selector: reduce the candidate matrix to the selected columns,
# train a fresh XGBoost model on a 70/30 split, persist it, and record its scores.
#
# `stats` is reset here so that re-running the cell does not append duplicate rows,
# and the range is tied to `c` (the selectors exist for sizes 1 .. len(c) - 1)
# instead of a hard-coded upper bound.
# NOTE(review): the selectors were fitted on all rows, so the hold-out score recorded
# below is optimistic with respect to feature selection.
stats = []
for i in range(1, len(c)):
    sfs = load('features_and_models/' + str(i) + '_sfs.joblib')
    X = sfs.transform(all_data_df[c])
    X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.30, random_state=42)
    xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True)
    xgb.fit(X_train, y_train)
    dump(xgb, 'features_and_models/' + str(i) + '_xgb.joblib')
    # entry layout: [selected feature names, train R^2, hold-out R^2]
    stats.append([sfs.get_feature_names_out(), xgb.score(X_train, y_train), xgb.score(X_test, y_test)])

In [34]:
# Rank the subsets in place by hold-out R^2 (index 2 of each entry), best first.
# NOTE(review): picking the winner by hold-out score means that score is no longer an
# unbiased estimate of generalisation.
stats.sort(key=lambda x: x[2], reverse=True)

In [55]:
# Best-ranked entry: [selected feature names, train R^2, hold-out R^2].
stats[0]

[array(['Amount won', 'Associate_Util', 'Num SMs above target',
        'Num Ds Above Target', 'Num As Above Target',
        'Num Ms above target', 'Num SAs above target',
        'Num As Below Target', 'Num SAs below target',
        'Num Ds Below Target', 'As overutilized by',
        'SMs underutilized by', 'Fracn of team as As', 'Revenue Sum',
        'EM% mean + 1'], dtype=object),
 0.9933924364031038,
 0.9196920107164297]

In [42]:
# Variant experiment: the best-ranked feature subset plus the 'Growth Cell' column.
#
# The subset is taken from stats[0][0] rather than a hand-copied literal, so it cannot
# drift from the selection results if the selection is re-run.
# NOTE(review): the recorded output for this cell shows strongly negative R^2 on both
# splits, so this variant is not used downstream; the cause is not established here
# (the handling of the 'Growth Cell' dtype by enable_categorical is worth checking).
final_features = list(stats[0][0]) + ['Growth Cell']
X = all_data_df[final_features]
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.30, random_state=42)
model_with_growth_cell = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True)
model_with_growth_cell.fit(X_train, y_train)
# Variant scores (train, hold-out) followed by the best subset's scores for comparison.
print(model_with_growth_cell.score(X_train, y_train))
print(model_with_growth_cell.score(X_test, y_test))
print(stats[0][1])
print(stats[0][2])

-16.186520727649427
-4.44402702403491
0.9933924364031038
0.9196920107164297


In [44]:
# Restore the best-ranked subset as the working feature set and reload the model that
# was persisted for it. Models are stored under their subset size, so the file name is
# rebuilt from the number of selected features.
final_features = list(stats[0][0])
X = all_data_df[final_features]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.30,
    random_state=42,
)
final_model = load(f'features_and_models/{len(final_features)}_xgb.joblib')
# Sanity check: these should reproduce the scores stored in stats[0].
print(final_model.score(X_train, y_train))
print(final_model.score(X_test, y_test))

0.9933924364031038
0.9196920107164297


In [45]:
# Rebind `keep_columns` to the selected features; this replaces the hand-picked list defined earlier.
keep_columns = final_features

# Predict for the Future

In [46]:
# Identifier columns for the rows being forecast.
# .copy() makes this an independent frame, so adding a column to it later is not a
# write into a slice of `to_pred` (chained assignment).
to_pred_gc_time = to_pred[['Growth Cell', 'Time']].copy()

In [47]:
# Feature matrix for the forecast rows, restricted to `keep_columns`.
# The commented-out line below is an earlier approach; it refers to `to_pred_copy`,
# which is not defined in the visible cells.
#X_copy = to_pred_copy.drop(['Identifier', 'Growth Cell', 'Time'], axis = 1)
X_copy = to_pred[keep_columns]

In [49]:
# Back-transform the predictions from the model's target scale to revenue.
# np.expm1(x) evaluates exp(x) - 1 directly and keeps precision for values near zero.
# (An earlier version of this cell predicted with the baseline `model_xgb` instead.)
pred_rev = np.expm1(final_model.predict(X_copy))

In [50]:
# Attach the predicted revenue as the 'Revenue Sum' column.
# assign() returns a new frame, so this never writes into a slice of `to_pred`
# and the cell gives the same result when re-run.
to_pred_gc_time = to_pred_gc_time.assign(**{'Revenue Sum': pred_rev})

In [51]:
# Show the forecast only for the growth cells listed in `scope_gcs`.
# Series.isin is the vectorised membership test (no per-row Python lambda).
to_pred_gc_time[to_pred_gc_time['Growth Cell'].isin(scope_gcs)]

Unnamed: 0,Growth Cell,Time,Revenue Sum
130,DDV - IT,6.0_2022.0,805181.7
142,DDV - Operations,6.0_2022.0,93418.62
166,ESV - BCM,6.0_2022.0,155847.6
238,ESV - PLS,6.0_2022.0,91013.29
274,FS - Banking and Capital Markets,6.0_2022.0,1112598.0
310,Finance - Enterprise Performance Management,6.0_2022.0,295768.8
