In [1]:
# Run the data_loading.py script in a subprocess before anything else in the notebook.
!python data_loading.py

In [2]:
# Run model.py as a standalone script in a subprocess.
# NOTE(review): a later cell also imports load_df / preprocess_df / impute_missing from
# this module — confirm the script run is still required in addition to the import.
!python model.py

In [3]:
import numpy as np
import os

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from model import load_df, preprocess_df, impute_missing

In [6]:
# Load the modelling frame through model.load_df.
# NOTE(review): model_t_plus=1 presumably selects a one-period-ahead target — confirm in model.py.
all_data_df = load_df(model_t_plus=1)

In [7]:
# Replace all_data_df with the result of model.impute_missing (the un-imputed frame is overwritten).
all_data_df = impute_missing(all_data_df)

In [8]:
# preprocess_df returns three objects: the processed frame (overwrites all_data_df),
# the rows to predict for the requested month(s), and gc_time.
all_data_df, to_pred, gc_time = preprocess_df(
    all_data_df,
    months_to_predict=['6.0_2022.0'],
    model_t_plus=1,
)

In [9]:
# Growth cells shown in the final prediction table; used to filter on 'Growth Cell'.
scope_gcs = [
    "ESV - PLS",
    "ESV - BCM",
    "DDV - IT",
    "FS - Banking and Capital Markets",
    "DDV - Operations",
    "Finance - Enterprise Performance Management",
]

## Models

In [10]:
# Estimators plus the splitting / scaling utilities used by the modelling cells.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Baseline regressor fitted on the hand-picked keep_columns features.
# Alternatives tried earlier: RandomForestRegressor(n_estimators=125, max_depth=3)
# and XGBRegressor(n_estimators=50, max_depth=4) without hist / categorical support.
model_xgb = XGBRegressor(
    n_estimators=50,
    max_depth=4,
    tree_method="hist",
    enable_categorical=True,
)

In [11]:
# Hand-picked feature columns for the baseline model (replaced later by the selected subset).
keep_columns = [
    'Amount added in pipe',
    'Entries added in pipe',
    'Unique accounts added in pipe',
    'Amount won',
    'Projects won',
    'Unique accounts won',
    'EM% mean',
    'business_days',
    'Fracn of team as As',
    'Fracn of teams as Ms',
    'As underutilized by',
    'Ds underutilized by',
    'log of team size',
]


In [12]:
# Display the column index of the preprocessed frame to check the available feature names.
all_data_df.columns

Index(['Entries added in pipe', 'Unique accounts added in pipe',
       'Amount added in pipe', 'Projects won', 'Unique accounts won',
       'Amount won', 'Director_Util', 'Senior Manager_Util', 'Manager_Util',
       'Senior Associate_Util', 'Associate_Util', 'Num Ds Above Target',
       'Num SMs above target', 'Num Ms above target', 'Num SAs above target',
       'Num As Above Target', 'Num Ds Below Target', 'Num SAs below target',
       'Num As Below Target', 'Num Ms below target', 'Num SMs below target',
       'Ms overutilized by', 'SAs overutilized by', 'Ds overutilized by',
       'As overutilized by', 'SMs overutilized by', 'As underutilized by',
       'SAs underutilized by', 'Ds underutilized by', 'SMs underutilized by',
       'Ms underutilized by', 'Fracn of teams as Ds', 'Fracn of team as As',
       'Fracn of teams as Ms', 'Fracn of teams as SMs', 'Fracn of team as SAs',
       'Revenue Sum', 'EM% mean', 'Revenue Sum + 1', 'EM% mean + 1',
       'Growth Cell', 'busines

In [13]:
# Regression target: the 'Revenue Sum + 1' column.
# NOTE(review): the name y_log and the later exp(...) - 1 suggest this column is
# log(1 + revenue) — confirm against preprocess_df.
y_log = all_data_df['Revenue Sum + 1']

In [14]:
# Feature matrix for the baseline model: only the hand-picked keep_columns.
X = all_data_df[keep_columns]

In [15]:
# 70/30 random train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.30,
    random_state=42,
)

In [16]:
# Fit the baseline XGBoost regressor on the training split.
model_xgb.fit(X_train, y_train)

In [17]:
# Score of the baseline model on the training split (the estimator's default score metric).
model_xgb.score(X_train, y_train)

0.984259880857257

In [18]:
# Score of the baseline model on the held-out 30% split; compare with the training score
# above to gauge overfitting.
model_xgb.score(X_test, y_test)

0.8401189093160855

# Sequential Feature Selection (Forward and Backward)

In [19]:
from sklearn.feature_selection import SequentialFeatureSelector

In [20]:
# Candidate feature columns offered to the sequential feature selectors.
# The order matters: selectors are fitted on all_data_df[c] and transform the same column order.
c = [
    # pipeline additions
    'Entries added in pipe', 'Unique accounts added in pipe', 'Amount added in pipe',
    # wins
    'Projects won', 'Unique accounts won', 'Amount won',
    # utilisation per grade
    'Senior Associate_Util', 'Manager_Util', 'Associate_Util', 'Senior Manager_Util', 'Director_Util',
    # head-counts above target
    'Num SMs above target', 'Num Ds Above Target', 'Num As Above Target', 'Num Ms above target', 'Num SAs above target',
    # head-counts below target
    'Num As Below Target', 'Num Ms below target', 'Num SAs below target', 'Num Ds Below Target', 'Num SMs below target',
    # over-utilisation amounts
    'SAs overutilized by', 'As overutilized by', 'Ms overutilized by', 'SMs overutilized by', 'Ds overutilized by',
    # under-utilisation amounts
    'Ds underutilized by', 'SAs underutilized by', 'As underutilized by', 'Ms underutilized by', 'SMs underutilized by',
    # team composition
    'Fracn of team as SAs', 'Fracn of teams as Ms', 'Fracn of team as As', 'Fracn of teams as SMs', 'Fracn of teams as Ds',
    # revenue / margin columns
    'Revenue Sum', 'EM% mean', 'EM% mean + 1',
    # calendar and team size
    'business_days', 'log of team size',
]

In [21]:
from joblib import dump, load

In [22]:
# Fit and cache one SequentialFeatureSelector per (subset size, direction).
#
# Each direction is checked and cached independently, so a run that was interrupted
# after the forward selector was saved still produces the missing backward selector
# on the next run. Artifacts are written to a temporary file and renamed, so an
# interrupt during dump() cannot leave a truncated file that later looks like a
# valid cache entry.
#
# NOTE(review): the selectors are fitted on every row of all_data_df, i.e. before any
# train/test split, so test scores computed on subsets chosen here are optimistic.
os.makedirs('features_and_models', exist_ok=True)

for i in range(1, len(c)):
    for direction in ('forward', 'backward'):
        sfs_path = 'features_and_models/' + str(i) + '_sfs_' + direction + '.joblib'
        if os.path.exists(sfs_path):
            continue  # already computed in a previous run

        xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True,random_state=42)
        sfs = SequentialFeatureSelector(xgb, n_features_to_select=i, direction=direction, n_jobs=-1)
        sfs.fit(all_data_df[c], all_data_df['Revenue Sum + 1'])

        tmp_path = sfs_path + '.tmp'
        dump(sfs, tmp_path)
        os.replace(tmp_path, sfs_path)  # atomic rename: the cache entry is complete or absent

KeyboardInterrupt: 

In [None]:
# Accumulator for the scoring loop; each entry is
# [selected feature names, train score, test score].
stats = []

In [None]:
# Score every cached feature subset: for each subset size and selection direction,
# load the fitted selector, reduce the candidate columns to the selected ones, and
# train (or load a cached) XGBoost model on a fixed 70/30 split.
#
# The subset sizes follow len(c) rather than a hard-coded bound, so the loop stays in
# step with the selector-fitting loop if the candidate list changes.
#
# NOTE(review): the selectors were fitted on all rows of all_data_df, including the
# rows that end up in X_test here, so the recorded test scores are optimistic.
# NOTE(review): re-running this cell without resetting `stats` appends duplicate entries.
for i in range(1, len(c)):
    for direction in ('forward', 'backward'):
        prefix = 'features_and_models/' + str(i)

        sfs = load(prefix + '_sfs_' + direction + '.joblib')
        X = sfs.transform(all_data_df[c])
        X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.30, random_state=42)

        xgb_path = prefix + '_xgb_' + direction + '.joblib'
        if os.path.exists(xgb_path):
            xgb = load(xgb_path)
        else:
            xgb = XGBRegressor(n_estimators = 50, max_depth = 4, tree_method="hist", enable_categorical=True)
            xgb.fit(X_train, y_train)
            dump(xgb, xgb_path)

        # Entry layout: [feature names, train score, test score]
        stats.append([sfs.get_feature_names_out(), xgb.score(X_train, y_train), xgb.score(X_test, y_test)])

In [None]:
# Rank the candidates in place, best held-out (test) score first.
stats.sort(reverse=True, key=lambda entry: entry[2])

In [None]:
# Best-ranked entry: [feature names, train score, test score].
stats[0]

In [None]:
# Check whether adding the 'Growth Cell' column to the best feature subset helps:
# train a fresh model on the extended feature list and print its scores next to
# the scores recorded for the best subset without it.
best_entry = stats[0]
final_features = [*best_entry[0], 'Growth Cell']
X = all_data_df[final_features]
X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.30, random_state=42
)

model_with_growth_cell = XGBRegressor(
    n_estimators=50,
    max_depth=4,
    tree_method="hist",
    enable_categorical=True,
)
model_with_growth_cell.fit(X_train, y_train)

# With 'Growth Cell': train score, then test score.
print(model_with_growth_cell.score(X_train, y_train))
print(model_with_growth_cell.score(X_test, y_test))
# Without 'Growth Cell' (recorded earlier): train score, then test score.
print(best_entry[1])
print(best_entry[2])

In [None]:
# Load the cached model that belongs to the best-ranked feature subset and re-check
# its scores on the same 70/30 split.
final_features = list(stats[0][0])
X = all_data_df[final_features]
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.30, random_state=42)

# Cached models are stored per direction ('<n>_xgb_forward.joblib' and
# '<n>_xgb_backward.joblib'); no '<n>_xgb.joblib' file is ever written. The stats
# entries do not record the direction, so pick the selector of the right size whose
# selected features match the winning subset and load the model saved next to it.
artifact_prefix = 'features_and_models/' + str(len(final_features))
final_model = None
for direction in ('forward', 'backward'):
    selector = load(artifact_prefix + '_sfs_' + direction + '.joblib')
    if list(selector.get_feature_names_out()) == final_features:
        final_model = load(artifact_prefix + '_xgb_' + direction + '.joblib')
        break
if final_model is None:
    raise FileNotFoundError(
        'No cached selector with ' + str(len(final_features))
        + ' features matches the best-ranked feature subset'
    )

print(final_model.score(X_train, y_train))
print(final_model.score(X_test, y_test))

In [None]:
# From here on keep_columns refers to the selected feature subset, not the hand-picked list.
keep_columns = final_features

# Predict Revenue for Future Months

In [None]:
# Identifier columns of the rows to predict. An explicit copy is taken because a
# prediction column is assigned to this frame later; assigning into a slice of
# to_pred would be chained assignment (SettingWithCopyWarning).
to_pred_gc_time = to_pred[['Growth Cell', 'Time']].copy()

In [None]:
# Model inputs for the rows to predict: exactly the columns in keep_columns.
# (An earlier version dropped 'Identifier', 'Growth Cell' and 'Time' instead.)
X_copy = to_pred[keep_columns]

In [None]:
# Predict with the selected-feature model (the baseline model_xgb was used here earlier)
# and undo the target transform with exp(.) - 1.
# NOTE(review): assumes the training target is log(1 + revenue) — confirm in preprocess_df.
raw_pred = final_model.predict(X_copy)
pred_rev = np.exp(raw_pred) - 1

In [None]:
# Attach the predictions as a 'Revenue Sum' column (mutates to_pred_gc_time in place).
to_pred_gc_time['Revenue Sum'] = pred_rev

In [None]:
# Show predictions for the in-scope growth cells only. Series.isin is vectorised and
# always produces a boolean mask, including when 'Growth Cell' is a categorical column.
to_pred_gc_time[to_pred_gc_time['Growth Cell'].isin(scope_gcs)]