In [6]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

#tuning
from sklearn.metrics import make_scorer, mean_squared_error
from bayes_opt import BayesianOptimization
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn import svm

import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt

AttributeError: module 'matplotlib' has no attribute 'get_data_path'

In [2]:
df = pd.read_excel("./calls.xlsx")
df.drop(['INDEX()'],axis=1,inplace=True)
df['Call Date'] = pd.to_datetime(df['Call Date'], format="%d.%m.%Y")
df['Week of Call Date'] = df['Week of Call Date'].str.replace('Week ','').astype(int)
#df = pd.get_dummies(df, columns=['Call Reason','Customer'])

df = df[['Call Date','Month','Year of Call Date','Week of Call Date','Day of Call Date','Weekday of Call Date','Total Incoming Calls']]
#df = df.groupby(['Call Date','Year of Call Date','Month','Week of Call Date','Day of Call Date','Weekday of Call Date']).sum('Total Incoming Calls')
#df.reset_index(inplace=True)
#df['Day'] = pd.to_datetime(df['Call Date']).dt.day
#df['WeekDay'] = pd.to_datetime(df['Call Date']).dt.dayofweek
df

Unnamed: 0,Call Date,Month,Year of Call Date,Week of Call Date,Day of Call Date,Weekday of Call Date,Total Incoming Calls
0,2020-01-17,1,2020,3,17,6,296
1,2020-01-20,1,2020,4,20,2,381
2,2020-01-21,1,2020,4,21,3,363
3,2020-01-22,1,2020,4,22,4,305
4,2020-01-23,1,2020,4,23,5,304
...,...,...,...,...,...,...,...
482,2021-12-09,12,2021,50,9,5,882
483,2021-12-10,12,2021,50,10,6,792
484,2021-12-13,12,2021,51,13,2,941
485,2021-12-14,12,2021,51,14,3,948


In [3]:
df.dtypes

Call Date               datetime64[ns]
Month                            int64
Year of Call Date                int64
Week of Call Date                int64
Day of Call Date                 int64
Weekday of Call Date             int64
Total Incoming Calls             int64
dtype: object

In [4]:
#Breaking the data and selecting features , predictors
from sklearn.model_selection import train_test_split
predictors=df.drop(['Total Incoming Calls','Call Date'],axis=1)
target=df['Total Incoming Calls']
x_train,x_cv,y_train,y_cv=train_test_split(predictors,target,test_size=0.2,random_state=42)

#Comparing Algorithms
def scores(i):
    lin = i()
    lin.fit(x_train, y_train)
    y_pred=lin.predict(x_cv)
    lin_r= r2_score(y_cv, y_pred)
    s.append(lin_r)
#Checking the scores by using our function
algos=[LinearRegression,KNeighborsRegressor,
       RandomForestRegressor,Lasso,ElasticNet,DecisionTreeRegressor]
s=[]
for i in algos:
    scores(i)

models = pd.DataFrame({
    'Method': ['LinearRegression', 'KNeighborsRegressor',
               'RandomForestRegressor', 'Lasso','DecisionTreeRegressor'],
    'Score': [s[0],s[1],s[2],s[3],s[4]]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Method,Score
2,RandomForestRegressor,0.820205
1,KNeighborsRegressor,0.728191
0,LinearRegression,0.540051
3,Lasso,0.453211
4,DecisionTreeRegressor,0.43533


### Tune Random forest

In [20]:
def rms(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))


my_scorer = make_scorer(rms, greater_is_better=False)
pbounds = {
    'n_estimators': (100, 10000),
    'max_depth': (3, 15),
    'min_samples_leaf': (1, 4),
    'min_samples_split': (2, 10),
}


def rf_hyper_param(n_estimators,
                   max_depth,
                   min_samples_leaf,
                   min_samples_split):

    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = RandomForestRegressor(n_estimators=n_estimators,
                                max_depth=int(max_depth),
                                min_samples_leaf=int(min_samples_leaf),
                                min_samples_split=int(min_samples_split),
                                n_jobs=1)

    return -np.mean(cross_val_score(clf, x_train, y_train, cv=3))


optimizer = BayesianOptimization(
    f=rf_hyper_param,
    pbounds=pbounds,
    random_state=1,
)

In [21]:
optimizer.maximize(
    init_points=3,
    n_iter=20,
    acq='ei'
)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.7744  [0m | [0m 8.004   [0m | [0m 3.161   [0m | [0m 2.001   [0m | [0m 3.093e+0[0m |
| [95m 2       [0m | [95m-0.7408  [0m | [95m 4.761   [0m | [95m 1.277   [0m | [95m 3.49    [0m | [95m 3.521e+0[0m |
| [0m 3       [0m | [0m-0.773   [0m | [0m 7.761   [0m | [0m 2.616   [0m | [0m 5.354   [0m | [0m 6.884e+0[0m |


TypeError: 'float' object is not subscriptable

In [29]:
params = optimizer.max['params']

# Converting the max_depth and n_estimator values from float to int
params['max_depth'] = int(params['max_depth'])
params['n_estimators'] = int(params['n_estimators'])
params['min_samples_leaf'] = int(params['min_samples_leaf'])
params['min_samples_split'] = int(params['min_samples_split'])

# Initialize an XGBRegressor with the tuned parameters and fit the training data
tunned_rf = RandomForestRegressor(**params)
# Change verbose to True if you want to see it train
tunned_rf.fit(x_train, y_train)

y_pred = tunned_rf.predict(x_cv)
r2_score(y_cv, y_pred)
#mean_squared_error(y_cv, y_pred)

0.7985504334708212

In [37]:
#Hypertuned Model
#RandomForestRegressor(max_depth=4, min_samples_leaf=1, min_samples_split=3, n_estimators=3521)
model = RandomForestRegressor()
model.fit(x_train,y_train)
y_pred = model.predict(x_cv)
r2_score(y_cv,y_pred)
#mean_squared_error(y_cv, y_pred)

0.827642793374243

#### XGBoost

In [110]:
params={ 'objective':'reg:squarederror',
         'max_depth': 6,
         'colsample_bylevel':0.5,
         'learning_rate':0.01,
         'random_state':20}

#reg = xgb.XGBRegressor(objective='reg:squarederror', max_depth=6, colsample_bylevel=0.5, learning_rate=0.01, random_state=20, n_estimators=1000)
reg = xgb.XGBRegressor(objective='reg:squarederror',subsample=0.5, n_estimators=1000, max_depth=5, learning_rate=0.01, colsample_bytree=0.8, colsample_bylevel=0.8)

reg.fit(x_train, y_train,
        verbose=False)  # Change verbose to True if you want to see it train
y_pred = reg.predict(x_cv)
r2_score(y_cv, y_pred)
# mean_squared_error(y_cv, y_pred)
# reg.feature_importances_

0.8657426698526389

#### XGBoost Tuner
Use computed values from below in above regressor

In [80]:
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000]}
xgbr = xgb.XGBRegressor(seed = 20)
clf = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=25,
                         verbose=1)
clf.fit(x_train, y_train)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best parameters: {'subsample': 0.6, 'n_estimators': 500, 'max_depth': 20, 'learning_rate': 0.3, 'colsample_bytree': 0.8999999999999999, 'colsample_bylevel': 0.5}
Lowest RMSE:  135.33376107398024


### LightGBM

In [95]:
lightGBM = lgb.LGBMRegressor(subsample=0.6, max_depth=20, learning_rate=0.3, colsample_bytree=0.9)
lightGBM.fit(x_train, y_train)
y_pred = lightGBM.predict(x_cv)
r2_score(y_cv, y_pred)
#mean_squared_error(y_cv, y_pred)

0.8273641470587264

In [79]:
test = pd.read_excel('./calls_test.xlsx')
test['Call Date'] = pd.to_datetime(test['Call Date'], format="%d.%m.%Y")
test1=test.drop(['Call Date'],axis=1)
pred2=reg.predict(test1)
test['Total Incoming Calls']=pred2

test

Unnamed: 0,Call Date,Year of Call Date,Month,Week of Call Date,Day of Call Date,Weekday of Call Date,Total Incoming Calls
0,2021-12-15,2021,12,51,15,4,901.629639
1,2021-12-16,2021,12,51,16,5,822.551636
2,2021-12-17,2021,12,51,17,6,777.045837
3,2021-12-18,2021,12,51,18,7,793.510864
4,2021-12-19,2021,12,51,19,1,958.438171
5,2021-12-20,2021,12,52,20,2,1285.635864
6,2021-12-21,2021,12,52,21,3,1370.397217
7,2021-12-22,2021,12,52,22,4,1267.748657
8,2021-12-23,2021,12,52,23,5,1122.187012
9,2021-12-24,2021,12,52,24,6,986.279846


In [42]:
model.feature_importances_

array([0.03568191, 0.01716947, 0.82852594, 0.06259934, 0.05602334])