In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import sys

sys.path.append('../')

from src.functions import load_dataset, cast_CCT_features

# Shared min-max scaler; each correlation cell below re-fits it before transforming
scaler = MinMaxScaler()

### Number of advisors per hour
Read the planning from the excel file

In [None]:
# Bucket root and dataset folder used by every loading cell of the notebook
gs_uri = 'gs://videotron-ai-bucket/'
dataset_path = gs_uri + 'dataset/'

# Read the intra-day planning workbook and replace empty cells by 0.0
planning_path = dataset_path+'Planning intra-day 2020-01-01 au 2020-04-30.xlsx'
planning_df = pd.read_excel(planning_path).fillna(0.0)
planning_df

### Display a histogram of the mean number of advisors in the IT call center per hour of the day

In [None]:
# Every column from the third one onward is used as an hourly slot
hours_cols = planning_df.columns[2:].values

# One bar per hour of the day; the bar height is the column-wise mean
x = range(24)
y = planning_df.loc[:, hours_cols].mean(axis=0).to_numpy()

plt.bar(x, y)

plt.xticks(x)
plt.ylabel('Mean number of advisors')
plt.xlabel('Hour')

### Compute the avg number of advisors in the working hours

In [None]:
# Most calls happen in this period: '08:00', '09:00', ..., '17:00'
working_hours_cols = [f'{hour:02d}:00' for hour in range(8, 18)]

planning_df['date'] = pd.to_datetime(planning_df['Date nominale'])

# Row-wise means: over all hourly columns (rounded to 1 decimal) and over the working hours only
all_hours = planning_df[hours_cols]
working_hours = planning_df[working_hours_cols]
planning_df['avg_num_advisors'] = all_hours.mean(axis=1).round(1)
planning_df['avg_num_advisors_8_to_5'] = working_hours.mean(axis=1)

advisor_df = planning_df.loc[:, ['date', 'avg_num_advisors', 'avg_num_advisors_8_to_5']]

advisor_df

### Daily Performance of the IT call center
The average times are in seconds

In [None]:
# Column names applied to the performance file, in the order of the file's columns
perfo_columns = [
    'week_day', 'date', 'total_calls', 'accept', 'accept_percent', 'abandon', 'abandon_percent',
    'SLA', 'avg_time_treat', 'avg_time_engage', 'avg_time_accept', 'avg_time_abadon',
    'max_time_accept', 'max_time_abandon', 'accept_percent_no_attend', 'abandon_during_ring',
    'abandon_short', 'other',
]

perfo_path = dataset_path + 'performance_centre_appels_sept2017_mars2020.csv'
perfo_df = pd.read_csv(perfo_path, encoding='iso-8859-1')
perfo_df.columns = perfo_columns
perfo_df.head()

### Average daily abandon percentage

#### First objective: **Drop goal 10%**:
Total percentage of abandoned calls per day. The objective is to reach an average daily percentage of abandoned calls less than or equal to **10%** of all calls received.


* The current average daily percentage of abandoned calls is 20% for the year 2020, so this metric needs to be halved
* Given the strong correlation between abandon percentage and average wait time, reducing average wait time will decrease abandon percentage

In [None]:
# The raw averages are divided by 60 to obtain the '*_min' columns
# (the notebook states that the raw average times are in seconds)
perfo_df['avg_time_accept_min'] = perfo_df['avg_time_accept'] / 60.
perfo_df['avg_time_abandon_min'] = perfo_df['avg_time_abadon'] / 60.

# Set the abandon percentage on a 0-100 scale
perfo_df['abandon_percentage'] = perfo_df['abandon_percent'] * 100.

# Define the start of the test period
# NOTE(review): 'date' is compared against a string here — presumably ISO-formatted text; confirm
test_start = '2020-01-01'
in_test_period = perfo_df['date'] >= test_start
test_df = perfo_df.loc[in_test_period].copy().reset_index(drop=True)

# Mean daily abandon percentage over the test period
mean_abandon = test_df['abandon_percentage'].mean()
print('Mean abandon percentage on the test period: %.2f%%' % mean_abandon)

### Compute the correlation between average waiting time and abandon percentage

In [None]:
compared_cols = ['avg_time_abandon_min', 'abandon_percentage']
x = range(len(test_df))

# Scale the data (fit and transform on the same two columns)
y = scaler.fit_transform(test_df[compared_cols])
y1, y2 = y[:, 0], y[:, 1]

# Do a pearson correlation analysis
pearson_r, p_val = stats.pearsonr(y1, y2)
title_str = 'Corr coef: {0:.4f}    p-value: {1:.2e}'.format(pearson_r , p_val)

plt.plot(x, y1, 'r--', label='Avg waiting time before abandon')
plt.plot(x, y2, 'b--', label='Abandon percentage')
plt.suptitle('Avg waiting time before abandon vs abandon percentage')
plt.title(title_str, size=10)
plt.xlabel('Observation day')
plt.legend()
plt.show()

In [None]:
# y1 and y2 are the min-max scaled series produced by the previous cell, so y2
# holds an exact 0 at its minimum. Divide only where the denominator is
# non-zero and leave NaN elsewhere, instead of producing inf/NaN with warnings.
ratio = np.divide(y1, y2, out=np.full_like(y1, np.nan, dtype=float), where=(y2 != 0))

# NaN-aware median so the undefined ratios do not distort the statistic
meadian_ratio = np.nanmedian(ratio)

# NOTE(review): the plotted quantity is scaled waiting time / scaled abandon
# percentage, while the titles below read the other way around — confirm which is intended
plt.plot(x, ratio, 'r--', label='ratio')
plt.suptitle('Ratio of percentage of abandonned calls on avg waiting time')
plt.title('Median is: {0:.2f}'.format(meadian_ratio), size=10)
plt.xlabel('Observation day')
plt.legend()
plt.show()

print('The median ratio of percentage of abandonned calls on avg waiting time is {0:.2f}'.format(meadian_ratio))

### Compute the correlation between the number of calls and the waiting time
On the test period

In [None]:
compared_cols = ['avg_time_abandon_min', 'total_calls']
x = range(len(test_df))

# Scale the data (fit and transform on the same two columns)
y = scaler.fit_transform(test_df[compared_cols])
y1, y2 = y[:, 0], y[:, 1]

# Do a pearson correlation analysis
pearson_r, p_val = stats.pearsonr(y1, y2)
print('Corr coef: %.4f, p-value: %.2e' % (pearson_r , p_val))

for series, line_fmt, legend_label in (
    (y1, 'r--', 'Avg waiting time before abandon'),
    (y2, 'b--', 'Total number of calls'),
):
    plt.plot(x, series, line_fmt, label=legend_label)
plt.xlabel('Observation day')
plt.legend()
plt.show()

### Compute the correlation between the number of calls and the waiting time
On the first 2 months of the test period. This corresponds to the period before the COVID-19 crisis

In [None]:
# Rows labelled 0 through 60 of the test period (label slicing includes both ends)
early_df = test_df.loc[0:60, ['avg_time_abandon_min', 'total_calls']]
x = range(early_df.shape[0])

# Scale the data (fit and transform on the same rows)
y = scaler.fit_transform(early_df)
y1, y2 = y[:, 0], y[:, 1]

# Do a pearson correlation analysis
pearson_r, p_val = stats.pearsonr(y1, y2)
print('Corr coef: %.4f, p-value: %.2e' % (pearson_r , p_val))

for series, line_fmt, legend_label in (
    (y1, 'r--', 'Avg waiting time before abandon'),
    (y2, 'b--', 'Total number of calls'),
):
    plt.plot(x, series, line_fmt, label=legend_label)
plt.xlabel('Observation day')
plt.legend()
plt.show()

### Daily wait time

#### Second objective: 
* The current weekly waiting time is 550 hours
* The objective is to reduce the weekly waiting time to 300 hours 
* A week is from monday to sunday

* The waiting time depends on the volume of calls (see correlation analysis above)
* **time gain on Call Center advisors**: operational time we could save on IT call center resources. This means we could have let the clients wait longer in order to reduce call center capacity and therefore reduce the operational cost of the IT call center
* **time to gain on Videotron employee working hours**: hours worked by Videotron's employees. This means we should answer more quickly by increasing capacity when required. Videotron's employees will wait less time when they call the IT center and therefore will be more productive


#### Assumption: We assume that, given a perfectly accurate prediction of the number of calls, the call center management will be able to allocate the exact number of advisors needed to keep the desired average wait time

In [None]:
# Inspect the column names available in the test-period frame
test_df.columns.values

In [None]:
# Inspect the average time before abandon (the '_min' column derived by dividing by 60)
test_df['avg_time_abandon_min']

In [None]:
# Display the test-period frame
test_df

### Weekly wait time
Compute the weekly wait time in hours during the test period.

Reminder: the average times are in seconds in the dataframe

In [None]:
SECONDS_PER_HOUR = 60 * 60

# Total waiting time per day = average time per call * number of calls, expressed in hours
accepted_seconds = test_df['avg_time_accept'] * test_df['accept']
abandoned_seconds = test_df['avg_time_abadon'] * test_df['abandon']

test_df['total_time_accept'] = accepted_seconds / SECONDS_PER_HOUR
test_df['total_time_abandon'] = abandoned_seconds / SECONDS_PER_HOUR
test_df['total_time'] = test_df['total_time_accept'] + test_df['total_time_abandon']

# The desired time is the weekly objective divided by 7 (7 days in a week)
test_df['total_time_desire'] = 300/7

test_df['date'] = pd.to_datetime(test_df['date'])

In [None]:
# Columns summed per week
weekly_sum_cols = ['total_calls', 'total_time_accept', 'total_time_abandon', 'total_time', 'total_time_desire']

# Weekly totals. With the default right-closed, right-labelled weekly bins,
# 'W-SUN' makes each bin end on a Sunday, i.e. cover Monday through Sunday as
# stated in the objective ('W-Mon' grouped Tuesday through Monday instead).
# The aggregation runs on the selected columns only.
out_df = test_df[['date'] + weekly_sum_cols]\
          .resample('W-SUN', on='date')\
          .agg({col: 'sum' for col in weekly_sum_cols})\
          .reset_index()\
          .sort_values(by='date')

# Observed mean waiting time per call, in minutes (the totals are in hours)
out_df['mean_waiting_time (minutes)'] = (out_df['total_time']/out_df['total_calls'])*60

# Compute the desired waiting time per call every week in order to achieve the 300 hours of waiting time per week
out_df['mean_waiting_time_desired (minutes)'] = (out_df['total_time_desire']/out_df['total_calls'])*60

# Week number and year of the Sunday that closes each bin, as strings
out_df['week'] = out_df['date'].dt.strftime('%U')
out_df['year'] = out_df['date'].dt.strftime('%Y')

out_df

In [None]:
# Bar chart of the total number of calls for each week label
plt.bar(out_df['week'], out_df['total_calls'])

In [None]:
# Summary statistics restricted to the first 9 weekly rows
out_df.head(9).describe()

### Compute the mean desired waiting time by call every week of the test period

In [None]:
observed_col = 'mean_waiting_time (minutes)'
desired_col = 'mean_waiting_time_desired (minutes)'

# Average the weekly per-call waiting times over all weeks of the test period
current_waiting_time = out_df[observed_col].mean()
WAITING_TIME_GOAL = out_df[desired_col].mean()

print('During the test period, the current mean waiting time for 1 call is: %.1f minutes' % current_waiting_time)
print('During the test period, the desired mean waiting time for 1 call is: %.1f minutes' % WAITING_TIME_GOAL)

In [None]:
x = out_df['week'].values

y1 = out_df['mean_waiting_time (minutes)'].values
# Weekly desired values; assigned but not drawn on this figure
y2 = out_df['mean_waiting_time_desired (minutes)'].values
# Horizontal reference line at the mean desired waiting time
y3 = np.full(len(x), WAITING_TIME_GOAL)

plt.plot(x, y1, 'r', label='Observed waiting time')
plt.plot(x, y3, 'g--', label=f'Mean Desired waiting time ({WAITING_TIME_GOAL:.2} mins)')

first_year = str(out_df.loc[0,'year'])
plt.title('Waiting times by call by week')
plt.xlabel('Week of ' + first_year)
plt.ylabel('Waiting time by call (minutes)')
plt.legend()

plt.show()

### Compute the mean desired abandon percentage based on the desired waiting time for one call

In [None]:
# Days whose average time before abandon is at or below the waiting-time goal
within_goal = test_df['avg_time_abandon_min'] <= WAITING_TIME_GOAL

# Select the column as a Series so that .mean() yields a scalar; the previous
# one-column DataFrame selection produced a one-element Series that was then
# implicitly converted to float by the '%' formatting below.
avg_abandon_percent_desired = test_df.loc[within_goal, 'abandon_percentage'].mean()
print('Average percentage of abandonned calls desired: %.2f%%' % avg_abandon_percent_desired)

### Compute the possible gain on the accept waiting time and the abandon waiting time
* For the accepted calls, if we can answer before the WAITING TIME GOAL, the time gained translates into more productivity for Videotron's employees
* For the abandoned calls, if we can answer before the WAITING TIME GOAL, fewer people will abandon, the waiting time will decrease, and Videotron's employees will not have to call a second time and can work instead. The gain is also in productivity

We should focus on the gains on the waiting times because the cost of a Videotron employee waiting is much higher than the cost of an operator in the IT call center

In [None]:
# Minutes above (positive) or below (negative) the goal, per call
accept_minutes_vs_goal = test_df['avg_time_accept_min'] - WAITING_TIME_GOAL
abandon_minutes_vs_goal = test_df['avg_time_abandon_min'] - WAITING_TIME_GOAL

# Multiply by the daily number of calls and convert minutes to hours
test_df['accept_time_gains'] = accept_minutes_vs_goal * test_df['accept'] / 60
test_df['abandon_time_gains'] = abandon_minutes_vs_goal * test_df['abandon'] / 60

accept_gains = test_df['accept_time_gains']
abandon_gains = test_df['abandon_time_gains']

# Split each series by sign: negative days are summed as loss, the others as gain
accept_wt_loss = accept_gains[accept_gains < 0].sum()
accept_wt_gain = accept_gains[accept_gains >= 0].sum()
abandon_wt_loss = abandon_gains[abandon_gains < 0].sum()
abandon_wt_gain = abandon_gains[abandon_gains >= 0].sum()

print("Accept time to lose", accept_wt_loss, 'hours')
print("Accept time to gain", accept_wt_gain, 'hours')
print("Abandon time to lose", abandon_wt_loss, 'hours')
print("Abandon time to gain", abandon_wt_gain, 'hours')

In [None]:
x = test_df.index

y1 = test_df['accept_time_gains'].values
y2 = test_df['abandon_time_gains'].values

# One figure per series, both with the same axis labels and title
daily_gain_curves = (
    (y1, 'b', 'Gain on waiting time on accepted'),
    (y2, 'orangered', 'Gain on waiting time on abandon'),
)

for gains, line_fmt, legend_label in daily_gain_curves:
    plt.plot(x, gains, line_fmt, label=legend_label)
    plt.ylabel('Total waiting time (hours)')
    plt.xlabel('Days in test set')
    plt.title('Possible gain on waiting times per day')
    plt.legend()

    plt.show()

### Join with number of advisors

- Can get the number of calls per hour to join with number of advisors per hour for next step


### Temporary Conclusion

- If the ratio between the average call volume per hour and the average number of advisors per hour is < 5, then the average wait time should stay within the desired wait time of 6.3 minutes

### Average ratio between average number of calls and average number of advisors per hour that result in wait time less than optimal wait time

Get the number of calls per hour

In [None]:
# load_dataset returns two frames; the second one rebinds perfo_df
CCT_df, perfo_df = load_dataset(dataset_path)

# Drop the tickets whose submitter contains 'SYSTEM', then cast the date features
is_system_ticket = CCT_df['Submitter'].str.contains('SYSTEM')
CCT_df = cast_CCT_features(CCT_df.loc[~is_system_ticket])

In [None]:
rename_dict = {'status': 'ticket count',
               'Submit_Date_with_hour': 'datetime'}

# Count the tickets for each distinct 'Submit_Date_with_hour' value and rename the columns
ticket_stats_df = (
    CCT_df.groupby('Submit_Date_with_hour')['status']
    .count()
    .reset_index()
    .rename(columns=rename_dict)
)
ticket_stats_df['datetime'] = pd.to_datetime(ticket_stats_df['datetime'])

ticket_stats_df

### Add the number of advisors along the number of call per hour

In [None]:
planning_df['Date nominale'] = pd.to_datetime(planning_df['Date nominale'])

# Flatten the planning table to one entry per (day, hourly column), days first
# and hours second — the same order the nested loop produced — without
# re-filtering the whole frame for every single cell.
day_labels = planning_df['Date nominale'].dt.strftime('%Y-%m-%d')
datetime_list = [f'{day} {col}' for day in day_labels for col in hours_cols]
# Truncate to integers, row by row, to line up with datetime_list
nb_advisor_list = planning_df[hours_cols].to_numpy(dtype=float).astype(int).ravel().tolist()

nb_advisors_df = pd.DataFrame.from_dict({'datetime': datetime_list,
                                         'nb advisors': nb_advisor_list})
nb_advisors_df['datetime'] = pd.to_datetime(nb_advisors_df['datetime'])

# Day (at midnight) and hour of each slot, via the vectorized datetime accessor
nb_advisors_df['date'] = nb_advisors_df['datetime'].dt.normalize()
nb_advisors_df['hour'] = nb_advisors_df['datetime'].dt.hour

# Merge the number of tickets per hours with the number of advisors per hour
nb_advisors_df = nb_advisors_df.merge(ticket_stats_df, how='left', on='datetime')
nb_advisors_df['ticket count'] = nb_advisors_df['ticket count'].fillna(0)

# Compute the ratio number of tickets by advisors.
# 0 tickets / 0 advisors gives NaN, which becomes 0; tickets / 0 advisors gives
# inf, which is turned into NaN afterwards.
nb_advisors_df['ticket count per advisor'] = (nb_advisors_df['ticket count']/nb_advisors_df['nb advisors']).fillna(0)
nb_advisors_df['ticket count per advisor'] = nb_advisors_df['ticket count per advisor'].replace([np.inf, -np.inf], np.nan)
nb_advisors_df

In [None]:
# Plot the hourly ticket-count-per-advisor ratio over time, with its mean as a dashed line
import matplotlib

# Keep the rows whose ratio is not exactly 0 (NaN rows also satisfy `!= 0`)
cond = (nb_advisors_df['ticket count per advisor'] != 0)

fig, ax = plt.subplots(figsize=(12,4))

y = nb_advisors_df.loc[cond, 'ticket count per advisor'].values
# Double brackets: x is a one-column DataFrame rather than a Series
x = nb_advisors_df.loc[cond, ['datetime']]
mean = nb_advisors_df.loc[cond, 'ticket count per advisor'].mean()

# Major ticks on months 1..12, minor ticks on the listed days of the month
# NOTE(review): the day list runs up to 89; values above 31 presumably never match — confirm
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator(list(range(1,13))))
ax.xaxis.set_minor_locator(matplotlib.dates.DayLocator(list(range(1,90))))

# Major tick labels: a line break followed by the abbreviated month name
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("\n%b"))
# ax.xaxis.set_minor_formatter(matplotlib.dates.DateFormatter("%d"))
plt.setp(ax.get_xticklabels(), rotation=0, ha="center")

plt.plot(x, y)
# Constant line at the mean; the legend shows it with 2 significant digits
plt.plot(x, mean*np.ones(len(y)), 'r--', label=f'mean ({mean:.2})')
plt.ylabel('Ticket count per advisor per hour')
# NOTE(review): this sets a 45-degree rotation after the rotation=0 above — presumably the later call wins; confirm
plt.xticks(rotation=45) 
plt.legend()
plt.show()

In [None]:
# Summary statistics of the rows selected by the current `cond` mask
nb_advisors_df[cond].describe()

In [None]:
# Slots whose hour lies between 8 and 17, both bounds included
cond = nb_advisors_df['hour'].between(8, 17)
nb_advisors_df[cond].describe()

In [None]:
# Slots whose hour lies between 8 and 17, both bounds included
cond = nb_advisors_df['hour'].between(8, 17)

# Per-day mean of every column over those slots
working_slots_df = nb_advisors_df[cond]
agg_df = working_slots_df.groupby('date').mean()
agg_df

In [None]:
# Per-hour mean of every column over the rows selected by the current `cond` mask
hourly_profile_df = nb_advisors_df[cond].groupby('hour').mean()
hourly_profile_df

In [None]:
# Keep only the columns needed for the ratio analysis. The explicit copy makes
# the column assignment below act on an independent frame instead of a slice
# (which triggers pandas' SettingWithCopyWarning).
test_df = test_df[['date', 'total_calls', 'accept_percent', 'abandon_percent', 'avg_time_accept_min', 'avg_time_abandon_min']].copy()
test_df['date'] = pd.to_datetime(test_df['date'])
test_df = test_df.merge(advisor_df, on='date', how='inner')

# Divide by 10 => approximation for the 10 hours of working hours
# NOTE(review): the denominator is the all-day average 'avg_num_advisors';
# 'avg_num_advisors_8_to_5' may be what was intended — confirm
test_df['ratio_call_advisor'] = (test_df['total_calls'] / 10) / test_df['avg_num_advisors']

# Merge Renan's ratio call advisor.
# nb_advisors_df holds one row per (date, hour), so joining it on 'date' yields
# one row per hourly slot. The join is kept in its own frame so that test_df
# stays at one row per day for the statistics computed on it afterwards.
hourly_ratio_df = test_df.merge(nb_advisors_df[['date','ticket count per advisor']], on='date', how='inner')

# Rows where both average times are below the desired waiting time
goal_met_daily = (test_df['avg_time_accept_min'] < WAITING_TIME_GOAL) & (test_df['avg_time_abandon_min'] < WAITING_TIME_GOAL)
goal_met_hourly = (hourly_ratio_df['avg_time_accept_min'] < WAITING_TIME_GOAL) & (hourly_ratio_df['avg_time_abandon_min'] < WAITING_TIME_GOAL)

# Printing the average ratio between number of calls and number of advisors per hour
# that results in a waiting time below the desired waiting time
print(test_df.loc[goal_met_daily, 'ratio_call_advisor'].mean())
print(hourly_ratio_df.loc[goal_met_hourly, 'ticket count per advisor'].mean())

In [None]:
# Summary statistics of the calls-per-advisor ratio
test_df['ratio_call_advisor'].describe()

### Average ratio between average number of calls and average number of advisors per hour that result in wait time more than the optimal wait time

In [None]:
# Rows where both average times exceed the waiting-time goal
accept_over_goal = test_df['avg_time_accept_min'] > WAITING_TIME_GOAL
abandon_over_goal = test_df['avg_time_abandon_min'] > WAITING_TIME_GOAL
test_df.loc[accept_over_goal & abandon_over_goal, 'ratio_call_advisor'].mean()

### ROI with Prediction

In [None]:
def cast_CCT_features(df):
    """Parse the ticket date columns and add day / hour grouping keys.

    For each of the six date columns, missing values are replaced by
    '1900-01-01 00:00:00.000', the column is converted to datetime, and two
    string columns are added: '<col>_day' ('%Y-%m-%d') and '<col>_with_hour'
    ('%Y-%m-%d %H:00').

    NOTE(review): this definition shadows the `cast_CCT_features` imported
    from `src.functions` at the top of the notebook — confirm both agree.

    Args:
        df: DataFrame containing the six date columns; it is modified in place.

    Returns:
        The same DataFrame object, with the converted and added columns.
    """
    dates_list = ['Submit_Date','Reported_Date','Required_Resolution_DateTime','Responded_Date','Last_Resolved_Date','closed_date']
    for col in dates_list:
        df[col] = pd.to_datetime(df[col].fillna('1900-01-01 00:00:00.000'))
        # Vectorized formatting instead of a per-row Python callback
        df[col + '_day'] = df[col].dt.strftime('%Y-%m-%d')  # To group by day
        df[col + '_with_hour'] = df[col].dt.strftime('%Y-%m-%d %H:00')  # To group by hour
    return df

In [None]:
result_path = gs_uri + 'results/'

# Predicted ticket counts
preds = pd.read_csv(result_path + 'ticket_count_prediction_202005062015.csv')
preds['date'] = pd.to_datetime(preds['date'])

# Call center performance file (same file and column names as the earlier performance cell)
perf = pd.read_csv(dataset_path + 'performance_centre_appels_sept2017_mars2020.csv', encoding='iso-8859-1')
perf.columns = ['week_day', 'date', 'total_calls', 'accept', 'accept_percent', 'abandon', 'abandon_percent',
                'SLA', 'avg_time_treat', 'avg_time_engage', 'avg_time_accept', 'avg_time_abadon', 
                'max_time_accept', 'max_time_abandon', 'accept_percent_no_attend', 'abandon_during_ring',
                'abandon_short', 'other']
perf['date'] = pd.to_datetime(perf['date'])
perf = perf[['date', 'total_calls', 'avg_time_accept', 'avg_time_abadon']]

# Incident tickets, without the ones whose submitter contains 'SYSTEM'.
# The explicit copy gives cast_CCT_features an independent frame to write its
# new columns into, instead of a filtered slice of the raw frame.
inci = pd.read_csv(dataset_path + 'IncidentsCTT20170930_2.csv', encoding="ISO-8859-1", low_memory=False)
inci = inci.loc[~inci['Submitter'].str.contains('SYSTEM')].copy()
inci = cast_CCT_features(inci)

# Number of tickets per submission day, built directly as an integer column
# rather than through an intermediate object-dtype frame.
groupby = inci.groupby('Submit_Date_day')
agg_df = groupby['status'].count().rename('total_real_calls').reset_index()
agg_df['date'] = pd.to_datetime(agg_df['Submit_Date_day'])
agg_df = agg_df[['date', 'total_real_calls']]

In [None]:
# Left-join the performance figures, then the real daily ticket counts, onto the predictions by date
perf_cols = ['date', 'total_calls', 'avg_time_accept', 'avg_time_abadon']
pred_cols = ['date', 'ticket cnt pred']

final = preds[pred_cols].merge(perf[perf_cols], on='date', how='left')
final = final.merge(agg_df, on='date', how='left')
final.head()

### Optimal case with perfect prediction

In [None]:
# Average accept time divided by 60, compared with a fixed value of 7
# NOTE(review): 7. is hard-coded here while WAITING_TIME_GOAL is computed earlier — confirm which is intended
accept_minutes = final['avg_time_accept']/60.
final['accept_time_gains'] = (accept_minutes - 7.) * final['total_real_calls'] / 60

# Sum the negative and the strictly positive days separately
optimal_gains = final['accept_time_gains']
print("accept time to lose", optimal_gains[optimal_gains < 0].sum(), 'hours')
print("accept time to gain", optimal_gains[optimal_gains > 0].sum(), 'hours')

### With prediction

- Assume that the difference between predicted and total calls has a direct linear impact on the average wait time
- The result suggests that the prediction tends to over-predict, which benefits the time to gain and, at the same time, increases the time to lose

In [None]:
# Wait time implied by the prediction: 7 scaled by the real / predicted ticket ratio
real_over_pred = final['total_real_calls'] / final['ticket cnt pred']
final['avg_time_pred'] = real_over_pred * 7.

# Difference between the observed average accept time (divided by 60) and the
# predicted one, weighted by the real number of tickets and divided by 60
observed_minutes = final['avg_time_accept']/60.
final['pred_time_gains'] = (observed_minutes - final['avg_time_pred']) * final['total_real_calls'] /60.

# Negative days are summed as the call-center side, positive days as the employee side;
# both sums are truncated to integers
pred_gains = final['pred_time_gains']
cost_IT_call_center = int(pred_gains[pred_gains < 0].sum())
cost_WT_Videotron_employee = int(pred_gains[pred_gains > 0].sum())

print("pred time to lose (3 months): %d hours" % cost_IT_call_center)
print("pred time to gain (3 months): %d hours" % cost_WT_Videotron_employee)

In [None]:
# Predicted vs real daily ticket counts, plotted against the row position
observation_days = range(len(final))

for column, line_fmt, legend_label in (
    ('ticket cnt pred', 'r--', 'Predict'),
    ('total_real_calls', 'b--', 'Real'),
):
    plt.plot(observation_days, final[column], line_fmt, label=legend_label)
plt.xlabel('Observation day')
plt.legend()
plt.show()

### Compute the annualized ROI from the 3 months of test
Multiply the results by 4
* Hourly rate for the employee in the IT call center: 50$ per hour

* Hourly rate for the Videotron employee when they call the IT center: 110$ per hour

In [None]:
# Hours multiplied by the hourly rate (50 or 110) and by 4
annual_cost_to_lose = cost_IT_call_center * 50 * 4
annual_cost_to_gain = cost_WT_Videotron_employee * 110 * 4

print("Cost to lose", annual_cost_to_lose)
print("Cost to gain", annual_cost_to_gain)

In [None]:
# Hourly rates: rate_1 is applied to the IT call center hours, rate_2 to the employee waiting hours
rate_1 = 50
rate_2 = 110

# Multiply the hour counts by 4; the sign of the call-center figure is flipped
cost_IT_call_center_annualize = -cost_IT_call_center * 4
cost_WT_Videotron_employee_annualize = cost_WT_Videotron_employee * 4

print('Number of hours saved in the IT call center every year: %d' % cost_IT_call_center_annualize)
print('Number of hours of waiting time saved for the Videotron\'s employee: %d' % cost_WT_Videotron_employee_annualize)

# Convert both hour counts to dollars with their respective rates
call_center_savings = cost_IT_call_center_annualize*rate_1
employee_savings = cost_WT_Videotron_employee_annualize*rate_2

print('Cost saved in the IT call center every year: %d$' % (call_center_savings))
print('Cost on waiting time saved for the Videotron\'s employee: {:9,.0f}$\n'.format(employee_savings))

total_ROI = call_center_savings + employee_savings
banner = 56*'*'
print(banner)
print('Total potential ROI of the project on 1 year: {:9,.0f}$'.format(total_ROI))
print(banner)