In [1]:
from dotenv import load_dotenv
import requests
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# plt.style.use('classic')
%matplotlib inline

import seaborn as sns
# sns.set()
# plt.rcParams['figure.figsize'] = (31,7)

In [2]:
cacheDir = '~/pandas_cache'

load_dotenv()
HASURA_SECRET = os.getenv("HASURA_SECRET")

In [3]:
def run_query(query, limit=1000000, offset=None):
    
    if offset is None:
        return_df = None
        offset = 0;
        while True:
            df = run_query(query, limit, offset)
            if return_df is None:
                return_df = df.copy()
            else:
                return_df = pd.concat([return_df, df], ignore_index=True)
            print(f"{len(return_df)} rows")
            if len(df) < limit:
                break
            else:
                offset += limit
        return return_df        
    
    url = 'http://localhost:8080/v1/query'
    headers = {"content-type": "application/json", 
               "x-hasura-admin-secret": HASURA_SECRET
              }
    try:
        body = {
            "type": "run_sql",
            "args": {
                "sql": f"{query} limit {limit} offset {offset}"
            }
        }
        r = requests.post(url, json=body, headers=headers)
        results = json.loads(r.text)["result"]
        return pd.DataFrame(results[1:], columns=results[0])
    except Exception as error:
        print(error)
        print(r.status_code)
        print(r.text)

In [4]:
if not os.path.exists(cacheDir):
  os.makedirs(cacheDir)

def cache_query(name, query, munger=None, use_cache=True):
    fname = f"{cacheDir}/{name}.pkl"
    try:
        if not use_cache:
            raise Exception("no cache")
        df = pd.read_pickle(fname)
        print("read from cache")
        print(f"{len(df)} rows")
        return df
    except:
        print("running database query")
        df = run_query(query)
        df.to_pickle(fname)
        if munger:
            df = munger(df)
        df.to_pickle(fname)
        return df 

In [5]:
def query_year(year):
    return f"""
select 
    n.created_at, n.sent_at, n.updated_at, n.notification_status as status,
    n.notification_type as type,
    extract(epoch from (n.sent_at - n.created_at)) as processing_seconds,
    extract(epoch from (n.updated_at - n.sent_at)) as callback_seconds,
    t.process_type as priority,
    s.count_as_live,
    s.name
from 
    notification_history n
    join services s on n.service_id = s.id
    join templates_history t on n.template_id = t.id and n.template_version = t.version
where
    s.count_as_live and
    extract(YEAR from n.created_at) = {year}
order by created_at
"""

In [6]:
def munge_df(df):
    print("munging data")
    df = df.replace("NULL", np.nan)
    df['created_at'] = pd.to_datetime(df.created_at)
    df['updated_at'] = pd.to_datetime(df.updated_at)
    df['date'] = df.created_at.dt.date
    df['month'] = df.created_at.dt.to_period('M')
    df['processing_seconds'] = pd.to_numeric(df.processing_seconds,  errors='coerce')
    df['callback_seconds'] = pd.to_numeric(df.callback_seconds,  errors='coerce')
    return df

## Get data

query / munging times:
* 2020: 8min 36s
* 2021: 31min 40s
* 2022: 31min 43s

In [7]:
%time df = cache_query('notification_history_2022', query_year(2022), munger=munge_df)

read from cache
16824277 rows
CPU times: user 15.8 s, sys: 8.11 s, total: 23.9 s
Wall time: 24.8 s


## Monthly amount of notifications

In [8]:
month_counts = df[['month']].value_counts()
pd.DataFrame(month_counts).sort_values(by='month')

Unnamed: 0_level_0,0
month,Unnamed: 1_level_1
2022-01,2300506
2022-02,2410295
2022-03,4011291
2022-04,4930607
2022-05,3165550
2022-06,6028


## By type

In [9]:
df_counts = df[['month', 'type', 'processing_seconds']].groupby(['month', 'type']).count()
df_counts.columns = ['count']
df_counts = df_counts.reset_index()
df_counts['percent'] = df_counts.apply(lambda r: r['count'] / month_counts[r['month']] * 100, axis=1).round(1)
df_counts.set_index(['month', 'type'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percent
month,type,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01,email,2234721,97.1
2022-01,sms,65783,2.9
2022-02,email,2337695,97.0
2022-02,sms,72594,3.0
2022-03,email,3905388,97.4
2022-03,sms,105894,2.6
2022-04,email,4807629,97.5
2022-04,sms,122943,2.5
2022-05,email,3071041,97.0
2022-05,sms,94466,3.0


In [10]:
print(df_counts.to_markdown(index=False))

| month   | type   |   count |   percent |
|:--------|:-------|--------:|----------:|
| 2022-01 | email  | 2234721 |      97.1 |
| 2022-01 | sms    |   65783 |       2.9 |
| 2022-02 | email  | 2337695 |      97   |
| 2022-02 | sms    |   72594 |       3   |
| 2022-03 | email  | 3905388 |      97.4 |
| 2022-03 | sms    |  105894 |       2.6 |
| 2022-04 | email  | 4807629 |      97.5 |
| 2022-04 | sms    |  122943 |       2.5 |
| 2022-05 | email  | 3071041 |      97   |
| 2022-05 | sms    |   94466 |       3   |
| 2022-06 | email  |    1504 |      25   |
| 2022-06 | sms    |    4524 |      75   |


## By priority

In [11]:
df_counts = df[['month', 'priority', 'processing_seconds']].groupby(['month', 'priority']).count()
df_counts.columns = ['count']
df_counts = df_counts.reset_index()
df_counts['percent'] = df_counts.apply(lambda r: r['count'] / month_counts[r['month']] * 100, axis=1).round(1)
df_counts.set_index(['month', 'priority'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percent
month,priority,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01,bulk,424493,18.5
2022-01,normal,1859685,80.8
2022-01,priority,16326,0.7
2022-02,bulk,523494,21.7
2022-02,normal,1862528,77.3
2022-02,priority,24267,1.0
2022-03,bulk,1695289,42.3
2022-03,normal,2269286,56.6
2022-03,priority,46707,1.2
2022-04,bulk,2406941,48.8


In [12]:
print(df_counts.to_markdown(index=False))

| month   | priority   |   count |   percent |
|:--------|:-----------|--------:|----------:|
| 2022-01 | bulk       |  424493 |      18.5 |
| 2022-01 | normal     | 1859685 |      80.8 |
| 2022-01 | priority   |   16326 |       0.7 |
| 2022-02 | bulk       |  523494 |      21.7 |
| 2022-02 | normal     | 1862528 |      77.3 |
| 2022-02 | priority   |   24267 |       1   |
| 2022-03 | bulk       | 1695289 |      42.3 |
| 2022-03 | normal     | 2269286 |      56.6 |
| 2022-03 | priority   |   46707 |       1.2 |
| 2022-04 | bulk       | 2406941 |      48.8 |
| 2022-04 | normal     | 2456123 |      49.8 |
| 2022-04 | priority   |   67508 |       1.4 |
| 2022-05 | bulk       |  480625 |      15.2 |
| 2022-05 | normal     | 2302800 |      72.7 |
| 2022-05 | priority   |  382082 |      12.1 |
| 2022-06 | normal     |    6028 |     100   |


## Quantiles

Restricting to successful emails for callback timings. Otherwise we will be impacted by the repeated attempts to send the emails.

In [13]:
df_p = df.query("status == 'delivered'")[['month', 'priority', 'processing_seconds']]
df_50 = df_p.groupby(['month', 'priority']).quantile(q=0.5).apply(np.ceil)
df_50.columns = ['processing_50']
df_90 = df_p.groupby(['month', 'priority']).quantile(q=0.9).apply(np.ceil)
df_90.columns = ['processing_90']
df_99 = df_p.groupby(['month', 'priority']).quantile(q=0.99).apply(np.ceil)
df_99.columns = ['processing_99']
processing_df = pd.concat([df_50, df_90, df_99], axis=1).reset_index()
processing_df.set_index(['month', 'priority'])

Unnamed: 0_level_0,Unnamed: 1_level_0,processing_50,processing_90,processing_99
month,priority,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,bulk,20.0,48.0,68.0
2022-01,normal,7.0,34.0,78.0
2022-01,priority,1.0,1.0,5.0
2022-02,bulk,18.0,878.0,2615.0
2022-02,normal,6.0,35.0,78.0
2022-02,priority,1.0,2.0,5.0
2022-03,bulk,899.0,2183.0,2938.0
2022-03,normal,4.0,959.0,3165.0
2022-03,priority,1.0,1.0,4.0
2022-04,bulk,1243.0,2610.0,3375.0


In [14]:
df_p = df.query("status == 'delivered'")[['month', 'priority', 'callback_seconds']]
df_50 = df_p.groupby(['month', 'priority']).quantile(q=0.5).apply(np.ceil)
df_50.columns = ['callback_50']
df_90 = df_p.groupby(['month', 'priority']).quantile(q=0.9).apply(np.ceil)
df_90.columns = ['callback_90']
df_99 = df_p.groupby(['month', 'priority']).quantile(q=0.99).apply(np.ceil)
df_99.columns = ['callback_99']
callback_df = pd.concat([df_50, df_90, df_99], axis=1).reset_index()
callback_df.set_index(['month', 'priority'])

Unnamed: 0_level_0,Unnamed: 1_level_0,callback_50,callback_90,callback_99
month,priority,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,bulk,12.0,35.0,103.0
2022-01,normal,8.0,39.0,133.0
2022-01,priority,14.0,23.0,294.0
2022-02,bulk,15.0,56.0,167.0
2022-02,normal,7.0,36.0,276.0
2022-02,priority,15.0,33.0,2706.0
2022-03,bulk,11.0,35.0,136.0
2022-03,normal,8.0,30.0,136.0
2022-03,priority,14.0,23.0,151.0
2022-04,bulk,13.0,61.0,467.0


## Email Quantiles

In [15]:
df_p = df.query("type=='email' and status == 'delivered'")[['month', 'priority', 'processing_seconds']]
df_50 = df_p.groupby(['month', 'priority']).quantile(q=0.5).apply(np.ceil)
df_50.columns = ['processing_50']
df_90 = df_p.groupby(['month', 'priority']).quantile(q=0.9).apply(np.ceil)
df_90.columns = ['processing_90']
df_99 = df_p.groupby(['month', 'priority']).quantile(q=0.99).apply(np.ceil)
df_99.columns = ['processing_99']
processing_df = pd.concat([df_50, df_90, df_99], axis=1).reset_index()
processing_df.set_index(['month', 'priority'])

Unnamed: 0_level_0,Unnamed: 1_level_0,processing_50,processing_90,processing_99
month,priority,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,bulk,20.0,48.0,68.0
2022-01,normal,7.0,34.0,73.0
2022-01,priority,1.0,1.0,2.0
2022-02,bulk,18.0,878.0,2615.0
2022-02,normal,6.0,34.0,71.0
2022-02,priority,1.0,1.0,3.0
2022-03,bulk,899.0,2183.0,2938.0
2022-03,normal,4.0,976.0,3185.0
2022-03,priority,1.0,527.0,644.0
2022-04,bulk,1243.0,2610.0,3375.0


In [16]:
print(processing_df.to_markdown(index=False))

| month   | priority   |   processing_50 |   processing_90 |   processing_99 |
|:--------|:-----------|----------------:|----------------:|----------------:|
| 2022-01 | bulk       |              20 |              48 |              68 |
| 2022-01 | normal     |               7 |              34 |              73 |
| 2022-01 | priority   |               1 |               1 |               2 |
| 2022-02 | bulk       |              18 |             878 |            2615 |
| 2022-02 | normal     |               6 |              34 |              71 |
| 2022-02 | priority   |               1 |               1 |               3 |
| 2022-03 | bulk       |             899 |            2183 |            2938 |
| 2022-03 | normal     |               4 |             976 |            3185 |
| 2022-03 | priority   |               1 |             527 |             644 |
| 2022-04 | bulk       |            1243 |            2610 |            3375 |
| 2022-04 | normal     |               3 |          

Restricting to successful emails for callback timings. Otherwise we will be impacted by the repeated attempts to send the emails.

In [17]:
df_p = df.query("type=='email' and status == 'delivered'")[['month', 'priority', 'callback_seconds']]
df_50 = df_p.groupby(['month', 'priority']).quantile(q=0.5).apply(np.ceil)
df_50.columns = ['callback_50']
df_90 = df_p.groupby(['month', 'priority']).quantile(q=0.9).apply(np.ceil)
df_90.columns = ['callback_90']
df_99 = df_p.groupby(['month', 'priority']).quantile(q=0.99).apply(np.ceil)
df_99.columns = ['callback_99']
callback_df = pd.concat([df_50, df_90, df_99], axis=1).reset_index()
callback_df.set_index(['month', 'priority'])

Unnamed: 0_level_0,Unnamed: 1_level_0,callback_50,callback_90,callback_99
month,priority,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,bulk,12.0,35.0,103.0
2022-01,normal,8.0,37.0,88.0
2022-01,priority,2.0,3.0,13.0
2022-02,bulk,15.0,56.0,167.0
2022-02,normal,7.0,34.0,87.0
2022-02,priority,2.0,6.0,51.0
2022-03,bulk,11.0,35.0,136.0
2022-03,normal,8.0,28.0,107.0
2022-03,priority,3.0,12.0,23.0
2022-04,bulk,13.0,61.0,467.0


In [18]:
print(callback_df.to_markdown(index=False))

| month   | priority   |   callback_50 |   callback_90 |   callback_99 |
|:--------|:-----------|--------------:|--------------:|--------------:|
| 2022-01 | bulk       |            12 |            35 |           103 |
| 2022-01 | normal     |             8 |            37 |            88 |
| 2022-01 | priority   |             2 |             3 |            13 |
| 2022-02 | bulk       |            15 |            56 |           167 |
| 2022-02 | normal     |             7 |            34 |            87 |
| 2022-02 | priority   |             2 |             6 |            51 |
| 2022-03 | bulk       |            11 |            35 |           136 |
| 2022-03 | normal     |             8 |            28 |           107 |
| 2022-03 | priority   |             3 |            12 |            23 |
| 2022-04 | bulk       |            13 |            61 |           467 |
| 2022-04 | normal     |             7 |            27 |           263 |
| 2022-04 | priority   |             2 |           

## SMS Quantiles

In [19]:
df_p = df.query("type=='sms' and status == 'delivered'")[['month', 'priority', 'processing_seconds']]
df_50 = df_p.groupby(['month', 'priority']).quantile(q=0.5).apply(np.ceil)
df_50.columns = ['processing_50']
df_90 = df_p.groupby(['month', 'priority']).quantile(q=0.9).apply(np.ceil)
df_90.columns = ['processing_90']
df_99 = df_p.groupby(['month', 'priority']).quantile(q=0.99).apply(np.ceil)
df_99.columns = ['processing_99']
processing_df = pd.concat([df_50, df_90, df_99], axis=1).reset_index()
processing_df.set_index(['month', 'priority'])

Unnamed: 0_level_0,Unnamed: 1_level_0,processing_50,processing_90,processing_99
month,priority,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,normal,17.0,120.0,451.0
2022-01,priority,1.0,1.0,5.0
2022-02,normal,19.0,209.0,556.0
2022-02,priority,1.0,2.0,5.0
2022-03,normal,14.0,333.0,1014.0
2022-03,priority,1.0,1.0,4.0
2022-04,normal,18.0,345.0,3251.0
2022-04,priority,1.0,2.0,5.0
2022-05,normal,8.0,226.0,575.0
2022-05,priority,1.0,2.0,6.0


In [20]:
print(processing_df.to_markdown(index=False))

| month   | priority   |   processing_50 |   processing_90 |   processing_99 |
|:--------|:-----------|----------------:|----------------:|----------------:|
| 2022-01 | normal     |              17 |             120 |             451 |
| 2022-01 | priority   |               1 |               1 |               5 |
| 2022-02 | normal     |              19 |             209 |             556 |
| 2022-02 | priority   |               1 |               2 |               5 |
| 2022-03 | normal     |              14 |             333 |            1014 |
| 2022-03 | priority   |               1 |               1 |               4 |
| 2022-04 | normal     |              18 |             345 |            3251 |
| 2022-04 | priority   |               1 |               2 |               5 |
| 2022-05 | normal     |               8 |             226 |             575 |
| 2022-05 | priority   |               1 |               2 |               6 |
| 2022-06 | normal     |              12 |          

In [21]:
df_p = df.query("type=='sms' and status == 'delivered'")[['month', 'priority', 'callback_seconds']]
df_50 = df_p.groupby(['month', 'priority']).quantile(q=0.5).apply(np.ceil)
df_50.columns = ['callback_50']
df_90 = df_p.groupby(['month', 'priority']).quantile(q=0.9).apply(np.ceil)
df_90.columns = ['callback_90']
df_99 = df_p.groupby(['month', 'priority']).quantile(q=0.99).apply(np.ceil)
df_99.columns = ['callback_99']
callback_df = pd.concat([df_50, df_90, df_99], axis=1).reset_index()
callback_df.set_index(['month', 'priority'])

Unnamed: 0_level_0,Unnamed: 1_level_0,callback_50,callback_90,callback_99
month,priority,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01,normal,25.0,448.0,928.0
2022-01,priority,14.0,23.0,295.0
2022-02,normal,25.0,7221.0,14776.0
2022-02,priority,15.0,33.0,2714.0
2022-03,normal,22.0,170.0,459.0
2022-03,priority,14.0,23.0,151.0
2022-04,normal,31.0,477.0,1002.0
2022-04,priority,15.0,25.0,399.0
2022-05,normal,20.0,62.0,168.0
2022-05,priority,15.0,24.0,308.0


In [22]:
print(callback_df.to_markdown(index=False))

| month   | priority   |   callback_50 |   callback_90 |   callback_99 |
|:--------|:-----------|--------------:|--------------:|--------------:|
| 2022-01 | normal     |            25 |           448 |           928 |
| 2022-01 | priority   |            14 |            23 |           295 |
| 2022-02 | normal     |            25 |          7221 |         14776 |
| 2022-02 | priority   |            15 |            33 |          2714 |
| 2022-03 | normal     |            22 |           170 |           459 |
| 2022-03 | priority   |            14 |            23 |           151 |
| 2022-04 | normal     |            31 |           477 |          1002 |
| 2022-04 | priority   |            15 |            25 |           399 |
| 2022-05 | normal     |            20 |            62 |           168 |
| 2022-05 | priority   |            15 |            24 |           308 |
| 2022-06 | normal     |            20 |           118 |           402 |


## Busiest 10 minutes

In [23]:
df['ten_minutes'] = df.created_at.dt.ceil('10min')
df2 = pd.DataFrame(df[['month', 'ten_minutes']].value_counts()).reset_index()
df2.columns = ['month', 'ten_minutes', 'count']
df2 = df2.sort_values(by=['month', 'count']).reset_index(drop=True)
df2.groupby('month').last()

Unnamed: 0_level_0,ten_minutes,count
month,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01,2022-01-11 19:10:00,20692
2022-02,2022-02-16 19:00:00,51752
2022-03,2022-03-08 18:20:00,100158
2022-04,2022-04-13 12:10:00,95040
2022-05,2022-05-04 03:10:00,51498
2022-06,2022-06-01 01:40:00,375


## Find busiest hour from 1 minute segments

In [24]:
df['one_minute'] = df.created_at.dt.ceil('1min')
df2 = pd.DataFrame(df[['month', 'one_minute']].value_counts()).reset_index()
df2.columns = ['month', 'hour_ending_at', 'count']

df2 = df2.sort_values(by=['hour_ending_at'])

df2['count'] = df2[['count']].rolling(window=60).sum()
df2 = df2.dropna()

df2.sort_values(by=['count']).groupby('month').last()

Unnamed: 0_level_0,hour_ending_at,count
month,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01,2022-01-21 19:49:00,81380.0
2022-02,2022-02-09 22:35:00,100812.0
2022-03,2022-03-08 19:09:00,146253.0
2022-04,2022-04-21 16:58:00,125513.0
2022-05,2022-05-09 15:26:00,99034.0
2022-06,2022-06-03 03:42:00,1038.0
