Run this async e.g. weekly to avoid latency in the app

In [1]:
import os

import mysql.connector
import pandas as pd

mydb = mysql.connector.connect(
  host=os.environ['UAT_RDS_CLONE_HOST'],
  user=os.environ['UAT_RDS_CLONE_USER'],
  password=os.environ['UAT_RDS_CLONE_PASSWORD'],
  database="rr_core"
)

##### Connect to PROD Clone (Cost Control)

In [2]:
# get data

myQuery = '''-- cost control hourly
SELECT
	*,
	 DAY(date_of_business) as salesDay,
	 DAYOFWEEK(date_of_business) as salesDayofWeek,
	 WEEK(date_of_business) as salesWeek,
	 MONTH(date_of_business) as salesMonth,
	 YEAR(date_of_business) as salesYear,
	 HOUR(date_of_business) as salesHr
FROM `collated_cc_costcontrol_data`
WHERE type_id = 2 
-- now 10m records from 1.2 records last year!
'''

df_CoCo = pd.read_sql(myQuery, con=mydb)

df_CoCo.head() # TAKES 3 MINS



Unnamed: 0,id,entity_id,type_id,data_date,date_of_business,stream_id,sub_stream_id,data_value,edited_userid,edited_date,realm,salesDay,salesDayofWeek,salesWeek,salesMonth,salesYear,salesHr
0,1,gla,2,2024-03-18 00:00:00,2024-03-18,1,0,0.0,1,2024-03-20 16:18:17,aq15i4or,18,2,11,3,2024,0
1,1,lim,2,2024-02-05 00:00:00,2024-02-05,1,0,1230.49,73,2024-02-14 16:27:05,caledonian,5,2,5,2,2024,0
2,1,bic,2,2020-04-19 23:00:00,2020-04-20,1,0,1000000.0,17,2020-04-28 13:45:34,commture365,20,2,16,4,2020,0
3,1,afh,2,2024-05-21 23:00:00,2024-05-22,1,0,539.12,13,2024-05-29 10:11:05,dfuzph7v,22,4,20,5,2024,0
4,1,mma,2,2023-06-12 05:00:00,2023-06-12,2,0,0.0,0,2023-06-14 11:16:14,dishoom,12,2,24,6,2023,0


In [3]:
df_CoCo.shape

(9905845, 17)

In [4]:
df_CoCo.columns

Index(['id', 'entity_id', 'type_id', 'data_date', 'date_of_business',
       'stream_id', 'sub_stream_id', 'data_value', 'edited_userid',
       'edited_date', 'realm', 'salesDay', 'salesDayofWeek', 'salesWeek',
       'salesMonth', 'salesYear', 'salesHr'],
      dtype='object')

##### Hourly Summary

In [5]:
hourly_summary = df_CoCo.groupby(by = ['realm','entity_id','date_of_business', 'stream_id', 'salesDay', 'salesDayofWeek', 'salesWeek', 'salesMonth', 'salesYear', 'salesHr'], as_index = False) \
                            .agg(num_hrly_txns=('id', 'count'),
                                 sum_revenue=('data_value', 'sum'),
                                 max_revenue=('data_value', 'max')).sort_values(['realm','entity_id','stream_id', 'salesDay', 'salesMonth', 'salesYear'])

hourly_summary.head()

Unnamed: 0,realm,entity_id,date_of_business,stream_id,salesDay,salesDayofWeek,salesWeek,salesMonth,salesYear,salesHr,num_hrly_txns,sum_revenue,max_revenue
267,103parkway,foh,2024-02-01,1,1,5,4,2,2024,0,7,287.55,114.17
316,103parkway,foh,2024-03-01,1,1,6,8,3,2024,0,5,217.09,122.5
406,103parkway,foh,2024-05-01,1,1,4,17,5,2024,0,9,217.51,105.83
48,103parkway,foh,2023-09-01,1,1,6,35,9,2023,0,8,430.08,176.67
145,103parkway,foh,2023-11-01,1,1,4,44,11,2023,0,8,397.33,249.58


In [6]:
hourly_summary.to_csv('hourly_summary.csv', index=False)

##### Close Connection

In [7]:
mydb.close()