In [None]:
import pandas as pd
import plotly.express as px


In [None]:
# Load only the columns each reporting table contributes to the analysis.
# Every file is read with the pyarrow engine.

# Orders: one row per order, with its job code and ELIGIBLE timestamp.
OO = pd.read_parquet(
    'data/REP_ORD_ORDER.parquet',
    engine='pyarrow',
    columns=['ORDER_ID', 'ORDER_NUM', 'JOB_CODE', 'ELIGIBLE'],
)

# Job codes: lookup joined to OO.JOB_CODE via JOB_CODE_ID.
JC = pd.read_parquet(
    'data/REP_ORD_JOB_CODE.parquet',
    engine='pyarrow',
    columns=['JOB_CODE_ID', 'NAME', 'CORE_DESCRIPTION'],
)

# Order state: timestamps/totals for an order plus its latest assignment.
OOS = pd.read_parquet(
    'data/REP_ORD_ORDER_STATE.parquet',
    engine='pyarrow',
    columns=[
        'ORDER_STATE_ID',
        'FOR_ORDER',
        'ORDER_NUM',
        'LATEST_ASSIGNMENT',
        'TOTAL_TIME_EN_ROUTE',
        'TOTAL_TIME_ON_SITE',
        'ONSITE_AT',
        'COMPLETED',
        'ENROUTE_AT',
    ],
)

# Assignment -> resource -> user: the chain used to reach a LOGON_ID.
AA = pd.read_parquet(
    'data/REP_ASN_ASSIGNMENT.parquet',
    engine='pyarrow',
    columns=['ASSIGNMENT_ID', 'FOR_RESOURCE'],
)
LR = pd.read_parquet(
    'data/REP_LAB_RESOURCE.parquet',
    engine='pyarrow',
    columns=['RESOURCE_ID', 'FOR_USER'],
)
LU = pd.read_parquet(
    'data/REP_LAB_USER.parquet',
    engine='pyarrow',
    columns=['USER_ID', 'LOGON_ID'],
)

In [None]:
# Build one wide frame: order -> job code -> order state -> latest assignment
# -> resource -> user. Every merge uses pandas' default inner join, so an order
# is kept only if it matches at every step.
# ORDER_NUM is present in both OO and OOS and is not a join key, so pandas
# suffixes the two copies (ORDER_NUM_x / ORDER_NUM_y) in the result.
DF = (
    OO
    .merge(JC, left_on='JOB_CODE', right_on='JOB_CODE_ID')
    .merge(OOS, left_on='ORDER_ID', right_on='FOR_ORDER')
    .merge(AA, left_on='LATEST_ASSIGNMENT', right_on='ASSIGNMENT_ID')
    .merge(LR, left_on='FOR_RESOURCE', right_on='RESOURCE_ID')
    .merge(LU, left_on='FOR_USER', right_on='USER_ID')
)

# Preview the first three rows, transposed so the many columns read top-down.
DF.head(3).T

In [None]:
# Columns that must be datetimes before any duration arithmetic is done.
timestamp_cols = ['ELIGIBLE', 'ONSITE_AT', 'COMPLETED', 'ENROUTE_AT']

# Convert each column in turn with pd.to_datetime.
for col in timestamp_cols:
    DF[col] = pd.to_datetime(DF[col])

In [None]:
# Alias the raw timestamps under analysis-friendly names: work is counted
# from ENROUTE_AT until COMPLETED; ONSITE_AT is kept as ONSITE_START.
DF['WORK_START'] = DF['ENROUTE_AT']
DF['WORK_END'] = DF['COMPLETED']
DF['ONSITE_START'] = DF['ONSITE_AT']

# Elapsed time between the two timestamps, expressed in minutes.
elapsed = DF['WORK_END'] - DF['WORK_START']
DF['ACTUAL_WORK_DURATION'] = elapsed.dt.total_seconds().div(60)

In [None]:
def get_work_dates(start_time, end_time):
    """Return every calendar date from ``start_time`` to ``end_time`` inclusive.

    Parameters
    ----------
    start_time, end_time
        Timestamp-like values exposing ``.date()``; either may be missing
        (NaT/NaN/None).

    Returns
    -------
    list
        Dates in ascending order. The list is empty when either argument is
        missing, or when ``end_time`` falls on an earlier calendar date than
        ``start_time``.
    """
    if not (pd.notna(start_time) and pd.notna(end_time)):
        return []

    first_day = start_time.date()
    last_day = end_time.date()
    # A negative span makes the range empty, so the result is [].
    span_days = (last_day - first_day).days
    return [first_day + pd.Timedelta(days=offset) for offset in range(span_days + 1)]


# Expand every order into one record per calendar day it touches, so that
# worked minutes can later be summed per user per day.
#
# An order spanning several days has its total duration split EVENLY across
# those days (an approximation: it ignores how much of the interval actually
# falls on each day). A single-day order is the same formula with one day.
#
# NOTE(review): a same-day row whose WORK_END precedes WORK_START still
# contributes a negative DAILY_WORK_TIME, as before - confirm whether such
# rows should be filtered out upstream.
expanded_columns = [
    'LOGON_ID',
    'CORE_DESCRIPTION',
    'DATE',
    'DAILY_WORK_TIME',
    'ORDER_ID',
    'JOB_CODE',
    'WORK_START',
    'WORK_END',
]

expanded_records = []

for idx, row in DF.iterrows():
    # get_work_dates returns [] when either timestamp is missing, and also
    # when WORK_END falls on an earlier calendar day than WORK_START. Skip
    # both cases here; previously the second case reached a division by
    # len(work_dates) == 0.
    work_dates = get_work_dates(row['WORK_START'], row['WORK_END'])
    if not work_dates:
        continue

    daily_duration = row['ACTUAL_WORK_DURATION'] / len(work_dates)

    for work_date in work_dates:
        expanded_records.append({
            'LOGON_ID': row['LOGON_ID'],
            'CORE_DESCRIPTION': row['CORE_DESCRIPTION'],
            'DATE': work_date,
            'DAILY_WORK_TIME': daily_duration,
            'ORDER_ID': row['ORDER_ID'],
            'JOB_CODE': row['NAME'],
            'WORK_START': row['WORK_START'],
            'WORK_END': row['WORK_END']
        })

# Passing columns= keeps the schema stable even when no record was produced,
# so later groupby/column lookups do not fail on a column-less frame.
expanded_df = pd.DataFrame(expanded_records, columns=expanded_columns)

In [None]:
# Preview the first three expanded records, transposed for readability.
expanded_df.head(3).T

In [None]:
# 7 hours expressed in minutes; the denominator of the utilization rate.
# NOTE(review): presumably the expected productive time per day - confirm.
DAILY_WORK_MIN = 7 * 60

# One row per (user, date): minutes worked and number of order records that day.
daily_utilization = (
    expanded_df
    .groupby(['LOGON_ID', 'DATE'])
    .agg(
        TOTAL_MINUTES_WORKED=('DAILY_WORK_TIME', 'sum'),
        JOBS_COUNT=('ORDER_ID', 'count'),
    )
    .reset_index()
)

# Fraction of DAILY_WORK_MIN that was worked, rounded to 4 decimals.
worked_fraction = daily_utilization['TOTAL_MINUTES_WORKED'].div(DAILY_WORK_MIN)
daily_utilization['UTILIZATION_RATE'] = worked_fraction.round(4)

In [None]:
# Express the utilization fraction as a percentage for charting.
daily_utilization['UTILIZATION_RATE_%'] = daily_utilization['UTILIZATION_RATE'] * 100

# Convert DATE to a datetime column in one vectorized call. The previous
# Series.apply(pd.to_datetime) converted element by element and, on an empty
# frame, would leave an object-dtype column that breaks later `.dt` access.
daily_utilization['DATE'] = pd.to_datetime(daily_utilization['DATE'])

# Preview the first five rows, transposed for readability.
daily_utilization.head(5).T

In [None]:
# Work on an independent (deep) copy so the job-count bucketing below does not
# add columns to daily_utilization itself.
util_job_counts = daily_utilization.copy(deep=True)

In [None]:
# Bucket the daily job count. With right=True each bucket is (lower, upper],
# so counts 1 to 3 fall in the first bucket, 4 to 6 in the second, and so on;
# the last bucket is open-ended.
labels = ['1–3', '4–6', '7–9', '10–12', '13–15', '16–18', '19+']
bins = [0, 3, 6, 9, 12, 15, 18, float('inf')]
util_job_counts['JOB_COUNT_RANGE'] = pd.cut(
    util_job_counts['JOBS_COUNT'],
    bins=bins,
    labels=labels,
    right=True,
)

# Mean utilization per bucket. observed=False keeps buckets that have no rows,
# so they still appear (with a NaN mean) in the result.
rate_by_range = util_job_counts.groupby('JOB_COUNT_RANGE', observed=False)['UTILIZATION_RATE_%']
job_range_avg = rate_by_range.mean().reset_index()

# Bar labels: the mean rounded to one decimal, followed by a percent sign.
range_bar_text = job_range_avg['UTILIZATION_RATE_%'].round(1).astype(str) + '%'

fig1 = px.bar(
    job_range_avg,
    x='JOB_COUNT_RANGE',
    y='UTILIZATION_RATE_%',
    text=range_bar_text,
    title='Average Utilization Rate by Job Counts',
    labels={
        'UTILIZATION_RATE_%': 'Average Utilization Rate (%)',
        'JOB_COUNT_RANGE': 'Job Count Range',
    },
)
fig1.update_layout(template='simple_white')
fig1.show()

In [None]:
# Independent (deep) copy for the day-of-week analysis, so the extra column
# added below does not leak into daily_utilization.
util_day_ofweek = daily_utilization.copy(deep=True)

In [None]:
# pandas numbers weekdays from Monday=0 to Sunday=6; enumerate() over the
# names in that order yields the same number -> name lookup.
map_day = dict(enumerate([
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]))

# Derive the weekday number from DATE and translate it to its name in one step.
util_day_ofweek['DAY_OF_WEEK'] = util_day_ofweek['DATE'].dt.day_of_week.map(map_day)

In [None]:
# Show only the first rows; the other preview cells in this notebook also
# display a small slice rather than the whole frame.
util_day_ofweek.head()

In [None]:
# Calendar order for the x axis; a plain sort of the names would be alphabetical.
weekday_in_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Mean utilization per weekday name.
rate_by_weekday = util_day_ofweek.groupby('DAY_OF_WEEK', observed=False)['UTILIZATION_RATE_%']
day_week_avg = rate_by_weekday.mean().reset_index()

# Make the weekday an ordered categorical so sorting follows Monday -> Sunday.
day_week_avg['DAY_OF_WEEK'] = pd.Categorical(
    day_week_avg['DAY_OF_WEEK'],
    categories=weekday_in_order,
    ordered=True,
)
day_week_avg = day_week_avg.sort_values('DAY_OF_WEEK')

# Bar labels: the mean rounded to one decimal, followed by a percent sign.
weekday_bar_text = day_week_avg['UTILIZATION_RATE_%'].round(1).astype(str) + '%'

fig2 = px.bar(
    day_week_avg,
    x='DAY_OF_WEEK',
    y='UTILIZATION_RATE_%',
    text=weekday_bar_text,
    title='Average Utilization Rate by Days of the Week',
    labels={
        'UTILIZATION_RATE_%': 'Average Utilization Rate (%)',
        'DAY_OF_WEEK': 'Days of the Week',
    },
)
fig2.update_layout(template='simple_white')
fig2.show()