In [None]:
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.ticker import PercentFormatter

In [None]:
# Load dataFrames -- read only the columns used further down from each parquet file
OO = pd.read_parquet(
    'REP_ORD_ORDER.parquet',
    engine='pyarrow',
    columns=['ORDER_ID', 'ORDER_NUM', 'JOB_CODE', 'ELIGIBLE'],
)
JC = pd.read_parquet(
    'REP_ORD_JOB_CODE.parquet',
    engine='pyarrow',
    columns=['JOB_CODE_ID', 'NAME', 'CORE_DESCRIPTION'],
)
OOS = pd.read_parquet(
    'REP_ORD_ORDER_STATE.parquet',
    engine='pyarrow',
    columns=[
        'ORDER_STATE_ID', 'FOR_ORDER', 'ORDER_NUM', 'LATEST_ASSIGNMENT',
        'TOTAL_TIME_EN_ROUTE', 'TOTAL_TIME_ON_SITE',
        'ONSITE_AT', 'COMPLETED', 'ENROUTE_AT',
    ],
)
AA = pd.read_parquet(
    'REP_ASN_ASSIGNMENT.parquet',
    engine='pyarrow',
    columns=['ASSIGNMENT_ID', 'FOR_RESOURCE'],
)
LR = pd.read_parquet(
    'REP_LAB_RESOURCE.parquet',
    engine='pyarrow',
    columns=['RESOURCE_ID', 'FOR_USER'],
)
LU = pd.read_parquet(
    'REP_LAB_USER.parquet',
    engine='pyarrow',
    columns=['USER_ID', 'LOGON_ID'],
)

# Merge dataFrames -- a chain of left joins, so every row of OO is kept even
# when a lookup on the right-hand side finds no match.
# ORDER_NUM exists in both OO and OOS, so pandas suffixes it (_x / _y) in DF.
DF = (
    OO
    .merge(JC, how='left', left_on='JOB_CODE', right_on='JOB_CODE_ID')
    .merge(OOS, how='left', left_on='ORDER_ID', right_on='FOR_ORDER')
    .merge(AA, how='left', left_on='LATEST_ASSIGNMENT', right_on='ASSIGNMENT_ID')
    .merge(LR, how='left', left_on='FOR_RESOURCE', right_on='RESOURCE_ID')
    .merge(LU, how='left', left_on='FOR_USER', right_on='USER_ID')
)

# Clean and convert timestamps
# Derive the three working-time columns as datetimes; errors='coerce' turns
# values that cannot be parsed into NaT instead of raising.
DF = DF.assign(
    WORK_START=pd.to_datetime(DF['ENROUTE_AT'], errors='coerce'),
    WORK_END=pd.to_datetime(DF['COMPLETED'], errors='coerce'),
    ONSITE_START=pd.to_datetime(DF['ONSITE_AT'], errors='coerce'),
)

# Calculate actual work duration in minutes (WORK_END minus WORK_START)
elapsed = DF['WORK_END'] - DF['WORK_START']
DF['ACTUAL_WORK_DURATION'] = elapsed.dt.total_seconds() / 60

# Function to get work dates
def get_work_dates(start_time, end_time):
    """List every calendar date touched by a work interval.

    Parameters
    ----------
    start_time, end_time : datetime-like or missing
        Interval boundaries; each must expose ``.date()`` unless it is missing.

    Returns
    -------
    list
        The dates from ``start_time.date()`` through ``end_time.date()``,
        inclusive and in ascending order. The list is empty when either
        boundary is missing (per ``pd.isna``) or when the end date falls
        before the start date.
    """
    if pd.isna(start_time) or pd.isna(end_time):
        return []

    first_day = start_time.date()
    # Whole days between the two calendar dates; negative when the end precedes the start.
    day_span = (end_time.date() - first_day).days

    # range() is empty for a negative span, which yields [] just like a loop that never runs.
    return [first_day + pd.Timedelta(days=offset) for offset in range(day_span + 1)]

# Expanding multi-day records
# Each order with both a start and an end timestamp becomes one record per
# calendar date it touches; its total duration is split evenly across those dates.
expanded_records = []

# Rows missing either timestamp cannot be expanded, so drop them up front and
# keep only the columns the loop reads.
has_both_times = DF['WORK_START'].notna() & DF['WORK_END'].notna()
timed_orders = DF.loc[
    has_both_times,
    ['LOGON_ID', 'ORDER_ID', 'NAME', 'WORK_START', 'WORK_END', 'ACTUAL_WORK_DURATION'],
]

# itertuples avoids building a Series per row, which iterrows does.
for order in timed_orders.itertuples(index=False):
    work_dates = get_work_dates(order.WORK_START, order.WORK_END)
    num_work_days = len(work_dates)

    # Non-positive durations (end at or before start) contribute zero minutes.
    if num_work_days > 0 and order.ACTUAL_WORK_DURATION > 0:
        daily_duration = order.ACTUAL_WORK_DURATION / num_work_days
    else:
        daily_duration = 0

    for work_date in work_dates:
        expanded_records.append({
            'TECH_ID': order.LOGON_ID,
            'DATE': work_date,
            'DAILY_WORK_TIME': daily_duration,
            'ORDER_ID': order.ORDER_ID,
            'JOB_CODE': order.NAME,
            'WORK_START': order.WORK_START,
            'WORK_END': order.WORK_END,
        })

# Creating the expanded dataframe
# The explicit column list keeps the schema when no record was produced;
# without it an empty frame has no columns and later column lookups raise KeyError.
expanded_df = pd.DataFrame(
    expanded_records,
    columns=[
        'TECH_ID', 'DATE', 'DAILY_WORK_TIME', 'ORDER_ID',
        'JOB_CODE', 'WORK_START', 'WORK_END',
    ],
)


# Sector classification (E for Electric and G for Gas)
# The sector code is the upper-cased first character of the job code name;
# rows whose code is neither E nor G are discarded.
expanded_df['SECTOR'] = expanded_df['JOB_CODE'].str.get(0).str.upper()
is_known_sector = expanded_df['SECTOR'].isin(['E', 'G'])
expanded_df = expanded_df[is_known_sector]


# Aggregate and calculate utilization rate
# Group by Tech, Date, AND Sector to separate minutes worked for E and G jobs on the same day.
# Named aggregation produces the final column names directly.
daily_utilization_by_sector = (
    expanded_df
    .groupby(['TECH_ID', 'DATE', 'SECTOR'])
    .agg(
        TOTAL_MINUTES_WORKED=('DAILY_WORK_TIME', 'sum'),
        JOBS_COUNT=('ORDER_ID', 'count'),
    )
    .reset_index()
)

# Length of the reference working day in minutes (7 hours).
WORK_DAY_MINUTES = 7 * 60
# Utilization is minutes worked as a fraction of the reference day, to 4 decimals.
utilization = daily_utilization_by_sector['TOTAL_MINUTES_WORKED'] / WORK_DAY_MINUTES
daily_utilization_by_sector['UTILIZATION_RATE'] = utilization.round(4)



In [None]:
# Creating the sector distribution chart
# Overlaid per-sector histograms (with KDE) of the daily utilization rate,
# plus a vertical reference line at 100% utilization.

custom_palette = {'E': 'red', 'G': 'yellow'}
sector_names = {'E': 'Electric (E)', 'G': 'Gas (G)'}

fig, ax = plt.subplots(figsize=(12, 6))

sns.histplot(
    data=daily_utilization_by_sector,
    x='UTILIZATION_RATE',
    hue='SECTOR',
    # Fix the hue order so colours do not depend on which sector appears first in the data.
    hue_order=list(custom_palette),
    bins=50,
    kde=True,
    stat='percent',
    common_norm=False,
    palette=custom_palette,
    # The legend is built explicitly below. Passing only labels= to legend()
    # pairs them positionally with whatever artists come first on the axes,
    # which can attach a sector name to the wrong colour and drops the target line.
    legend=False,
    ax=ax,
)

target_line = ax.axvline(
    x=1.0,
    color='gray',
    linestyle='--',
    linewidth=2,
    label='100% Target',
)

ax.set_title('Distribution of Daily Utilization Rate by Sector (Electric vs. Gas)', fontsize=14)
ax.set_xlabel('Daily Utilization (%)', fontsize=12)
ax.set_ylabel('Frequency of Days (%)', fontsize=12)

# UTILIZATION_RATE is a fraction, so 1.0 is rendered as 100%.
ax.xaxis.set_major_formatter(PercentFormatter(xmax=1.0))

# Skip the x-limit when there is no data (max is NaN) or every rate is zero,
# since an invalid or zero-width range is not a usable axis limit.
max_rate = daily_utilization_by_sector['UTILIZATION_RATE'].max()
if pd.notna(max_rate) and max_rate > 0:
    ax.set_xlim(0, max_rate * 1.05)

# One colour patch per sector, tied to the palette, followed by the target line.
legend_handles = [
    Patch(facecolor=custom_palette[code], edgecolor='black', alpha=0.5, label=sector_names[code])
    for code in custom_palette
]
legend_handles.append(target_line)
ax.legend(handles=legend_handles, title='Sector')

ax.grid(axis='y', linestyle='--')
fig.tight_layout()
plt.show()