In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import requests

In [None]:
def _read_table(path, columns):
    """Load only `columns` from the parquet file at `path` via the pyarrow engine."""
    return pd.read_parquet(path, engine='pyarrow', columns=columns)


# Each frame keeps just the columns listed here; the short names
# (OO, JC, OOS, AA, LR, LU) are what the merge cell below joins on.
OO = _read_table(
    'sort/REP_ORD_ORDER.parquet',
    ['ORDER_ID', 'ORDER_NUM', 'JOB_CODE', 'ELIGIBLE', 'DISPATCH_AREA', 'SLR_ZIP', 'SLR_CITY'],
)
JC = _read_table(
    'sort/REP_ORD_JOB_CODE.parquet',
    ['JOB_CODE_ID', 'NAME', 'CORE_DESCRIPTION'],
)
OOS = _read_table(
    'sort/REP_ORD_ORDER_STATE.parquet',
    [
        'ORDER_STATE_ID', 'FOR_ORDER', 'ORDER_NUM', 'LATEST_ASSIGNMENT',
        'TOTAL_TIME_EN_ROUTE', 'TOTAL_TIME_ON_SITE',
        'DISPATCH_AT', 'RECEIVED_AT', 'ACKNOWLEDGED_AT', 'ENROUTE_AT',
        'ONSITE_AT', 'COMPLETED', 'CLOSED',
    ],
)
AA = _read_table('sort/REP_ASN_ASSIGNMENT.parquet', ['ASSIGNMENT_ID', 'FOR_RESOURCE'])
LR = _read_table('sort/REP_LAB_RESOURCE.parquet', ['RESOURCE_ID', 'FOR_USER'])
LU = _read_table('sort/REP_LAB_USER.parquet', ['USER_ID', 'LOGON_ID'])

In [None]:
# Join chain: order -> job code -> order state -> latest assignment -> resource -> user.
# Every step uses merge's default inner join, so rows lacking a match are dropped.
# ORDER_NUM is loaded from both OO and OOS, so pandas suffixes it (_x / _y) here.
DF = (
    OO
    .merge(JC, left_on='JOB_CODE', right_on='JOB_CODE_ID')
    .merge(OOS, left_on='ORDER_ID', right_on='FOR_ORDER')
    .merge(AA, left_on='LATEST_ASSIGNMENT', right_on='ASSIGNMENT_ID')
    .merge(LR, left_on='FOR_RESOURCE', right_on='RESOURCE_ID')
    .merge(LU, left_on='FOR_USER', right_on='USER_ID')
)

# Quick sanity check: first three rows (transposed so all columns are visible) and the shape.
print(DF.head(3).T)
print(DF.shape)

In [None]:
# GeoJSON with California ZIP-code polygons, consumed by the choropleth cell.
url = "https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/ca_california_zip_codes_geo.min.json"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as an opaque JSON decoding failure.
response = requests.get(url, timeout=30)
response.raise_for_status()

geojson_data = response.json()

# Work on a copy limited to rows that have a ZIP, leaving DF itself untouched.
DF_clean = DF[DF['SLR_ZIP'].notna()].copy()
# First five characters of the ZIP's string form (drops any ZIP+4 suffix).
DF_clean['ZIP5'] = DF_clean['SLR_ZIP'].astype(str).str[:5]

# Non-numeric durations become NaN rather than raising.
DF_clean['TOTAL_TIME_EN_ROUTE'] = pd.to_numeric(DF_clean['TOTAL_TIME_EN_ROUTE'], errors='coerce')
DF_clean['TOTAL_TIME_ON_SITE'] = pd.to_numeric(DF_clean['TOTAL_TIME_ON_SITE'], errors='coerce')

# The per-day expansion calls .date() on these values, so coerce them to
# datetime here; otherwise this cell silently relies on the timestamp
# conversion that only happens in a later cell. Already-datetime columns
# pass through unchanged.
DF_clean['WORK_START'] = pd.to_datetime(DF_clean['ENROUTE_AT'], errors='coerce')
DF_clean['WORK_END'] = pd.to_datetime(DF_clean['COMPLETED'], errors='coerce')

def get_work_dates(start_time, end_time):
    """List every calendar date from start_time's date through end_time's date.

    Both bounds are inclusive. Returns an empty list when either argument is
    missing (per ``pd.isna``) or when the end date falls before the start date.
    Non-missing arguments must provide a ``.date()`` method.
    """
    if pd.isna(start_time) or pd.isna(end_time):
        return []

    first_day = start_time.date()
    last_day = end_time.date()
    day_span = (last_day - first_day).days

    # A negative span makes range() empty, giving [] for reversed inputs.
    return [first_day + pd.Timedelta(days=offset) for offset in range(day_span + 1)]

# Expand each job into one record per calendar day it spans, splitting its
# en-route and on-site time evenly across those days. A single-day job is
# simply the n_days == 1 case (dividing by 1 leaves the values unchanged).
expanded_records = []

for idx, row in DF_clean.iterrows():
    # get_work_dates returns [] when a timestamp is missing or when WORK_END's
    # date precedes WORK_START's. Skip those rows up front: with zero days the
    # per-day division below would divide by zero (raising ZeroDivisionError
    # or producing inf, depending on the scalar type).
    work_dates = get_work_dates(row['WORK_START'], row['WORK_END'])
    if not work_dates:
        continue

    n_days = len(work_dates)
    en_route_per_day = row['TOTAL_TIME_EN_ROUTE'] / n_days
    onsite_per_day = row['TOTAL_TIME_ON_SITE'] / n_days

    for d in work_dates:
        expanded_records.append({
            'LOGON_ID': row['LOGON_ID'],
            'DATE': d,
            'ZIP5': row['ZIP5'],
            'TOTAL_TIME_EN_ROUTE': en_route_per_day,
            'TOTAL_TIME_ON_SITE': onsite_per_day
        })

expanded_df = pd.DataFrame(expanded_records)
# Plain addition on purpose: a NaN in either component leaves the total NaN.
expanded_df['TOTAL_WORKED'] = expanded_df['TOTAL_TIME_EN_ROUTE'].add(expanded_df['TOTAL_TIME_ON_SITE'])

# One row per technician / date / ZIP with the summed time worked.
daily = (
    expanded_df
    .groupby(['LOGON_ID', 'DATE', 'ZIP5'], as_index=False)
    .agg(TOTAL_WORKED=('TOTAL_WORKED', 'sum'))
)
# Share of a 7-hour span, as a percentage (assumes TOTAL_WORKED is in seconds -- TODO confirm).
daily['DAILY_UTIL_%'] = (daily['TOTAL_WORKED'] / (7*60*60)) * 100

# NOTE(review): TOTAL_JOBS counts rows of `daily` (technician/date/ZIP
# combinations), which is not necessarily the number of distinct jobs.
zip_util = (
    daily
    .groupby('ZIP5', as_index=False)
    .agg(**{
        'DAILY_UTIL_%': ('DAILY_UTIL_%', 'mean'),
        'TOTAL_WORKED': ('TOTAL_WORKED', 'sum'),
        'TOTAL_JOBS': ('LOGON_ID', 'count'),
    })
)

zip_util['DAILY_UTIL_%'] = zip_util['DAILY_UTIL_%'].round(2)

In [None]:
# ZIP5 values are matched against each GeoJSON feature's properties.ZCTA5CE10.
choropleth_options = dict(
    geojson=geojson_data,
    locations='ZIP5',
    featureidkey='properties.ZCTA5CE10',
    color='DAILY_UTIL_%',
    hover_data=['TOTAL_WORKED', 'TOTAL_JOBS'],
    color_continuous_scale='Plasma',
    # Fixed colour range: values above 60 all take the top colour.
    range_color=[0, 60],
    title='Average Technician Utilization by Zip Code',
)
fig = px.choropleth(zip_util, **choropleth_options)

# Zoom to the plotted ZIP codes and hide the base map.
fig.update_geos(visible=False, fitbounds="locations")
fig.update_layout(margin=dict(r=0, t=50, l=0, b=0))
fig.show()

In [None]:
# Non-numeric recorded durations become NaN rather than raising.
DF['TOTAL_TIME_EN_ROUTE'] = pd.to_numeric(DF['TOTAL_TIME_EN_ROUTE'], errors='coerce')
DF['TOTAL_TIME_ON_SITE'] = pd.to_numeric(DF['TOTAL_TIME_ON_SITE'], errors='coerce')

# The subtractions below use the .dt accessor, which requires datetime dtype.
# Coerce explicitly so this cell does not depend on the timestamp conversion
# performed in a later cell; already-datetime columns pass through unchanged.
DF['WORK_START'] = pd.to_datetime(DF['ENROUTE_AT'], errors='coerce')
DF['WORK_END'] = pd.to_datetime(DF['COMPLETED'], errors='coerce')
DF['ONSITE_START'] = pd.to_datetime(DF['ONSITE_AT'], errors='coerce')

# Durations in seconds derived from the timestamps themselves:
# en route = ENROUTE_AT -> ONSITE_AT, on site = ONSITE_AT -> COMPLETED.
DF['ACTUAL_TIME_EN_ROUTE'] = (DF['ONSITE_START'] - DF['WORK_START']).dt.total_seconds()
DF['ACTUAL_TIME_ON_SITE'] = (DF['WORK_END'] - DF['ONSITE_START']).dt.total_seconds()


# NOTE(review): this redefines get_work_dates with the same logic as the
# definition in the earlier ZIP-code cell.
def get_work_dates(start_time, end_time):
    """List every calendar date from start_time's date through end_time's date.

    Both bounds are inclusive. Returns an empty list when either argument is
    missing (per ``pd.isna``) or when the end date falls before the start date.
    Non-missing arguments must provide a ``.date()`` method.
    """
    if pd.isna(start_time) or pd.isna(end_time):
        return []

    first_day = start_time.date()
    last_day = end_time.date()
    day_span = (last_day - first_day).days

    # A negative span makes range() empty, giving [] for reversed inputs.
    return [first_day + pd.Timedelta(days=offset) for offset in range(day_span + 1)]


# Expand each job into one record per calendar day it spans, splitting the
# timestamp-derived en-route and on-site seconds evenly across those days.
# A single-day job is simply the n_days == 1 case.
expanded_records = []

for idx, row in DF.iterrows():
    # get_work_dates returns [] when a timestamp is missing or when WORK_END's
    # date precedes WORK_START's. Such rows must be skipped entirely: reusing
    # the previous row's dates would emit spurious records for this technician,
    # and zero days would make the per-day division divide by zero.
    work_dates = get_work_dates(row['WORK_START'], row['WORK_END'])
    if not work_dates:
        continue

    n_days = len(work_dates)
    en_route_per_day = row['ACTUAL_TIME_EN_ROUTE'] / n_days
    onsite_per_day = row['ACTUAL_TIME_ON_SITE'] / n_days

    for d in work_dates:
        expanded_records.append({
            'LOGON_ID': row['LOGON_ID'],
            'DATE': d,
            'TOTAL_TIME_EN_ROUTE': en_route_per_day,
            'TOTAL_TIME_ON_SITE': onsite_per_day
        })

expanded_df = pd.DataFrame(expanded_records)

# Total en-route and on-site time per technician per day.
daily = (
    expanded_df
    .groupby(['LOGON_ID', 'DATE'], as_index=False)
    .agg(
        TOTAL_TIME_EN_ROUTE=('TOTAL_TIME_EN_ROUTE', 'sum'),
        TOTAL_TIME_ON_SITE=('TOTAL_TIME_ON_SITE', 'sum'),
    )
)

# Utilization is time worked relative to a 7-hour span expressed in seconds.
daily['TOTAL_WORKED'] = daily['TOTAL_TIME_EN_ROUTE'].add(daily['TOTAL_TIME_ON_SITE'])
daily['DAILY_UTILIZATION_RATIO'] = daily['TOTAL_WORKED'] / (7 * 60 * 60)
daily['DAILY_UTILIZATION_%'] = daily['DAILY_UTILIZATION_RATIO'] * 100

# Average of each technician's daily ratios.
# NOTE(review): `daily` and `tech_util` are reassigned by the next cell.
tech_util = (
    daily
    .groupby('LOGON_ID', as_index=False)
    .agg(DAILY_UTILIZATION_RATIO=('DAILY_UTILIZATION_RATIO', 'mean'))
)
tech_util['UTILIZATION_%'] = tech_util['DAILY_UTILIZATION_RATIO'] * 100

In [None]:
# Non-numeric recorded durations become NaN rather than raising.
for seconds_col in ('TOTAL_TIME_EN_ROUTE', 'TOTAL_TIME_ON_SITE'):
    DF[seconds_col] = pd.to_numeric(DF[seconds_col], errors='coerce')

# Each job is attributed to the calendar date of its ELIGIBLE timestamp.
DF['DATE'] = pd.to_datetime(DF['ELIGIBLE']).dt.date

# Per technician per day: summed recorded durations, then the share of a
# 7-hour span (in seconds) that the total represents.
daily = (
    DF.groupby(['LOGON_ID', 'DATE'], as_index=False)
    .agg(
        TOTAL_TIME_EN_ROUTE=('TOTAL_TIME_EN_ROUTE', 'sum'),
        TOTAL_TIME_ON_SITE=('TOTAL_TIME_ON_SITE', 'sum'),
    )
    .assign(TOTAL_WORKED=lambda d: d['TOTAL_TIME_EN_ROUTE'] + d['TOTAL_TIME_ON_SITE'])
    .assign(DAILY_UTILIZATION_RATIO=lambda d: d['TOTAL_WORKED'] / (7 * 60 * 60))
    .assign(**{'DAILY_UTILIZATION_%': lambda d: d['DAILY_UTILIZATION_RATIO'] * 100})
)

# Average of each technician's daily ratios, plus the same value as a percentage.
tech_util = (
    daily.groupby('LOGON_ID', as_index=False)
    .agg(DAILY_UTILIZATION_RATIO=('DAILY_UTILIZATION_RATIO', 'mean'))
    .assign(**{'UTILIZATION_%': lambda t: t['DAILY_UTILIZATION_RATIO'] * 100})
)

In [None]:
# Histogram of per-technician average utilization.
fig = px.histogram(
    tech_util,
    x='UTILIZATION_%',
    nbins=20,
    title='Distribution of Technician Utilization for SDGE Job Assignments',
)

# The x axis is pinned to 0-100, so values above 100% fall outside the view.
fig.update_layout(
    xaxis_title='Utilization (%)',
    yaxis_title='Number of Technicians',
    xaxis={'range': [0, 100]},
    bargap=0.05,
    template='plotly_white',
)

fig.update_traces(hovertemplate='Utilization: %{x:.1f}%<br>Technicians: %{y}')

# Footnote placed below the plot area, in paper coordinates.
utilization_footnote = {
    'xref': 'paper',
    'yref': 'paper',
    'x': 0,
    'y': -0.2,
    'showarrow': False,
    'text': '*Utilization = % of shift time spent traveling to and working on jobs*',
    'font': {'size': 12, 'color': 'gray'},
    'align': 'left',
}
fig.update_layout(annotations=[utilization_footnote])

fig.show()

In [None]:
# Per technician, dispatch area and day: summed durations and the share of a
# 7-hour span (in seconds) that they represent. Uses the DATE column that an
# earlier cell derives from ELIGIBLE.
daily_dispatch_area = (
    DF.groupby(['LOGON_ID', 'DISPATCH_AREA', 'DATE'], as_index=False)
    .agg(
        TOTAL_TIME_EN_ROUTE=('TOTAL_TIME_EN_ROUTE', 'sum'),
        TOTAL_TIME_ON_SITE=('TOTAL_TIME_ON_SITE', 'sum'),
    )
    .assign(TOTAL_WORKED=lambda d: d['TOTAL_TIME_EN_ROUTE'] + d['TOTAL_TIME_ON_SITE'])
    .assign(DAILY_UTILIZATION_RATIO=lambda d: d['TOTAL_WORKED'] / (7 * 60 * 60))
    .assign(**{'DAILY_UTILIZATION_%': lambda d: d['DAILY_UTILIZATION_RATIO'] * 100})
)

# Mean daily ratio per dispatch area, highest first.
dispatch_util = (
    daily_dispatch_area
    .groupby('DISPATCH_AREA', as_index=False)
    .agg(DAILY_UTILIZATION_RATIO=('DAILY_UTILIZATION_RATIO', 'mean'))
    .assign(**{'AVG_UTILIZATION_%': lambda d: d['DAILY_UTILIZATION_RATIO'] * 100})
    .sort_values(by='AVG_UTILIZATION_%', ascending=False)
)

In [None]:
# One bar per dispatch area; the bar height also drives the colour scale and label.
fig = px.bar(
    dispatch_util,
    x='DISPATCH_AREA',
    y='AVG_UTILIZATION_%',
    text='AVG_UTILIZATION_%',
    color='AVG_UTILIZATION_%',
    color_continuous_scale='Blues',
    title='Average Technician Utilization by Dispatch Area',
    labels={
        'AVG_UTILIZATION_%': 'Average Utilization (%)',
        'DISPATCH_AREA': 'Dispatch Area',
    },
)

# Two-decimal percentage labels above the bars; y axis fixed at 0-50.
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}%')
fig.update_layout(xaxis_tickangle=-45, yaxis={'range': [0, 50]})
fig.show()

In [None]:
# Technicians ordered from lowest to highest utilization (sort_values is ascending by default).
lowest_util = tech_util.sort_values(by='UTILIZATION_%')

In [None]:
# The ten lowest-utilization technicians (lowest_util is sorted ascending).
lowest10 = lowest_util.iloc[:10]

lowest_fig, lowest_ax = plt.subplots(figsize=(8, 5))
lowest_ax.barh(lowest10['LOGON_ID'], lowest10['UTILIZATION_%'], color='salmon')
lowest_ax.set_xlabel('Utilization (%)')
lowest_ax.set_ylabel('Technician')
lowest_ax.set_title('Technicians with Lowest Utilization Rates')
lowest_ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Number of rows in DF per technician.
job_counts = DF.groupby('LOGON_ID').size().rename('TOTAL_JOBS').reset_index()

# Inner join: only technicians present in both frames are kept.
tech_util_full = tech_util.merge(job_counts, on='LOGON_ID')

top10_jobs = tech_util_full.sort_values(by='TOTAL_JOBS', ascending=False).iloc[:10]

top_fig, top_ax = plt.subplots(figsize=(8, 5))
top_ax.barh(top10_jobs['LOGON_ID'], top10_jobs['UTILIZATION_%'], color='skyblue')

# Label each bar with its job count, just past the bar's end.
# NOTE(review): the label's y position is the row index, which assumes the bars
# sit at 0..9 (i.e. LOGON_ID is plotted as a categorical axis) -- confirm.
utilizations = top10_jobs['UTILIZATION_%'].tolist()
job_totals = top10_jobs['TOTAL_JOBS'].tolist()
for bar_index in range(len(utilizations)):
    top_ax.text(utilizations[bar_index] + 1, bar_index, f'{job_totals[bar_index]} jobs', va='center')

top_ax.set_xlabel('Average Daily Utilization (%)')
top_ax.set_ylabel('Technician')
top_ax.set_title('Technicians with Most Jobs and Their Average Daily Utilization')
top_ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Rows in DF per technician, joined onto the utilization table (inner join).
job_counts = DF.groupby('LOGON_ID').size().rename('TOTAL_JOBS').reset_index()
tech_util_full = tech_util.merge(job_counts, on='LOGON_ID')

# Keep only technicians at or below 100% utilization for the scatter.
within_full_shift = tech_util_full['UTILIZATION_%'].le(100)
tech_util_clean = tech_util_full.loc[within_full_shift]

scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(
    tech_util_clean['TOTAL_JOBS'],
    tech_util_clean['UTILIZATION_%'],
    alpha=0.7,
    color='teal',
)
scatter_ax.set_xlabel('Total Jobs')
scatter_ax.set_ylabel('Average Daily Utilization (%)')
scatter_ax.set_title('Technician Utilization vs. Total Jobs')
scatter_ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
timestamp_cols = ['COMPLETED', 'CLOSED','DISPATCH_AT', 'RECEIVED_AT', 'ACKNOWLEDGED_AT', 'ENROUTE_AT', 'ONSITE_AT']

# Parse every lifecycle timestamp (unparseable values become NaT), then keep
# only the rows that have an acknowledgement time.
DF = (
    DF
    .assign(**{name: pd.to_datetime(DF[name], errors='coerce') for name in timestamp_cols})
    .dropna(subset=['ACKNOWLEDGED_AT'])
)

In [None]:
def _hours_between(later_col, earlier_col):
    """Elapsed hours from DF[earlier_col] to DF[later_col], row by row."""
    return (DF[later_col] - DF[earlier_col]).dt.total_seconds() / 3600


# Gaps between consecutive lifecycle timestamps, in hours.
DF['time_to_receive'] = _hours_between('RECEIVED_AT', 'DISPATCH_AT')
DF['time_to_ack'] = _hours_between('ACKNOWLEDGED_AT', 'RECEIVED_AT')
DF['time_to_leave'] = _hours_between('ENROUTE_AT', 'ACKNOWLEDGED_AT')
# Recorded totals divided by 3600 (assumes they are stored in seconds -- TODO confirm).
DF['time_to_enroute'] = DF['TOTAL_TIME_EN_ROUTE'] / 3600
DF['onsite_duration'] = DF['TOTAL_TIME_ON_SITE'] / 3600
DF['post_completion_delay'] = _hours_between('CLOSED', 'COMPLETED')

stage_columns = [
    'time_to_receive',
    'time_to_ack',
    'time_to_leave',
    'time_to_enroute',
    'onsite_duration',
    'post_completion_delay',
]
# Mean hours per stage, longest stage first.
delays = DF[stage_columns].mean().sort_values(ascending=False)

delays_df = delays.rename_axis('Stage').reset_index(name='Average_Hours')

# Horizontal bars: one per lifecycle stage, labelled with its mean duration.
fig = px.bar(
    delays_df,
    x='Average_Hours',
    y='Stage',
    orientation='h',
    text='Average_Hours',
    labels={'Average_Hours': 'Hours', 'Stage': 'Job Stage'},
    title="Average Duration Between Job Lifecycle Stages (hrs)",
)

fig.update_traces(textposition='outside', texttemplate='%{text:.2f} hrs')
# Extra left/bottom margin leaves room for the stage names and the caption.
fig.update_layout(
    margin={'l': 150, 'r': 50, 't': 50, 'b': 150},
    yaxis={'categoryorder': 'total ascending'},
)

# Legend explaining each stage name, one entry per line.
caption_text = "<br>".join([
    "time_to_receive: Time from job dispatch to technician receiving it",
    "time_to_ack: Time from technician receiving job to acknowledging it ",
    "time_to_leave: Time from technician acknowledging job to going to job site",
    "time_to_enroute: Time spent traveling to the job site",
    "onsite_duration: Time spent working at the job site",
    "post_completion_delay: Time from job completion to closing job",
])

# Caption sits below and left of the plot area, in paper coordinates.
fig.add_annotation(
    text=caption_text,
    xref="paper",
    yref="paper",
    x=-0.2,
    y=-0.4,
    align="left",
    showarrow=False,
    font={'size': 12},
)
fig.show()