Author: Sabrina Derwent

In [2]:
# Importing relevant libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

### User input required
Set `data_path` in the cell below to the folder on your system that contains the CAR data files.

In [3]:
#data_path = "C:\\Users\\akl0407\\OneDrive - Northwestern University\\Back up\\2025-26\\Spring 2025\\STAT390\\LegalAid\\Data\\CAR_-_EP_Flow_Activity_Queue__Agent_Names\\"

In [4]:
# Folder that holds the CAR data files; it is wrapped in Path() and globbed for
# *.csv and *.xlsx when the filenames are read.
data_path = "/Users/sabrinaderwent/Desktop/fall25/stat390/CAR_datasets/"

### User input ends

### Reading all filenames in the data folder

In [5]:
# Collect every CAR export (.csv / .xlsx) that sits directly inside data_path.
folder = Path(data_path)
if not folder.is_dir():
    # Fail fast: a wrong path would otherwise produce an empty file list and
    # silently empty results in every cell that follows.
    raise FileNotFoundError(f"data_path is not an existing folder: {folder}")

# Excel creates "~$<name>.xlsx" lock files while a workbook is open; they match
# the glob but are not readable workbooks, so they are skipped.
# NOTE: the sort is lexical on the path, so it is not guaranteed to be
# chronological when the date range is embedded in the filename.
files = sorted(
    path
    for pattern in ("*.csv", "*.xlsx")
    for path in folder.glob(pattern)
    if not path.name.startswith("~$")
)

# Empty frame that fixes the expected column names and their order; the data
# files are appended to it in the reading step.
df_main = pd.DataFrame(columns=['Contact Session ID', 'EP Name', 'Flow Name', 'Activity Name', 'Activity Start Timestamp', 
                                'Queue Name', 'Agent Name', 'Termination Reason'])
df_main

Unnamed: 0,Contact Session ID,EP Name,Flow Name,Activity Name,Activity Start Timestamp,Queue Name,Agent Name,Termination Reason


### Reading all data files
The code chunk below reads and appends all the CAR data files. The first two rows of each file are blank and thus ignored.

In [6]:
# Read every data file and stack them into df_main with a single concat.
# Concatenating inside the loop re-copies the accumulated frame on every
# iteration, so the frames are collected first and combined once at the end.
# Seeding with df_main.iloc[0:0] keeps only the column schema, so re-running
# this cell does not append the same rows a second time.
frames = [df_main.iloc[0:0]]
for i, f in enumerate(files, start=1):
    if f.suffix.lower() == ".csv":
        # header=2: the column names are on the third line; skip_blank_lines=False
        # makes the blank lines above it count towards that position.
        df = pd.read_csv(f, header=2, dtype=str, engine="python", skip_blank_lines=False)
    else:  # .xlsx
        df = pd.read_excel(f, sheet_name=0, header=2, dtype=str)
    frames.append(df)
    print(i, f.stem, df.shape)

df_main = pd.concat(frames, ignore_index=True)

1 CAR - EP, Flow, Activity, Queue, & Agent Names (01-12-25 - 01-18-25) (52010, 7)
2 CAR - EP, Flow, Activity, Queue, & Agent Names (01-19-25 - 02-01-25) (95495, 7)
3 CAR - EP, Flow, Activity, Queue, & Agent Names (02-02-25 - 02-15-25) (90056, 7)
4 CAR - EP, Flow, Activity, Queue, & Agent Names (02-16-25 - 03-01-25) (88186, 7)
5 CAR - EP, Flow, Activity, Queue, & Agent Names (03-02-25 - 03-15-25) (86377, 7)
6 CAR - EP, Flow, Activity, Queue, & Agent Names (04-07-24 - 04-20-24) (88766, 7)
7 CAR - EP, Flow, Activity, Queue, & Agent Names (04-21-24 - 05-04-24) (89643, 7)
8 CAR - EP, Flow, Activity, Queue, & Agent Names (05-05-24 - 05-18-24) (82575, 7)
9 CAR - EP, Flow, Activity, Queue, & Agent Names (05-19-24 - 06-01-24) (71103, 7)
10 CAR - EP, Flow, Activity, Queue, & Agent Names (06-02-24 - 06-15-24) (84354, 7)
11 CAR - EP, Flow, Activity, Queue, & Agent Names (06-16-24 - 06-29-24) (82124, 7)
12 CAR - EP, Flow, Activity, Queue, & Agent Names (06-30-24 - 07-13-24) (79752, 7)
13 CAR - EP, 

In [7]:
# Peek at the first five rows of the combined frame to sanity-check the load.
df_main.head()


Unnamed: 0,Contact Session ID,EP Name,Flow Name,Activity Name,Activity Start Timestamp,Queue Name,Agent Name,Termination Reason
0,001a3748-8d50-4550-8461-33547983deb0,Main Number Telephony EP,,,2025/01/14 12:39:32 PM,,,
1,001a3748-8d50-4550-8461-33547983deb0,,LACMain,,2025/01/14 12:39:32 PM,,,
2,001a3748-8d50-4550-8461-33547983deb0,Main Number Telephony EP,,LanguageSelectionMenu,2025/01/14 12:39:32 PM,,,
3,001a3748-8d50-4550-8461-33547983deb0,Main Number Telephony EP,LACMain,,2025/01/14 12:39:32 PM,,,
4,001a3748-8d50-4550-8461-33547983deb0,Main Number Telephony EP,,MainMenu,2025/01/14 12:39:45 PM,,,


### Time datatype conversion
The code chunk below converts time from string to datetime datatype.

In [8]:
# Parse the "YYYY/MM/DD hh:mm:ss AM/PM" strings into datetime64 in one vectorized call.
# Compared with a per-row datetime.strptime: missing values become NaT instead of
# raising, malformed strings still raise, and re-running the cell on the
# already-converted column is harmless.
df_main["Activity Start Timestamp"] = pd.to_datetime(
    df_main["Activity Start Timestamp"], format="%Y/%m/%d %I:%M:%S %p"
)

In [9]:
# Checking the datatype of all columns: every column was read as str (object),
# so only the converted timestamp column should show up as datetime64.
df_main.dtypes

Contact Session ID                  object
EP Name                             object
Flow Name                           object
Activity Name                       object
Activity Start Timestamp    datetime64[ns]
Queue Name                          object
Agent Name                          object
Termination Reason                  object
dtype: object

In [10]:
# Creating a new column 'hour' as it will be useful to visualize peak calling hours.
# The value is the hour component of the parsed timestamp (24-hour clock, 0-23).
df_main["hour"] = df_main["Activity Start Timestamp"].dt.hour

In [11]:
# Calendar day of each activity as a datetime64 value at midnight.
# normalize() drops the time of day directly, avoiding the round-trip through an
# object column of Python date objects that then has to be re-parsed.
df_main["date"] = df_main["Activity Start Timestamp"].dt.normalize()

#### Exploring Menu Options in Call Journey

In [12]:
# Distinct termination reasons present in the data (unique() also reports NaN for rows with no value).
df_main["Termination Reason"].unique()

array([nan, 'Customer Left', 'Queue Timeout', 'CUSTOMER_UNAVAILABLE',
       'System disconnected the contact', 'Agent Left',
       'RONA_TIMER_EXPIRED', 'RONA Timer Expired',
       'MAX_CALLBACK_RETRY_LIMIT_REACHED', 'NO_ANSWER_FROM_CUSTOMER',
       'NO_ANSWER_FROM_AGENT', 'System Error', 'AGENT_ENDS',
       'CUSTOMER_BUSY', 'NO_ANSWER_USER', 'NO_ANSWER_CUSTOMER',
       'USER_UNAVAILABLE', 'Participant Invite timer expired',
       'CONTACT_CALLBACK_IN_PROGRESS', 'OUTDIAL_FAILED', 'USER_BUSY',
       'AGENT_UNAVAILABLE', 'MEDIA_MANAGER_INTERNAL_ERROR',
       'CHANNEL_FAILURE', 'USER_DECLINED', 'AGENT_BUSY'], dtype=object)

In [13]:
# Distinct EP Name values present in the data (NaN = rows with no EP Name).
df_main["EP Name"].unique()

array(['Main Number Telephony EP', nan,
       'Pre-Legal Menu Seniors Menu Telephony EP',
       'Legal Menu Telephony EP', 'Legal Employment Menu Telephony EP',
       'Closed Queue Menu Telephony EP', 'Legal Family Menu Telephony EP',
       'Other Legal Menu Telephony EP', 'Legal Housing Menu Telephony EP',
       'Intake Outdial EP', 'Legal Benefits Menu Telephony EP',
       'Farmworker Main Number Telephony EP',
       'Closed Hours-Holidays Menu Telephony EP',
       'All LAC Queues Telephony EP', 'Courtesy Callback Telephony EP',
       'Legal Immigration Menu Telephony EP',
       'Legal HIV Menu Telephony EP'], dtype=object)

In [14]:
# Distinct Activity Name values present in the data (NaN = rows with no Activity Name).
df_main["Activity Name"].unique()

array([nan, 'LanguageSelectionMenu', 'MainMenu', 'SeniorsMenu',
       'LegalMenu1', 'LegalMenu2', 'EmploymentMenu', 'ClosedQueueMenu',
       'FamilyMenu', 'OtherLegalMenu', 'OtherLegalOtherMenu',
       'SeniorsConfirmationMenu', 'SuburbsOrCityMenu', 'SeniorsADAPTMenu',
       'HousingMenu', 'PreTenantMenu', 'TenantMenu',
       'DivorceOrParentingMenu', 'ClinicVoicemailTransfer',
       'DisconnectContact', 'SetCallerID', 'BenefitsMenu',
       'FarmworkerMainMenu', 'FrontDeskTransfer1',
       'HelpWithLegalorOtherReasonMenu', 'StaffDirectoryEnglishTransfer',
       'AppointmentMenu', 'FrontDeskTransfer2',
       'ComplimentOrComplaintMenu', 'OtherLegalCriminalCaseMenu',
       'OtherLegalPersonalInjuryMenu', 'ClosedMenu', 'DisconnectContact1',
       'GetLoggedInConsumerAgents', 'IntakePreQueueMessage1',
       'ConsumerQueue', 'PreQueueMessage2', 'PlayMOH300s', 'QueueMenu1',
       'ReadANI', 'CCB', 'PlayCCBConfirmation', 'CallbackRetry',
       'AddressFaxHoursMenu', 'CriminalRe

In [15]:
# Distinct Flow Name values present in the data (NaN = rows with no Flow Name).
df_main["Flow Name"].unique()

array([nan, 'LACMain', 'PreLegalMenuSeniorsMenu', 'LegalMenu',
       'LegalEmploymentMenu', 'Queues', 'ClosedQueueMenu',
       'LegalFamilyMenu', 'OtherLegalMenu', 'LegalHousingMenu',
       'Intake_Outdial', 'LegalBenefitsMenu', 'FarmworkerMain',
       'ClosedHoursHolidaysMenu', 'CourtesyCallback',
       'LegalImmigrationMenu', 'LegalHIVMenu'], dtype=object)

In [16]:
# Distinct Queue Name values present in the data (NaN = rows with no Queue Name).
df_main["Queue Name"].unique()

array([nan, 'Clinic Voicemail Transfer', 'Intake Outdial Queue',
       'Front Desk Transfer', 'Staff Directory English Transfer',
       'Consumer', 'Criminal Records Voicemail Transfer',
       'Staff Directory Spanish Transfer', 'SubSenior Homeowner SP',
       'SubSenior Other', 'SubSenior Homeowner', 'Family SP',
       'SubSenior Consumer', 'Family', 'HIV Voicemail Transfer',
       'Benefits', 'Trafficking Voicemail Transfer', 'ADAPT',
       'SubSenior Tenant', 'Benefits SubSeniors', 'Employment',
       'SubSenior Family', 'Housing', 'SubSenior Benefits', 'Benefits SP',
       'Veterans Benefits Voicemail Transfer', 'SubSenior Tenant SP',
       'Immigration SP', 'SubSenior ADAPT', 'SubSenior Employment',
       'SubSenior Other SP', 'Consumer SP', 'Employment SP',
       'SubSenior Consumer SP', 'Education', 'Immigration',
       'SubSenior Benefits SP', 'ADAPT SP', 'ADAPT SubSeniors',
       'Housing SP', 'SubSenior Family SP',
       'Farmworker Voicemail Transfer', 'SubSen

For presentation 1, I spent a lot of time trying to conceptualize what a call journey looks like, as the dataset can sometimes be ambiguous. For the next presentation, my goal is to develop a systematic way to trace a call journey, even when some of the journeys are confusing.

### Filtering Dataset

In [17]:
# Analysis window; both bounds are inclusive.
start_date = "2025-03-16"
end_date = "2025-09-23"

# Keep only the rows whose calendar date falls inside the window.
in_window = df_main["date"].between(start_date, end_date)
df_short = df_main[in_window]

### Analyzing ClosedQueueMenu to see which Queue Client was Placed in

In [18]:
# Keep one row per call that reached the ClosedQueueMenu activity.
# Rows sharing a Contact Session ID are collapsed to the last one, on the
# assumption that the last row is the one where the client is in the queue.
# NOTE(review): "last" means last in row order, not latest timestamp - confirm
# the rows of a session are always stored chronologically.
# Open questions: why does a session have several ClosedQueueMenu rows at all?
# Shouldn't the client reach the closed queue menu only once?
is_closed_menu = df_short["Activity Name"] == "ClosedQueueMenu"
closed_q = df_short.loc[is_closed_menu].drop_duplicates(
    subset=["Contact Session ID"], keep="last"
)
# Both numbers must match if exactly one row per session is left.
print(closed_q["Contact Session ID"].nunique(), len(closed_q))

16575 16575


In [19]:
# Number of distinct values in every column, per EP Name.
# groupby(...).nunique() already returns a DataFrame, so no extra wrapping is needed.
num_calls_q = closed_q.groupby("EP Name").nunique()
num_calls_q

Unnamed: 0_level_0,Contact Session ID,Flow Name,Activity Name,Activity Start Timestamp,Queue Name,Agent Name,Termination Reason,hour,date
EP Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Closed Queue Menu Telephony EP,16575,0,1,16533,0,0,0,12,126


In [21]:
# Number of "Queue Timeout" calls per queue (one row per call, its last timeout row).
# This cell used to read `closed_q`. At this point of a top-to-bottom run that
# frame holds the ClosedQueueMenu rows, whose Queue Name is always missing, so
# the table came out empty. The saved output was produced only after a later
# cell had rebound `closed_q` to the Queue Timeout rows. Building the input
# here makes the cell independent of execution order.
timeout_last = (
    df_short[df_short["Termination Reason"] == "Queue Timeout"]
    .drop_duplicates(subset=["Contact Session ID"], keep="last")
)
num_calls_q = timeout_last.groupby("Queue Name").nunique()
# The first column is the count of distinct Contact Session IDs per queue.
num_calls_q.iloc[:, :1]

Unnamed: 0_level_0,Contact Session ID
Queue Name,Unnamed: 1_level_1
ADAPT,21
ADAPT SP,2
ADAPT SubSeniors SP,1
Benefits,38
Benefits SP,5
Consumer,67
Consumer SP,6
Education,3
Education SP,1
Employment,25


### Starting Analysis on Queue Timeout

In [20]:
# One row per call that has a "Queue Timeout" termination reason (the last such
# row in row order is kept).
# NOTE: this rebinds `closed_q`, which previously held the ClosedQueueMenu rows.
is_timeout = df_short["Termination Reason"] == "Queue Timeout"
closed_q = df_short.loc[is_timeout].drop_duplicates(
    subset=["Contact Session ID"], keep="last"
)
# Both numbers must match if exactly one row per session is left.
print(closed_q["Contact Session ID"].nunique(), len(closed_q))

376 376


In [22]:
# All "Queue Timeout" rows (not de-duplicated), then the number of such rows
# for every (call, queue) pair.
timeout_mask = df_short["Termination Reason"] == "Queue Timeout"
queue_time = df_short.loc[timeout_mask]
queue_time.groupby(["Contact Session ID", "Queue Name"]).size()


Contact Session ID                    Queue Name        
020ef1ea-04ef-4624-a75d-4ed838f052a1  Family                1
05e61ab6-f8a1-42e3-a53c-79451252aff2  Family                1
063dd11b-9e6d-47a9-bb34-6721ee340045  SubSenior Benefits    3
070a2cf3-cb97-447d-9017-a710e5295cd1  ADAPT SP              2
071b2c7b-12d7-4f3f-8e24-275cd2b63944  Consumer              3
                                                           ..
fcad6176-91fa-4607-9b17-6584bb2f901c  SubSenior Other SP    4
fcbf5e71-4cab-4d58-926c-f4ffb0ff755c  Consumer              1
fd898520-9264-4ec2-b206-d309e1f0b0d4  Consumer              1
ffc8f728-d545-4637-ab2b-e5f44d5b55a4  Employment SP         1
ffd82a4b-a4b5-41d6-bc29-f1612d1c307e  Benefits              4
Length: 376, dtype: int64

This shows that there are some calls that have multiple rows that indicate they are in the same queue name. For example, the 3rd row in the grouped dataset shows that there were three rows that had "SubSenior Benefits" as the Queue Name. This raises questions for me - are these rows duplicates? If someone was solely waiting in a queue, why is that noted in multiple rows? 