In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import pytz

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files read
# below (under /content/drive/MyDrive) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the 311 service-request CSV from the mounted Drive folder into `df`.
df=pd.read_csv("/content/drive/MyDrive/311SerivceRequest_dataset/311_Service_Requests_2yrs.csv")
# Bare last expression: displays (rows, columns) as the cell output.
df.shape

(1093918, 15)

In [None]:
# Keep only the first 1,062,842 rows (positional slice); later rows are discarded.
# NOTE(review): the cutoff is a hard-coded row count - presumably the last row of
# the intended date range; confirm, and consider filtering on requested_date instead.
df = df.iloc[:1062842]
df.shape

(1062842, 15)

In [None]:
# Column names, non-null counts and dtypes of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062842 entries, 0 to 1062841
Data columns (total 15 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   service_request_id  1062842 non-null  object 
 1   requested_date      1062842 non-null  object 
 2   updated_date        1062842 non-null  object 
 3   closed_date         1030749 non-null  object 
 4   status_description  1062842 non-null  object 
 5   source              1062842 non-null  object 
 6   service_name        1062842 non-null  object 
 7   agency_responsible  1062684 non-null  object 
 8   address             0 non-null        float64
 9   comm_code           989008 non-null   object 
 10  comm_name           989009 non-null   object 
 11  location_type       989216 non-null   object 
 12  longitude           988997 non-null   float64
 13  latitude            988997 non-null   float64
 14  point               988997 non-null   object 
dtypes: float64(3), 

In [None]:
# Number of missing values per column; kept in `missing_values` and shown as the cell output.
missing_values = df.isna().sum()
missing_values

Unnamed: 0,0
service_request_id,0
requested_date,0
updated_date,0
closed_date,32093
status_description,0
source,0
service_name,0
agency_responsible,158
address,1062842
comm_code,73834


In [None]:
# Remove the 'address' and 'location_type' columns.
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
df = df.drop(columns=['address', 'location_type'])

In [None]:
# Parse the three timestamp columns and derive year / month / day columns from each.

timestamp_format = '%Y/%m/%d %I:%M:%S %p'

# pd.to_datetime represents missing values as NaT, so unparsed blanks in
# closed_date need no additional fill step.
for date_col in ('requested_date', 'updated_date', 'closed_date'):
    df[date_col] = pd.to_datetime(df[date_col], format=timestamp_format)

print(f"Data type of 'requested_date': {df['requested_date'].dtype}")
print("Dataype:", df['closed_date'].dtype)

# New columns are created in the order request_*, update_*, closed_*.
column_prefixes = {'requested_date': 'request', 'updated_date': 'update', 'closed_date': 'closed'}
for date_col, prefix in column_prefixes.items():
    for part in ('year', 'month', 'day'):
        df[f'{prefix}_{part}'] = getattr(df[date_col].dt, part)

# Requests without a closed_date get 0 in the derived closed_* columns, which
# are then stored as nullable 32-bit integers.
closed_parts = ['closed_year', 'closed_month', 'closed_day']
df.loc[df['closed_date'].isna(), closed_parts] = 0
df[closed_parts] = df[closed_parts].astype('Int32')

df.info()

Data type of 'requested_date': datetime64[ns]
Dataype: datetime64[ns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062842 entries, 0 to 1062841
Data columns (total 22 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   service_request_id  1062842 non-null  object        
 1   requested_date      1062842 non-null  datetime64[ns]
 2   updated_date        1062842 non-null  datetime64[ns]
 3   closed_date         1030749 non-null  datetime64[ns]
 4   status_description  1062842 non-null  object        
 5   source              1062842 non-null  object        
 6   service_name        1062842 non-null  object        
 7   agency_responsible  1062684 non-null  object        
 8   comm_code           989008 non-null   object        
 9   comm_name           989009 non-null   object        
 10  longitude           988997 non-null   float64       
 11  latitude            988997 non-null   float64       
 12  

In [None]:
# Closing delay = whole days between request and closure, stored as nullable Int64
# (requests without a closed_date stay missing).

elapsed = df['closed_date'] - df['requested_date']
print("1",elapsed.dtype)

elapsed_days = elapsed.dt.days
print(elapsed_days.dtype)

df['closing_delay'] = elapsed_days.astype('Int64')
print(df['closing_delay'].dtype)

1 timedelta64[ns]
float64
Int64


In [None]:
# Flag requests whose status text contains "Duplicate (Closed)" as 'Yes', all others as 'No'.
# NOTE(review): statuses such as 'Duplicate (Open)' are labelled 'No' here - confirm that is intended.
is_closed_duplicate = df['status_description'].str.contains(r'Duplicate \(Closed\)', regex=True)
df['duplicate_request'] = is_closed_duplicate.map({True: 'Yes', False: 'No'})

In [None]:
# Season boundaries used to categorise requests.

# Calgary observes the America/Edmonton timezone.
calgary_tz = pytz.timezone('America/Edmonton')

# Exact UTC times for solstices and equinoxes (taken from Govt of Canada Website)
seasons_utc = {
    'Spring_2023': '2023-03-20 21:24:00',
    'Summer_2023': '2023-06-21 14:57:00',
    'Autumn_2023': '2023-09-23 06:50:00',
    'Winter_2023': '2023-12-22 03:27:00',
    'Spring_2024': '2024-03-20 03:06:00',
    'Summer_2024': '2024-06-20 20:50:00',
    'Autumn_2024': '2024-09-22 12:43:00',
    'Winter_2024': '2024-12-21 09:20:00'
}

# Parse each UTC string, mark it as UTC, then express it in Calgary local time.
# `seasons` maps the season label to a timezone-aware local datetime.
seasons = {
    season: pytz.utc.localize(
        datetime.strptime(utc_time_str, '%Y-%m-%d %H:%M:%S')
    ).astimezone(calgary_tz)
    for season, utc_time_str in seasons_utc.items()
}

for season, local_start in seasons.items():
    print(f"{season}: {local_start}")

Spring_2023: 2023-03-20 15:24:00-06:00
Summer_2023: 2023-06-21 08:57:00-06:00
Autumn_2023: 2023-09-23 00:50:00-06:00
Winter_2023: 2023-12-21 20:27:00-07:00
Spring_2024: 2024-03-19 21:06:00-06:00
Summer_2024: 2024-06-20 14:50:00-06:00
Autumn_2024: 2024-09-22 06:43:00-06:00
Winter_2024: 2024-12-21 02:20:00-07:00


In [None]:
# Build `new_requested_date`: requested_date expressed as timezone-aware Calgary time,
# so it can be compared with the timezone-aware season boundaries.
requested = df['requested_date']
if requested.dt.tz is None:
    # Naive timestamps are interpreted as Calgary wall-clock time.
    # tz_localize raises on ambiguous / non-existent DST wall-clock times by default.
    df['new_requested_date'] = requested.dt.tz_localize('America/Edmonton')
else:
    # Already timezone-aware: convert instead of localising, so the column always
    # exists and the print below cannot fail with a KeyError.
    df['new_requested_date'] = requested.dt.tz_convert('America/Edmonton')

print(df['new_requested_date'].head())

0   2023-01-02 00:00:00-07:00
1   2023-01-02 00:00:00-07:00
2   2023-01-02 00:00:00-07:00
3   2023-01-02 00:00:00-07:00
4   2023-01-02 00:00:00-07:00
Name: new_requested_date, dtype: datetime64[ns, America/Edmonton]


In [None]:
def get_season(request_date, season_starts=None, before_first='Winter_2022'):
    """Return the label of the season that `request_date` falls in.

    The values of the season table are the moments each season STARTS, so the
    season containing a date is the one with the latest start that is not after
    that date. (Returning the first season whose start lies after the date
    would label every request with the following season.)

    Parameters
    ----------
    request_date : timezone-aware datetime / Timestamp
        Moment to classify; must be comparable with the season start values.
    season_starts : dict, optional
        Mapping of season label -> season start. Defaults to the notebook-level
        `seasons` dictionary. Entries may be in any order.
    before_first : str, optional
        Label returned when `request_date` precedes every start in the table
        (with the default table: the winter that began in December 2022).

    Returns
    -------
    str
        The matching season label, or `before_first`.
    """
    if season_starts is None:
        season_starts = seasons

    current_label = before_first
    current_start = None
    for label, start in season_starts.items():
        # Keep the most recent season that has already begun at request_date.
        if start <= request_date and (current_start is None or start > current_start):
            current_label, current_start = label, start
    return current_label

# Label every request by applying get_season row by row to its timezone-aware request timestamp.
df['Season'] = df['new_requested_date'].apply(get_season)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of closing_delay in days.
statistics_closing_delay = df['closing_delay'].describe()
print(statistics_closing_delay)

count    1030749.0
mean     12.630157
std      39.587334
min            0.0
25%            1.0
50%            3.0
75%            8.0
max          740.0
Name: closing_delay, dtype: Float64


In [None]:
# Look at the closed_date values of requests whose status is exactly 'Open'.
filtered_df = df[df['status_description'] == 'Open' ]
# Only one status value remains after the filter, so this yields a single group.
grouped_data = filtered_df.groupby('status_description')['closed_date']
# Removes the row-display limit globally (stays in effect for later cells).
pd.set_option('display.max_rows', None)
# Last five rows of the group (tail() default).
grouped_data.tail()

Unnamed: 0,closed_date
1062821,NaT
1062822,NaT
1062824,NaT
1062832,NaT
1062839,2025-01-02


In [None]:
# Reconcile the status text with closed_date: a request that has a closed_date
# but is still marked 'Open' / 'Duplicate (Open)' is reported as
# 'Closed' / 'Duplicate (Closed)'; every other status is kept as is.
# Done with vectorised masks rather than a row-wise df.apply(axis=1), which
# calls a Python lambda once per row.
original_status = df['status_description']
has_closed_date = df['closed_date'].notna()

df['modified_status'] = (
    original_status
    .mask(has_closed_date & original_status.eq('Open'), 'Closed')
    .mask(has_closed_date & original_status.eq('Duplicate (Open)'), 'Duplicate (Closed)')
)
df.tail()

Unnamed: 0,service_request_id,requested_date,updated_date,closed_date,status_description,source,service_name,agency_responsible,comm_code,comm_name,...,update_month,update_day,closed_year,closed_month,closed_day,closing_delay,duplicate_request,new_requested_date,Season,modified_status
1062837,24-00979069,2024-12-31,2025-01-27,2025-01-27,Closed,Other,DBBS - RIM - Property Research,"PDS - Development, Business and Building Services",ASP,ASPEN WOODS,...,1,27,2025,1,27,27,No,2024-12-31 00:00:00-07:00,Winter_2024,Closed
1062838,24-00978685,2024-12-31,2025-01-21,2025-01-21,Closed,Other,Bylaw - Vehicle Concerns,CS - Emergency Management and Community Safety,TEM,TEMPLE,...,1,21,2025,1,21,21,No,2024-12-31 00:00:00-07:00,Winter_2024,Closed
1062839,24-00977260,2024-12-31,2025-01-21,2025-01-02,Open,Other,WRS - Cart Management,OS - Waste and Recycling Services,HUN,HUNTINGTON HILLS,...,1,21,2025,1,2,2,No,2024-12-31 00:00:00-07:00,Winter_2024,Closed
1062840,24-00978925,2024-12-31,2025-01-21,2025-01-21,Closed,Other,CT AC - Trip Feedback - CTA,OS - Calgary Transit,DNW,DOWNTOWN WEST END,...,1,21,2025,1,21,21,No,2024-12-31 00:00:00-07:00,Winter_2024,Closed
1062841,24-00977424,2024-12-31,2025-01-22,2025-01-22,Closed,Other,WRS - Collection Schedule Inquiry,OS - Waste and Recycling Services,SET,SETON,...,1,22,2025,1,22,22,No,2024-12-31 00:00:00-07:00,Winter_2024,Closed


In [None]:
# Missing values per column after the derived columns were added.
missing = df.isna().sum()
print(missing)
# df.info() writes its report itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
df.info()

service_request_id        0
requested_date            0
updated_date              0
closed_date           32093
status_description        0
source                    0
service_name              0
agency_responsible      158
comm_code             73834
comm_name             73833
longitude             73845
latitude              73845
point                 73845
request_year              0
request_month             0
request_day               0
update_year               0
update_month              0
update_day                0
closed_year               0
closed_month              0
closed_day                0
closing_delay         32093
duplicate_request         0
new_requested_date        0
Season                    0
modified_status           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062842 entries, 0 to 1062841
Data columns (total 27 columns):
 #   Column              Non-Null Count    Dtype                           
---  ------              --------------   

In [None]:
# Back-fill a missing agency_responsible from a marker found in service_name.
# Markers are applied in this order; each pass only touches rows that are still missing.
agency_by_service_marker = {
    'WATR -': 'UEP - Utilities & Environmental Protection',
    'PSD -': 'PDS - Planning & Development Services',
    'CPI -': 'OSC - Operational Services and Compliance',
}
for service_marker, agency in agency_by_service_marker.items():
    still_missing = df['agency_responsible'].isnull()
    has_marker = df['service_name'].str.contains(service_marker)
    df.loc[still_missing & has_marker, 'agency_responsible'] = agency

# agency abbreviations are extracted
def extract_division(value):
    """Return the text before the first hyphen of `value`, stripped of surrounding whitespace.

    A value without a hyphen is returned whole (stripped); a missing value
    (as judged by pd.isna) yields NaN.
    """
    if pd.isna(value):
        return np.nan
    # partition() returns the whole string as the head when there is no hyphen.
    head, _, _ = value.partition('-')
    return head.strip()


# Abbreviation in front of the first hyphen of agency_responsible (NaN when missing).
df['agency_division'] = df['agency_responsible'].apply(extract_division)

#Actual agencies or divisions under Calgary Government
agency_division = {
    'agency_name': ['Affiliated Organizations', 'Chief Financial Officer Department', 'Corporate Wide Service Requests',
                    'Calgary Police & Fire Services', 'Community Services', "Deputy City Manager's Office",
                   'Elected Officials', 'Fleet and Inventory', 'Information Services','Legal or Legislative Services',
                   'Office of the City Auditor','Operational Services and Compliance', 'Partnerships',
                   'Planning & Development Services','Project Information and Control Systems', 'Recreation and Social Programs',
                    'Transportation', 'Utilities & Environmental Protection'],
    'abbreviations': [['AO', 'Affiliated Organizations'], ['CFOD'], ['Corporate Wide Service Requests'],
                      ['CPFS'],['CS'], ['DCMO'],
                      ['Elected Officials'], ['Fleet and Inventory'], ['IS'], ['LL','LLSS'],
                      ['Office of the City Auditor'],['OS','OSC'],['Partnerships'],
                      ['PD','PDS'],['PICS'],['Recreation and Social Programs'],
                      ['TRAN','Tranc'], ['UEP','Uepc']]
}


# Flat lookup: every abbreviation (or spelled-out name) -> full agency name
mapping = {abbreviation: agency_name
           for agency_name, abbreviations in zip(agency_division['agency_name'], agency_division['abbreviations'])
           for abbreviation in abbreviations}


# Replace the abbreviations with the full agency names; values absent from `mapping` become NaN
df['agency_division'] = df['agency_division'].map(mapping)
agencies= df['agency_division'].unique()

# Split agency_responsible and service_name at their first hyphen for all rows at
# once (instead of one boolean mask plus Python lambdas per division).
# The pieces are stripped of surrounding whitespace so that e.g.
# 'CS - Emergency ...' yields 'Emergency ...' rather than ' Emergency ...',
# consistent with extract_division. Rows without a mapped division are left
# as NaN in the three new columns.
has_division = df['agency_division'].notna()

# str.partition gives three columns: text before the hyphen, the hyphen itself
# ('' when absent), and the text after it.
agency_parts = df['agency_responsible'].str.partition('-')
agency_has_hyphen = agency_parts[1].eq('-')

# agency_subdivision: text after the first hyphen, or the division name when there is no hyphen
df['agency_subdivision'] = (
    agency_parts[2].str.strip()
    .where(agency_has_hyphen, df['agency_division'])
    .where(has_division)
)

service_parts = df['service_name'].str.partition('-')
service_has_hyphen = service_parts[1].eq('-')

# service_category: text before the first hyphen (the whole name when there is none)
df['service_category'] = service_parts[0].str.strip().where(has_division)

# service_request: text after the first hyphen (the whole name when there is none)
df['service_request'] = (
    service_parts[2]
    .where(service_has_hyphen, service_parts[0])
    .str.strip()
    .where(has_division)
)

# Display the updated DataFrame
print(df.head(10))


print("\n\033[1m"+"Additional Columns created are:"+"\033[0m")
print("\tagency_division")
print("\tagency_subdivision")
print("\tservice_category")
print("\tservice_request")

  service_request_id requested_date updated_date closed_date  \
0        23-00000797     2023-01-02   2023-01-10  2023-01-10   
1        23-00001045     2023-01-02   2024-01-11  2024-01-11   
2        23-00001163     2023-01-02   2023-01-06  2023-01-06   
3        23-00001191     2023-01-02   2024-05-19  2023-01-10   
4        23-00001584     2023-01-02   2023-01-04  2023-01-04   
5        23-00001849     2023-01-02   2024-01-09  2024-01-09   
6        23-00001987     2023-01-02   2023-01-03  2023-01-03   
7        23-00002055     2023-01-02   2023-01-03  2023-01-03   
8        23-00002077     2023-01-02   2023-01-07  2023-01-07   
9        23-00002084     2023-01-02   2023-01-02  2023-01-02   

  status_description source                             service_name  \
0             Closed  Other  Finance - ONLINE TIPP Agreement Request   
1             Closed  Other        Active Living Program Application   
2             Closed  Phone     CN - Registered Social Worker Letter   
3      

In [None]:
# Missing-value count per column at this point of the cleaning.
df.isnull().sum()

Unnamed: 0,0
service_request_id,0
requested_date,0
updated_date,0
closed_date,32093
status_description,0
source,0
service_name,0
agency_responsible,0
comm_code,73834
comm_name,73833


In [None]:
# Column overview (non-null counts and dtypes) including the newly added columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062842 entries, 0 to 1062841
Data columns (total 31 columns):
 #   Column              Non-Null Count    Dtype                           
---  ------              --------------    -----                           
 0   service_request_id  1062842 non-null  object                          
 1   requested_date      1062842 non-null  datetime64[ns]                  
 2   updated_date        1062842 non-null  datetime64[ns]                  
 3   closed_date         1030749 non-null  datetime64[ns]                  
 4   status_description  1062842 non-null  object                          
 5   source              1062842 non-null  object                          
 6   service_name        1062842 non-null  object                          
 7   agency_responsible  1062842 non-null  object                          
 8   comm_code           989008 non-null   object                          
 9   comm_name           989009 non-null   object  

## Handling null values (longitude, latitude, point)

In [None]:
# Print the number of missing values for each location-related column, one per line.
for location_col in ('comm_name', 'comm_code', 'longitude', 'latitude', 'point'):
    print(df[location_col].isnull().sum())

73833
73834
73845
73845
73845


In [None]:
# Impute missing coordinates with the column median.
for coordinate in ('longitude', 'latitude'):
    df[coordinate] = df[coordinate].fillna(df[coordinate].median())

# Impute a missing point with the most frequent point value.
# NOTE(review): the mode of `point` will generally not correspond to the median
# longitude / latitude used above, so imputed rows may carry inconsistent
# location fields - confirm this is acceptable.
most_frequent_point = df['point'].mode()[0]
df['point'] = df['point'].fillna(most_frequent_point)

# comm_name and comm_code are not imputed here.
print(df['longitude'].isnull().sum(),df['latitude'].isnull().sum(),df['point'].isnull().sum())

0 0 0


## Adding Sector ('SOUTHEAST', 'EAST', 'CENTRE', 'NORTH', 'SOUTH', 'NORTHWEST','NORTHEAST', 'WEST') Column for each community

In [None]:
# Load the community reference table (one row per community, with its COMM_CODE and SECTOR).
community_data=pd.read_csv("/content/drive/MyDrive/311SerivceRequest_dataset/Community_Points_20250208.csv")
# Check that the two columns used for the sector lookup contain no missing values.
print(community_data['COMM_CODE'].isnull().sum(),community_data['SECTOR'].isnull().sum())

0 0


In [None]:
# Column overview of the community reference table.
community_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   CLASS           312 non-null    object 
 1   CLASS_CODE      312 non-null    int64  
 2   COMM_CODE       312 non-null    object 
 3   NAME            312 non-null    object 
 4   SECTOR          312 non-null    object 
 5   SRG             251 non-null    object 
 6   COMM_STRUCTURE  310 non-null    object 
 7   longitude       312 non-null    float64
 8   latitude        312 non-null    float64
 9   POINT           312 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 24.5+ KB


In [None]:
def merge_community_sector(main_data, community_data):
    """Attach a `community_sector` column to `main_data` via its `comm_code`.

    Parameters
    ----------
    main_data : pandas.DataFrame
        Frame with a `comm_code` column.
    community_data : pandas.DataFrame
        Reference table holding the community code and sector, either under the
        original names `COMM_CODE` / `SECTOR` or already renamed to
        `comm_code` / `community_sector`. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Left merge of `main_data` with the sector lookup: every row of
        `main_data` is kept exactly once; rows whose code has no match get a
        missing `community_sector`.
    """
    sector_lookup = (
        community_data
        # Non-inplace rename: the caller's frame keeps its original column names.
        .rename(columns={'COMM_CODE': 'comm_code', 'SECTOR': 'community_sector'})
        [['comm_code', 'community_sector']]
        # One sector per code (first occurrence wins), so a repeated code in the
        # reference table cannot multiply rows of main_data.
        .drop_duplicates(subset='comm_code')
    )

    return main_data.merge(sector_lookup, on='comm_code', how='left')

# Drop any community_sector left over from an earlier run first, so re-executing
# this cell does not produce community_sector_x / community_sector_y columns.
df = merge_community_sector(df.drop(columns=['community_sector'], errors='ignore'), community_data)
# df.info() prints its report itself and returns None, so it is not wrapped in print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062842 entries, 0 to 1062841
Data columns (total 32 columns):
 #   Column              Non-Null Count    Dtype                           
---  ------              --------------    -----                           
 0   service_request_id  1062842 non-null  object                          
 1   requested_date      1062842 non-null  datetime64[ns]                  
 2   updated_date        1062842 non-null  datetime64[ns]                  
 3   closed_date         1030749 non-null  datetime64[ns]                  
 4   status_description  1062842 non-null  object                          
 5   source              1062842 non-null  object                          
 6   service_name        1062842 non-null  object                          
 7   agency_responsible  1062842 non-null  object                          
 8   comm_code           989008 non-null   object                          
 9   comm_name           989009 non-null   object  

In [None]:
# Rows that received no sector from the merge, and the distinct comm_code values they carry.
null_rows = df[df['community_sector'].isnull()]
null_comm_codes = null_rows['comm_code'].unique()
print(null_comm_codes)
# Hard-coded list of codes to inspect with check_comm_codes_for_sector.
comm_codes_to_check = ['12I', 'ABT', '01I', '11A', 'ABR']
def check_comm_codes_for_sector(df, comm_codes):
    """Return the `comm_code` and `community_sector` columns of the rows of `df`
    whose `comm_code` is one of `comm_codes` (original index preserved)."""
    code_matches = df['comm_code'].isin(comm_codes)
    return df.loc[code_matches, ['comm_code', 'community_sector']]
# Rows carrying one of the listed codes; info() shows how many of them have a sector.
sector_mapping = check_comm_codes_for_sector(df, comm_codes_to_check)
sector_mapping.info()


[nan '12I' 'ABT' '01I' '11A' 'ABR']
<class 'pandas.core.frame.DataFrame'>
Index: 1406 entries, 26 to 882478
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   comm_code         1406 non-null   object
 1   community_sector  0 non-null      object
dtypes: object(2)
memory usage: 33.0+ KB


In [None]:
# Missing comm_code count vs. missing community_sector count before the manual fill.
print(df['comm_code'].isnull().sum(),df['community_sector'].isnull().sum())

73834 75240


In [None]:
# Distinct sector values currently present (including NaN).
df['community_sector'].unique()

array([nan, 'SOUTHEAST', 'EAST', 'CENTRE', 'NORTH', 'SOUTH', 'NORTHWEST',
       'NORTHEAST', 'WEST'], dtype=object)

In [None]:
# Manually assign a sector to codes that found no match in the community table.
# Only rows whose community_sector is still missing are touched.
manual_sector_by_code = {
    '12I': 'SOUTHEAST',
    '01I': 'WEST',
    'ABT': 'NORTHWEST',
    'ABR': 'NORTHWEST',
}
for code, sector in manual_sector_by_code.items():
    lacks_sector = (df['comm_code'] == code) & df['community_sector'].isnull()
    df.loc[lacks_sector, 'community_sector'] = sector

# '11A' is left without a sector (original note: "South west and south east");
# presumably it spans two sectors - confirm.

# Rows that have '05E' in comm_name but no comm_code get '05E' as their code.
# NOTE(review): this happens after the sector merge, so community_sector is not
# looked up for these rows - confirm whether it should be.
code_in_name_only = (df['comm_name'] == '05E') & df['comm_code'].isnull()
df.loc[code_in_name_only, 'comm_code'] = '05E'

print(df['comm_code'].isnull().sum(),df['community_sector'].isnull().sum())

73833 73835


Before the sector fill:<br>
`comm_code` had 73834 null values and `community_sector` had 75240.<br>
After filling the sector for comm_codes ['12I' 'ABT' '01I' 'ABR'], the null count of `community_sector` dropped to 73835 (a reduction of 1405). The sector for '11A' could not be found; only one request carries that code.


In [None]:
# Write the frame to dataVisual_311data.csv in the current working directory, without the index column.
df.to_csv("dataVisual_311data.csv", index=False)

In [None]:
# Final column overview of the exported frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062842 entries, 0 to 1062841
Data columns (total 31 columns):
 #   Column              Non-Null Count    Dtype                           
---  ------              --------------    -----                           
 0   service_request_id  1062842 non-null  object                          
 1   requested_date      1062842 non-null  datetime64[ns]                  
 2   updated_date        1062842 non-null  datetime64[ns]                  
 3   closed_date         1030749 non-null  datetime64[ns]                  
 4   status_description  1062842 non-null  object                          
 5   source              1062842 non-null  object                          
 6   service_name        1062842 non-null  object                          
 7   agency_responsible  1062842 non-null  object                          
 8   comm_code           989008 non-null   object                          
 9   comm_name           989009 non-null   object  