# Data Cleaning

weather column is not there so how are we gonna answer the guiding question, In which season do service requests occur most often? Are  we gonna find the season from the date of request?

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data=pd.read_csv("311_Service_Requests_2yrs.csv")
data.head()

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:e23ee1f41bea095fd8c889824104ec1cee0...
1,size 294803354


In [46]:
data.shape

(1093918, 15)

### Data information, description, null value count

In [47]:
print(data.info(),"\n")
print("DATA DESCRIPTION: \n",data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093918 entries, 0 to 1093917
Data columns (total 15 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   service_request_id  1093918 non-null  object 
 1   requested_date      1093918 non-null  object 
 2   updated_date        1093918 non-null  object 
 3   closed_date         1054204 non-null  object 
 4   status_description  1093918 non-null  object 
 5   source              1093918 non-null  object 
 6   service_name        1093918 non-null  object 
 7   agency_responsible  1093713 non-null  object 
 8   address             0 non-null        float64
 9   comm_code           1017629 non-null  object 
 10  comm_name           1017630 non-null  object 
 11  location_type       1017842 non-null  object 
 12  longitude           1017618 non-null  float64
 13  latitude            1017618 non-null  float64
 14  point               1017618 non-null  object 
dtypes: float64(3), 

### Formatting

In [48]:
data['requested_date'] = pd.to_datetime(data['requested_date'],format='%Y/%m/%d %I:%M:%S %p')
data['updated_date'] = pd.to_datetime(data['updated_date'],format='%Y/%m/%d %I:%M:%S %p')
data['closed_date'] = pd.to_datetime(data['closed_date'], errors='coerce',format='%Y/%m/%d %I:%M:%S %p')#If 'coerce', then invalid parsing will be set as NaT.(Not a time)

In [49]:
data.dtypes

service_request_id            object
requested_date        datetime64[ns]
updated_date          datetime64[ns]
closed_date           datetime64[ns]
status_description            object
source                        object
service_name                  object
agency_responsible            object
address                      float64
comm_code                     object
comm_name                     object
location_type                 object
longitude                    float64
latitude                     float64
point                         object
dtype: object

### Address column has more null values

In [50]:
print("Sum of Total null values in each columns:\n",data.isnull().sum())

Sum of Total null values in each columns:
 service_request_id          0
requested_date              0
updated_date                0
closed_date             39714
status_description          0
source                      0
service_name                0
agency_responsible        205
address               1093918
comm_code               76289
comm_name               76288
location_type           76076
longitude               76300
latitude                76300
point                   76300
dtype: int64


### Handling missing values

In [51]:
missing_percent = (data.isnull().sum() / len(data)) * 100
print(missing_percent)

service_request_id      0.000000
requested_date          0.000000
updated_date            0.000000
closed_date             3.630437
status_description      0.000000
source                  0.000000
service_name            0.000000
agency_responsible      0.018740
address               100.000000
comm_code               6.973923
comm_name               6.973832
location_type           6.954452
longitude               6.974929
latitude                6.974929
point                   6.974929
dtype: float64


In [52]:
#As address column has more null values, lets remove the column for further analysis
data = data.drop(columns=['address','location_type'])
print(data.columns)

Index(['service_request_id', 'requested_date', 'updated_date', 'closed_date',
       'status_description', 'source', 'service_name', 'agency_responsible',
       'comm_code', 'comm_name', 'longitude', 'latitude', 'point'],
      dtype='object')


In [53]:
#column 'closed_date'

In [54]:
newData=data.copy()

In [55]:
newData=newData.dropna(subset=['closed_date','service_name','agency_responsible','comm_code','comm_name','longitude','latitude','point'])

In [56]:
missing_percent = (newData.isnull().sum() / len(newData)) * 100
print(missing_percent)

service_request_id    0.0
requested_date        0.0
updated_date          0.0
closed_date           0.0
status_description    0.0
source                0.0
service_name          0.0
agency_responsible    0.0
comm_code             0.0
comm_name             0.0
longitude             0.0
latitude              0.0
point                 0.0
dtype: float64


In [57]:
newData.shape

(979685, 13)

In [58]:
#column service name(updated)
#way to do is fill null with the most frequent value(ie, mode)
#Same goes for column agency_responsible

In [59]:
#column longitude,latitude and point 
#mean imputation when the data shows a *normal distribution*.
#OR median imputation when the variables are *skewed*.

#KNNImputer

## new Column creation function

In [62]:
def newColumnCreation(data):
    service_nunique_values_before = data['service_name'].nunique()
    
    # Extract main service name (removing subdivisions)
    data['new_serviceNames'] = data['service_name'].str.split(' -').str[0]
    
    service_nunique_values_after = data['new_serviceNames'].nunique()
    
    print("Number of Unique Service Names Before:", service_nunique_values_before)
    print("Number of Unique Service Names After:", service_nunique_values_after)
    
    # Agency Responsible Analysis
    agency_nunique_values_before = data['agency_responsible'].nunique()
    
    # Extract main agency name
    data['new_agencyResponsible'] = data['agency_responsible'].str.split(' -').str[0]
    
    agency_nunique_values_after = data['new_agencyResponsible'].nunique()
    
    print("Number of Unique Agencies Before:", agency_nunique_values_before)
    print("Number of Unique Agencies After:", agency_nunique_values_after)
    
    #return data
newColumnCreation(newData)

Number of Unique Service Names Before: 546
Number of Unique Service Names After: 76
Number of Unique Agencies Before: 71
Number of Unique Agencies After: 23


## Data cleaning for visual

In [68]:

def clean_service_requests(newData):
    # 1. Geographic Analysis
    newData["community_request_count"] = newData.groupby("comm_name")["service_request_id"].transform('count')

    # 2. Seasonal Trends
    newData["month"] = newData["requested_date"].dt.month
    newData["year"] = newData["requested_date"].dt.year

    # Function to categorize seasons
    def get_season(month):
        if month in [12, 1, 2]: return "Winter"
        elif month in [3, 4, 5]: return "Spring"
        elif month in [6, 7, 8]: return "Summer"
        else: return "Fall"

    newData["season"] = newData["month"].apply(get_season)

    # 3. Response Efficiency
    newData["response_time"] = (newData["updated_date"] - newData["requested_date"]).dt.days
    newData["resolution_time"] = (newData["closed_date"] - newData["requested_date"]).dt.days

    # Save the cleaned dataset
    newData.to_csv("service_requests_cleaned.csv", index=False)

    return newData


newData_cleaned = clean_service_requests(newData)
newData_cleaned

Unnamed: 0,service_request_id,requested_date,updated_date,closed_date,status_description,source,service_name,agency_responsible,comm_code,comm_name,...,latitude,point,new_serviceNames,new_agencyResponsible,community_request_count,month,year,season,response_time,resolution_time
8,23-00002077,2023-01-02,2023-01-07,2023-01-07,Closed,Phone,Parks - Snow and Ice Concerns - WAM,CS - Calgary Parks,MCK,MCKENZIE LAKE,...,50.914881,POINT (-113.988181270219 50.914880843208),Parks,CS,7343,1,2023,Winter,5,5
9,23-00002084,2023-01-02,2023-01-02,2023-01-02,Closed,Other,AS - Pick Up Stray,CS - Calgary Community Standards,FLN,FOREST LAWN,...,51.037848,POINT (-113.971230283867 51.037847718454),AS,CS,7698,1,2023,Winter,0,0
11,23-00002080,2023-01-02,2023-01-03,2023-01-03,Closed,Phone,WATS - Water Quality,UEP - Water Services,BRT,BRITANNIA,...,51.012675,POINT (-114.086314872035 51.012674541829),WATS,UEP,997,1,2023,Winter,1,1
12,23-00002085,2023-01-02,2023-01-03,2023-01-03,Closed,App,Parks - Snow and Ice Concerns - WAM,CS - Calgary Parks,BED,BEDDINGTON HEIGHTS,...,51.131644,POINT (-114.084911140805 51.131643985322),Parks,CS,6713,1,2023,Winter,1,1
13,23-00002086,2023-01-02,2023-01-03,2023-01-03,Closed,App,Parks - Snow and Ice Concerns - WAM,CS - Calgary Parks,QLD,QUEENSLAND,...,50.938224,POINT (-114.022571706061 50.938224120413),Parks,CS,2700,1,2023,Winter,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1093889,25-00062491,2025-01-27,2025-01-27,2025-01-27,Closed,Other,Roads - Traffic or Pedestrian Light Repair,OS - Mobility,HAR,HARVEST HILLS,...,51.147678,POINT (-114.05277797732376 51.14767758603047),Roads,OS,3807,1,2025,Winter,0,0
1093891,25-00062810,2025-01-27,2025-01-27,2025-01-27,Closed,Other,CT - Transit Pass Programs,OS - Calgary Transit,BNK,BANKVIEW,...,51.034128,POINT (-114.10048318978971 51.0341275275111),CT,OS,4627,1,2025,Winter,0,0
1093893,25-00062770,2025-01-27,2025-01-27,2025-01-27,Closed,Other,CT AC - Trip Feedback - CTA,OS - Calgary Transit,OGD,OGDEN,...,50.991873,POINT (-114.01243870034233 50.99187347808853),CT AC,OS,7740,1,2025,Winter,0,0
1093894,25-00065151,2025-01-27,2025-01-27,2025-01-27,Closed,Other,WATS - Sewage Back-up,OS - Water Services,FAL,FALCONRIDGE,...,51.103350,POINT (-113.94533995179364 51.10334979554837),WATS,OS,5763,1,2025,Winter,0,0


In [66]:
newData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 979685 entries, 8 to 1093916
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   service_request_id     979685 non-null  object        
 1   requested_date         979685 non-null  datetime64[ns]
 2   updated_date           979685 non-null  datetime64[ns]
 3   closed_date            979685 non-null  datetime64[ns]
 4   status_description     979685 non-null  object        
 5   source                 979685 non-null  object        
 6   service_name           979685 non-null  object        
 7   agency_responsible     979685 non-null  object        
 8   comm_code              979685 non-null  object        
 9   comm_name              979685 non-null  object        
 10  longitude              979685 non-null  float64       
 11  latitude               979685 non-null  float64       
 12  point                  979685 non-null  object  

## Merging community sector to main dataset

In [89]:
community_data=pd.read_csv("Community_Points_20250205.csv")
print(community_data['COMM_CODE'].isnull().sum(),community_data['SECTOR'].isnull().sum())

0 0


In [90]:
def merge_community_sector(main_data, community_data):
    # Rename the relevant columns in the community_data for clarity and consistency
    community_data.rename(columns={'COMM_CODE': 'comm_code', 'SECTOR': 'community_sector'}, inplace=True)
    
    # Merge the datasets based on the 'comm_code'
    merged_data = main_data.merge(community_data[['comm_code', 'community_sector']], on='comm_code', how='left')
    
    # Save the merged dataset
    merged_data.to_csv("merged_service_requests.csv", index=False)
    
    return merged_data

# Assuming newData and community_data are already loaded
merged_data = merge_community_sector(newData, community_data)
print(merged_data.info())
null_rows = merged_data[merged_data['community_sector'].isnull()]
null_comm_codes = null_rows['comm_code'].unique()
print(null_comm_codes)
comm_codes_to_check = ['12I', 'ABT', '01I', '11A', 'ABR']
def check_comm_codes_for_sector(df, comm_codes):
    filtered_data = df[df['comm_code'].isin(comm_codes)]
    sector_mapping = filtered_data[['comm_code', 'community_sector']]
    
    return sector_mapping
sector_mapping = check_comm_codes_for_sector(merged_data, comm_codes_to_check)
sector_mapping.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979685 entries, 0 to 979684
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   service_request_id       979685 non-null  object        
 1   requested_date           979685 non-null  datetime64[ns]
 2   updated_date             979685 non-null  datetime64[ns]
 3   closed_date              979685 non-null  datetime64[ns]
 4   status_description       979685 non-null  object        
 5   source                   979685 non-null  object        
 6   service_name             979685 non-null  object        
 7   agency_responsible       979685 non-null  object        
 8   comm_code                979685 non-null  object        
 9   comm_name                979685 non-null  object        
 10  longitude                979685 non-null  float64       
 11  latitude                 979685 non-null  float64       
 12  point           

In [91]:
merged_data.isnull().sum()

service_request_id            0
requested_date                0
updated_date                  0
closed_date                   0
status_description            0
source                        0
service_name                  0
agency_responsible            0
comm_code                     0
comm_name                     0
longitude                     0
latitude                      0
point                         0
new_serviceNames              0
new_agencyResponsible         0
community_request_count       0
month                         0
year                          0
season                        0
response_time                 0
resolution_time               0
community_sector           1404
dtype: int64

### Column "service_name" 

In [16]:
service_nunique_values=data['service_name'].nunique()
print(service_nunique_values)

640


In [38]:
'''service_unique_values=data['service_name'].unique()
print(service_unique_values)'''

"service_unique_values=data['service_name'].unique()\nprint(service_unique_values)"

In [18]:
service_unique_values_cnt=data['service_name'].value_counts()
print(service_unique_values_cnt)

service_name
WRS - Cart Management                      51648
Finance - Property Tax Account Inquiry     37459
Bylaw - Snow and Ice on Sidewalk           34438
Finance - ONLINE TIPP Agreement Request    28318
Corporate - Graffiti Concerns              22096
                                           ...  
CAI - Employee Complaint - Compliment          1
WATR - Water Brochure                          1
PSD - Major Mobility - Paving Program          1
WRS - Chatbot Feedback                         1
CPI - Employee Complaint - Compliment          1
Name: count, Length: 640, dtype: int64


In [19]:
data['new_serviceNames'] = data['service_name'].str.split(' -').str[0] #extracted main service name and removed sub division of that.
sn_unique_values = data['new_serviceNames'].unique()
print(sn_unique_values)

['Finance' 'Active Living Program Application' 'CN' 'CT' 'Recreation'
 'CT AC' 'REC' 'Parks' 'AS' 'WATS' 'Bylaw' 'Roads' 'Corporate'
 'CBS Inspection' '311 Contact Us' 'WATR' 'WRS' 'After Hours Transit'
 'Compliance' 'Opinions on Business Units' 'HR' 'CFD' 'CSC'
 'Animal / Bylaw' 'CBS' 'Partnerships' 'Customer Service & Communications'
 'UEP' 'CAI' 'Law' 'GFL' 'TP' 'CBS Concern' 'Calgary Housing' 'RSP' 'DBBS'
 'Public' "Mayor's Office" 'Parks PlayBins' 'ASMT' 'Roads Permits'
 'Green Line Inquiry' 'PSD' 'Stewardship of City Owned Land'
 'Facility Mgmt' 'REDS' 'TI' 'AT' 'BIA Requests' 'CEMA' 'CGS' 'Fleet'
 'CPA' 'CP' 'CMO' "City Clerk's" 'Business Safety' 'CED'
 'Parks Seasonal Vendor Application' 'IT' 'Fleet and Inventory'
 'Regulatory Affairs' 'Supply' 'UD' 'LEAD Program Inquiry'
 'Community Safety' 'DBBS Inspection' 'VFH' 'ESM' 'City Auditors'
 'DBBS Concern' 'Community Clean Up Event' 'CRP' 'CE'
 'Playfield Portisan Placement Request' 'CAC' 'ACPL' 'ZZZ Business Safety'
 'CPI' 'ZZZ VF

In [20]:
unique_values = data['new_serviceNames'].nunique()
print("Number of Unique values: ",unique_values)
unique_values_cnt = data['new_serviceNames'].value_counts()
print(unique_values_cnt)

Number of Unique values:  86
new_serviceNames
Roads                           183532
WRS                             141593
Bylaw                           115765
Parks                            94314
Finance                          87481
                                 ...  
Comm Strategies                      6
City Auditors                        5
Community Clean Up Event             2
CAC                                  1
Eau Claire Area Improvements         1
Name: count, Length: 86, dtype: int64


In [21]:
#count of particular value
count = (data['new_serviceNames'] == 'AS').sum()
count

24176

### Column "agency_responsible"

In [22]:
agency_nunique_values=data['agency_responsible'].nunique()
print(agency_nunique_values)

77


In [23]:
agency_unique_values=data['agency_responsible'].unique()
print(agency_unique_values)

['CFOD - Finance' 'CS - Recreation and Social Programs'
 'CS - Calgary Neighbourhoods' 'OS - Calgary Transit'
 'CS - Calgary Recreation' 'TRAN - Calgary Transit' 'CS - Calgary Parks'
 'CS - Calgary Community Standards' 'UEP - Water Services'
 'OS - Parks and Open Spaces' 'TRAN - Roads'
 'PD - Calgary Building Services'
 'CFOD - Customer Services and Communications' 'UEP - Water Resources'
 'UEP - Waste and Recycling Services' 'CPFS - Assessment and Tax'
 'Corporate Wide Service Requests'
 'CS - Emergency Management and Community Safety'
 'OS - Waste and Recycling Services' 'PICS - Human Resources'
 'CS - Calgary Fire' 'OS - Mobility' 'Tranc - Calgary Transit'
 'Partnerships' 'DCMO - Corporate Analytics and Innovation' 'LL - Law'
 'Uepc - Waste and Recycling Services' 'TRAN - Transportation Planning'
 'OS - Water Services' 'CS - Calgary Housing'
 'Recreation and Social Programs'
 'PDS - Development, Business and Building Services'
 'DCMO - Facility Management' 'Elected Officials'
 'PICS

In [24]:
agency_unique_values_cnt=data['agency_responsible'].value_counts()
print(agency_unique_values_cnt)

agency_responsible
CS - Emergency Management and Community Safety    168193
OS - Mobility                                     161207
OS - Waste and Recycling Services                 122552
CPFS - Assessment and Tax                          94478
OS - Parks and Open Spaces                         86389
                                                   ...  
Office of the City Auditor                             5
AO - Calgary Parking Authority                         4
Fleet and Inventory                                    4
CFOD - Information Technology                          1
PD - Calgary Approvals Coordination                    1
Name: count, Length: 77, dtype: int64


In [25]:
data['new_agencyResponsible'] = data['agency_responsible'].str.split(' -').str[0] 
ag_unique_values = data['new_agencyResponsible'].unique()
print(ag_unique_values)

['CFOD' 'CS' 'OS' 'TRAN' 'UEP' 'PD' 'CPFS'
 'Corporate Wide Service Requests' 'PICS' 'Tranc' 'Partnerships' 'DCMO'
 'LL' 'Uepc' 'Recreation and Social Programs' 'PDS' 'Elected Officials'
 'IS' 'LLSS' 'OSC' 'Affiliated Organizations' 'AO' 'Fleet and Inventory'
 'Office of the City Auditor' nan]


In [26]:
ag_unique_values = data['new_agencyResponsible'].nunique()
print("Number of Unique values: ",ag_unique_values)
ag_unique_values_cnt = data['new_agencyResponsible'].value_counts()
print(ag_unique_values_cnt)

Number of Unique values:  24
new_agencyResponsible
OS                                 478338
CS                                 272992
CPFS                                96809
PDS                                 86972
UEP                                 32318
TRAN                                28746
CFOD                                23488
PICS                                23179
PD                                  22525
OSC                                  7622
Corporate Wide Service Requests      5152
IS                                   5069
Recreation and Social Programs       3029
LLSS                                 2393
Uepc                                 2300
Tranc                                1228
DCMO                                  537
LL                                    477
Elected Officials                     367
AO                                    118
Partnerships                           40
Affiliated Organizations                5
Office of the City Audito

In [27]:
data.to_csv('311_service_2yr_basic.csv', index=False)