# 01.00 City of LA 311 Request Data: API Data Cleaning
## Issue 1279
---
Identify addresses or small areas that could benefit from more signage, increased community assistance, or other actions

# Package & Data Imports
-----

In [1]:
import pandas as pd
from pandas import json_normalize
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter("ignore")
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)

## LA 311 Raw Data Imports
-----
[API Source](https://data.lacity.org/City-Infrastructure-Service-Requests/MyLA311-Service-Request-Data-2022/i5ke-k6by) | [Documentation](https://dev.socrata.com/foundry/data.lacity.org/i5ke-k6by) | [Hack for LA API File](https://github.com/hackforla/311-data/blob/dev/server/utils/get_request_data_csv.py)

In [2]:
# CMD LINE: python get_311_request_data_csv.py "2021-10-01" "2022-10-01"
df_311_raw = pd.read_csv('../data/01Oct21_01Oct22_api.csv')
df_311 = df_311_raw.copy()

In [3]:
print(df_311.shape)
df_311.head()

(1096525, 16)


Unnamed: 0.1,Unnamed: 0,requestId,srnumber,councilId,councilName,typeId,typeName,agencyId,agencyName,sourceId,sourceName,createdDate,closedDate,address,latitude,longitude
0,0,8610592,1-2079512481,64,Pacoima,4,Bulky Items,3,Sanitation Bureau,8,Phone Call,2021-10-01T00:01:14,2021-10-08T09:24:53,"11614 N HERRICK AVE, 91340",34.281312,-118.425453
1,1,8610569,1-2079516251,64,Pacoima,5,Electronic Waste,3,Sanitation Bureau,8,Phone Call,2021-10-01T00:02:19,2021-10-08T13:36:33,"11614 N HERRICK AVE, 91340",34.281312,-118.425453
2,2,8610575,1-2079512561,72,Sherman Oaks,4,Bulky Items,3,Sanitation Bureau,8,Phone Call,2021-10-01T00:03:11,2021-10-13T12:31:58,"4539 N VISTA DEL MONTE AVE, 91403",34.154158,-118.450025
3,3,8612855,1-2079516331,38,Historic Cultural North,6,Illegal Dumping,3,Sanitation Bureau,3,Email,2021-10-01T00:04:14,2021-10-05T14:25:02,"918 W COLLEGE ST, 90012",34.066466,-118.245113
4,4,8610587,1-2079516401,64,Pacoima,7,Metal/Appliances,3,Sanitation Bureau,8,Phone Call,2021-10-01T00:05:25,2021-10-08T14:16:32,"11614 N HERRICK AVE, 91340",34.281312,-118.425453


# Cleaning
---

## Arrange Columns

In [4]:
print(df_311.shape)
# df_311.head()

(1096525, 16)


In [5]:
df_311.columns

Index(['Unnamed: 0', 'requestId', 'srnumber', 'councilId', 'councilName',
       'typeId', 'typeName', 'agencyId', 'agencyName', 'sourceId',
       'sourceName', 'createdDate', 'closedDate', 'address', 'latitude',
       'longitude'],
      dtype='object')

In [6]:
df_311 = df_311[['requestId', 'createdDate', 'closedDate', 'typeId', 'typeName', 'address', 'latitude',
       'longitude', 'agencyId', 'agencyName', 'sourceId', 'srnumber', 'sourceName', 'councilId', 'councilName']]

## Nulls & Duplicates

In [7]:
# check null is less than 30%
print(f'df shape: {df_311.shape}')
print(f'null #/total length: {(df_311.isnull().sum().sum())/len(df_311)}')

df shape: (1096525, 15)
null #/total length: 0.049535578304188234


In [8]:
# drop if less than 30%
df_311.dropna(inplace = True)
df_311.shape

(1042208, 15)

In [9]:
# check duplicate request IDs
print(f"number of duplicates: {df_311['requestId'].duplicated().sum()}")
df_311[df_311['requestId'].duplicated() == True]

number of duplicates: 6


Unnamed: 0,requestId,createdDate,closedDate,typeId,typeName,address,latitude,longitude,agencyId,agencyName,sourceId,srnumber,sourceName,councilId,councilName
96525,8810203,2021-10-27T16:10:27,2021-10-27T16:18:38,1,Graffiti,"711 S COLUMBIA AVE, 90017",34.052395,-118.268942,4,Office of Community Beautification,18,1-2102413556,Work Crew,92,Westlake South
136525,8890322,2021-11-08T09:04:04,2021-11-08T09:06:45,1,Graffiti,"800 E 28TH ST, 90011",34.018859,-118.26067,4,Office of Community Beautification,18,1-2111333835,Work Crew,74,South Central
506525,9593516,2022-04-04T13:57:29,2022-04-05T15:48:29,7,Metal/Appliances,"720 N HALLIDAY AVE, 90049",34.074131,-118.477691,3,Sanitation Bureau,8,1-2260892061,Phone Call,0,No council
556526,9699741,2022-04-19T10:54:38,2022-04-19T10:56:03,1,Graffiti,"1015 E MANCHESTER AVE, 90001",33.960323,-118.257181,4,Office of Community Beautification,18,1-2281808695,Work Crew,21,Empowerment Congress Southeast
836526,10248603,2022-07-12T11:31:16,2022-07-12T11:35:52,1,Graffiti,"516 S MOTT ST, 90033",34.036386,-118.209504,4,Office of Community Beautification,18,1-2399489439,Work Crew,5,Boyle Heights
926528,10397670,2022-08-10T12:05:06,2022-08-10T12:14:04,1,Graffiti,"5235 W MELROSE AVE, 90038",34.083632,-118.314867,4,Office of Community Beautification,18,1-2443566980,Work Crew,41,Hollywood Studio District


In [10]:
df_311[df_311['requestId'] == 8810203]

Unnamed: 0,requestId,createdDate,closedDate,typeId,typeName,address,latitude,longitude,agencyId,agencyName,sourceId,srnumber,sourceName,councilId,councilName
96524,8810203,2021-10-27T16:10:27,2021-10-27T16:18:38,1,Graffiti,"711 S COLUMBIA AVE, 90017",34.052395,-118.268942,4,Office of Community Beautification,18,1-2102413556,Work Crew,92,Westlake South
96525,8810203,2021-10-27T16:10:27,2021-10-27T16:18:38,1,Graffiti,"711 S COLUMBIA AVE, 90017",34.052395,-118.268942,4,Office of Community Beautification,18,1-2102413556,Work Crew,92,Westlake South


In [11]:
# drop duplicates
df_311.drop_duplicates(subset = 'requestId', inplace = True)
print(df_311.shape)

(1042202, 15)


## Data Types

In [12]:
df_311.dtypes

requestId        int64
createdDate     object
closedDate      object
typeId           int64
typeName        object
address         object
latitude       float64
longitude      float64
agencyId         int64
agencyName      object
sourceId         int64
srnumber        object
sourceName      object
councilId        int64
councilName     object
dtype: object

In [13]:
# Dates to datetime
df_311['createdDate'] = pd.to_datetime(df_311['createdDate'])
df_311['closedDate'] = pd.to_datetime(df_311['closedDate'])

# councilID to object
df_311['councilID'] = df_311['councilId'].apply(lambda x: str(x))

In [14]:
df_311.dtypes

requestId               int64
createdDate    datetime64[ns]
closedDate     datetime64[ns]
typeId                  int64
typeName               object
address                object
latitude              float64
longitude             float64
agencyId                int64
agencyName             object
sourceId                int64
srnumber               object
sourceName             object
councilId               int64
councilName            object
councilID              object
dtype: object

In [15]:
print(df_311.shape)
df_311.head()

(1042202, 16)


Unnamed: 0,requestId,createdDate,closedDate,typeId,typeName,address,latitude,longitude,agencyId,agencyName,sourceId,srnumber,sourceName,councilId,councilName,councilID
0,8610592,2021-10-01 00:01:14,2021-10-08 09:24:53,4,Bulky Items,"11614 N HERRICK AVE, 91340",34.281312,-118.425453,3,Sanitation Bureau,8,1-2079512481,Phone Call,64,Pacoima,64
1,8610569,2021-10-01 00:02:19,2021-10-08 13:36:33,5,Electronic Waste,"11614 N HERRICK AVE, 91340",34.281312,-118.425453,3,Sanitation Bureau,8,1-2079516251,Phone Call,64,Pacoima,64
2,8610575,2021-10-01 00:03:11,2021-10-13 12:31:58,4,Bulky Items,"4539 N VISTA DEL MONTE AVE, 91403",34.154158,-118.450025,3,Sanitation Bureau,8,1-2079512561,Phone Call,72,Sherman Oaks,72
3,8612855,2021-10-01 00:04:14,2021-10-05 14:25:02,6,Illegal Dumping,"918 W COLLEGE ST, 90012",34.066466,-118.245113,3,Sanitation Bureau,3,1-2079516331,Email,38,Historic Cultural North,38
4,8610587,2021-10-01 00:05:25,2021-10-08 14:16:32,7,Metal/Appliances,"11614 N HERRICK AVE, 91340",34.281312,-118.425453,3,Sanitation Bureau,8,1-2079516401,Phone Call,64,Pacoima,64


In [16]:
df_311.to_csv('../data/clean_01Oct21_01Oct22_api.csv', index = False)