In [29]:
import pandas as pd
import math

### Extract Data

In [3]:
# JSON endpoint on data.cityofnewyork.us that the extract step downloads from.
URL = 'https://data.cityofnewyork.us/resource/43nn-pn8j.json'

# Paging controls: rows requested per call, and the row position to start at.
limit, offset = 1000, 0

# Start from an empty frame; the download step below fills df_raw.
df_raw = pd.DataFrame()

In [4]:
# Download the dataset page by page (`limit` rows per request).
#
# Pages are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies every previously fetched row on each
# iteration. A local cursor is used instead of mutating `offset`, so
# re-running this cell always restarts from the configured offset and
# rebuilds df_raw from scratch rather than appending duplicates.
chunks = []
page_offset = offset

while True:
    # `$order=:id` pins a stable row order; without an explicit order the API
    # does not guarantee that consecutive pages are consistent.
    url = URL + f'?$limit={limit}&$offset={page_offset}&$order=:id'
    df_chunk = pd.read_json(url)
    chunks.append(df_chunk)

    # A page shorter than `limit` (possibly empty) is the last one.
    if len(df_chunk) < limit:
        break

    page_offset += limit

df_raw = pd.concat(chunks, ignore_index=True)

### Check Dataset

In [5]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224367 entries, 0 to 224366
Data columns (total 26 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  224367 non-null  int64  
 1   dba                    223811 non-null  object 
 2   boro                   224367 non-null  object 
 3   building               223971 non-null  object 
 4   street                 224367 non-null  object 
 5   zipcode                221594 non-null  object 
 6   phone                  224364 non-null  object 
 7   inspection_date        224367 non-null  object 
 8   critical_flag          224367 non-null  object 
 9   record_date            224367 non-null  object 
 10  latitude               224085 non-null  float64
 11  longitude              224085 non-null  float64
 12  community_board        220959 non-null  float64
 13  council_district       220963 non-null  float64
 14  census_tract           220963 non-nu

(Note: the extracted data does not contain a *location point1* column.)

In [6]:
# Preview the first rows of the raw frame.
df_raw.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,...,bbl,nta,cuisine_description,action,score,grade,grade_date,inspection_type,violation_code,violation_description
0,50122211,STATEN ISLAND FERRY HAWKS,Staten Island,75,RICHMOND TERRACE,10301,9172843261.0,1900-01-01T00:00:00.000,Not Applicable,2024-04-14T06:00:11.000,...,5000020000.0,SI22,,,,,,,,
1,50135220,SHISO,Manhattan,214,EAST 9 STREET,10003,8563046681.0,1900-01-01T00:00:00.000,Not Applicable,2024-04-14T06:00:11.000,...,1004648000.0,MN22,,,,,,,,
2,50148541,TIPICO56 LLC,Manhattan,177,SHERMAN AVENUE,10034,9295034336.0,1900-01-01T00:00:00.000,Not Applicable,2024-04-14T06:00:11.000,...,1022210000.0,MN01,,,,,,,,
3,50149643,TIA ELI'S REATURANT,Brooklyn,5,EAST 31 STREET,11226,3479579837.0,1900-01-01T00:00:00.000,Not Applicable,2024-04-14T06:00:11.000,...,3048850000.0,BK95,,,,,,,,
4,50106703,J-SPEC,Manhattan,239,EAST 5 STREET,10003,3472903669.0,1900-01-01T00:00:00.000,Not Applicable,2024-04-14T06:00:11.000,...,1004610000.0,MN22,,,,,,,,


### Data Cleaning

Remove rows with entirely missing data

In [25]:
# dropna works on axis=0 by default, so this discards ROWS in which every
# field is missing; columns are never removed by this call.
# The result is rebound rather than using inplace=True, which offers no
# benefit and hides the mutation of the raw frame.
df_raw = df_raw.dropna(how='all')
df_raw.shape

(224367, 26)

Remove columns outside our Dimensions/Facts Table

In [26]:
# Columns that fall outside the dimension/fact tables.
columns_to_drop = [
    'bin',
    'bbl',
    'nta',
    'census_tract',
    'council_district',
    'community_board',
]

# Copy first so df_cleaned is fully independent of df_raw, then drop.
df_cleaned = df_raw.copy().drop(columns=columns_to_drop)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224367 entries, 0 to 224366
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   camis                  224367 non-null  int64  
 1   dba                    223811 non-null  object 
 2   boro                   224367 non-null  object 
 3   building               223971 non-null  object 
 4   street                 224367 non-null  object 
 5   zipcode                221594 non-null  object 
 6   phone                  224364 non-null  object 
 7   inspection_date        224367 non-null  object 
 8   critical_flag          224367 non-null  object 
 9   record_date            224367 non-null  object 
 10  latitude               224085 non-null  float64
 11  longitude              224085 non-null  float64
 12  cuisine_description    222074 non-null  object 
 13  action                 222074 non-null  object 
 14  score                  213626 non-nu

Remove rows in which 60% or fewer of the columns contain data (keep only rows that are more than 60% populated)

In [40]:
# A row is kept only if strictly more than this fraction of its columns
# hold a non-null value.
MIN_FILLED_FRACTION = 0.6

# `thresh` is the minimum number of non-null values a row must have;
# int(...) + 1 is always greater than n_columns * fraction.
min_non_null = int(df_cleaned.shape[1] * MIN_FILLED_FRACTION) + 1

# axis=0 -> sparse ROWS are dropped; the column set is unchanged.
# Rebinding (instead of inplace=True) keeps the mutation explicit.
df_cleaned = df_cleaned.dropna(axis=0, thresh=min_non_null)
df_cleaned.shape

(222073, 20)

In [41]:
# Preview the first rows that survived the sparse-row filter.
df_cleaned.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,inspection_date,critical_flag,record_date,latitude,longitude,cuisine_description,action,score,grade,grade_date,inspection_type,violation_code,violation_description
15,41395521,CITI FIELD FOXWOODS BAR,Queens,126,ROOSEVELT AVENUE,,7185958100.0,2017-06-13T00:00:00.000,Not Applicable,2024-04-14T06:00:09.000,0.0,0.0,American,No violations were recorded at the time of thi...,0.0,A,2017-06-13T00:00:00.000,Cycle Inspection / Initial Inspection,,
27,50083465,MALA PROJECT,Manhattan,122,1 AVENUE,10009.0,2126021005.0,2022-04-19T00:00:00.000,Critical,2024-04-14T06:00:09.000,40.727181,-73.985572,Chinese,Violations were cited in the following area(s).,33.0,,,Cycle Inspection / Initial Inspection,02B,Hot food item not held at or above 140º F.
30,41543970,PAUSE CAFE,Manhattan,3,CLINTON SREET,,2126775415.0,2022-09-27T00:00:00.000,Critical,2024-04-14T06:00:09.000,0.0,0.0,Middle Eastern,Violations were cited in the following area(s).,12.0,A,2022-09-27T00:00:00.000,Cycle Inspection / Re-inspection,02B,Hot TCS food item not held at or above 140 °F.
35,50070366,VILLAGE DINER,Queens,8174,LEFFERTS BLVD,11415.0,7188503787.0,2023-02-09T00:00:00.000,Not Critical,2024-04-14T06:00:09.000,40.707959,-73.830981,American,Violations were cited in the following area(s).,25.0,,,Cycle Inspection / Initial Inspection,09B,Thawing procedure improper.
36,41280221,WOORIJIP,Manhattan,12,WEST 32 STREET,10001.0,2122441115.0,2022-05-18T00:00:00.000,Not Critical,2024-04-14T06:00:09.000,40.747468,-73.986278,Korean,Violations were cited in the following area(s).,,,,Administrative Miscellaneous / Initial Inspection,22F,MISBRANDED AND LABELING
