In [1]:
#Basic dataframe & numerical libraries
import pandas as pd 

#Importing visualization libraries for exploratory analysis
import matplotlib.pyplot as plt 
import seaborn as sns 

#Importing to standardize formatting (geolocation)
import geopy

#Importing geocoder classes
from geopy.geocoders import GoogleV3

#This library (safetyfile) contains a Google Maps API key.
#It is excluded from the uploaded dataset in the interest of informational security.
import safetyfile
from safetyfile import googleapi

print(type(googleapi))

<class 'str'>


In [2]:
#Reading original CSV to dataframe
gtgarden = pd.read_csv('GreenThumb_Garden_Info_20240916.csv')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [3]:
#Looking at a limited sample of entries
gtgarden.sample(5)

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets
233,66,511 LaGuardia Place,M,102,10,2,LaGuardia Corner Community Garden,DOT,MULTIPOLYGON (((-73.9989687344228 40.727911976...,6:00 p.m. to 8:00 p.m.,...,6,27,Active,10012,1005240000.0,MN23 /,55.01,40.727582,-73.998737,Bleecker & Houston Streets.
553,54,2379 Pitkin Avenue,B,305,8,42,Floral Vineyard,DPR,MULTIPOLYGON (((-73.88460294991685 40.67367165...,,...,75,19,Active,11207,3040000000.0,BK82 /,1150.0,40.673754,-73.884592,Cleveland & Ashford Street
589,53,207 South 2nd Street,B,301,7,34,El Puente: Espiritu Tierra Community Garden - ...,DPR,MULTIPOLYGON (((-73.95967599680367 40.71246980...,12:30 p.m. - 1:30 p.m. & 5:00 p.m. - 6:00 p.m.,...,90,59,Active,11211,3024070000.0,BK73 /,523.0,40.712503,-73.959351,Roebling Street & Driggs Avenue
6,72,5240 Broadway Bronx NY 10463,X,208,13,10,Marble Hill Garden - Marble Hill Houses (NYCHA),NYCHA,MULTIPOLYGON (((-73.90798819736936 40.87568562...,9:00 A.M. - 11:30 A.M.,...,50,31,Active,10463,1022150000.0,/,,,,Broadway & West 228th
470,55,1969 Fulton Street,B,316,8,41,Saratoga Farm,DPR,MULTIPOLYGON (((-73.91607709166529 40.67885021...,9:00 a.m. - 12:30 p.m. & 12:30 p.m. - 3:00 p.m.,...,73,25,Active,11233,3015370000.0,BK79 /,371.0,40.67876,-73.916018,Saratoga St. & Hull St.


In [4]:
#It looks like Pandas incorrectly read in ZIP Codes as floats...
#These function below should fix it.
def repairzip(textobj):
      return str(textobj).replace(',','')

In [5]:
#Let's put into action!
gtgarden['zipcode'] = gtgarden['zipcode'].apply(repairzip)
print(gtgarden['zipcode'].sample(5))

316    10027
475    11221
204    10027
378    11212
150    11693
Name: zipcode, dtype: object


In [6]:
#Finding coordinates problem entries, slicing into separate DataFrame
#We can use 'lat' as a proxy for both latitude and longitude: when one is absent, the other is absent

slice = gtgarden[pd.isnull(gtgarden['lat'])].copy()
slice.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59 entries, 1 to 615
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       59 non-null     int64  
 1   address            59 non-null     object 
 2   borough            59 non-null     object 
 3   communityboard     59 non-null     int64  
 4   congressionaldist  59 non-null     int64  
 5   coundist           59 non-null     int64  
 6   gardenname         59 non-null     object 
 7   juris              59 non-null     object 
 8   multipolygon       59 non-null     object 
 9   openhrsf           25 non-null     object 
 10  openhrsm           24 non-null     object 
 11  openhrssa          27 non-null     object 
 12  openhrssu          23 non-null     object 
 13  openhrsth          25 non-null     object 
 14  openhrstu          24 non-null     object 
 15  openhrsw           25 non-null     object 
 16  parksid            59 non-null  

In [7]:
#Creating GoogleV3 class, searches using Google Map API to identify submitted addresses
#The aforementioned API key is used here.

geolocator = GoogleV3(api_key=googleapi)

In [8]:
#Using .apply() to basically create a Google Maps query for the address
#Some addresses lack building numbers: adding in the garden name AND ZIP Code gets around this problem
slice['pseudoaddress'] = slice.apply(lambda row: f'{row['gardenname']} {row['address']} {row['zipcode']}', axis = 1)

In [9]:
#Extracting geocodes relevant to each item...
slice['geocode'] = slice['pseudoaddress'].apply(lambda x: geolocator.geocode(x))

#This returns a geocode inherently incorporating both latitude and longitude
#On the off-chance a location is not on Google Maps, however, it might return 'None' instead

In [10]:
#And applying back as necessary...
def gc_lat(geocode):
    try:
        return geocode.latitude
    except AttributeError as err:
        return None
    
def gc_lon(geocode):
    try:
        return geocode.longitude
    except AttributeError as err:
        return None

slice['lat'] = slice['geocode'].apply(gc_lat).astype('float')
slice['lon'] = slice['geocode'].apply(gc_lon).astype('float')

In [11]:
#It turns out that there's a singular row in which the Google API was unable to determine its location...
#At index 130 is the "South Beach community garden NYCHA" at 100 Kramer street 10306.
#It's entirely unindexed by Google Maps. We do still have a standard address.
#We can clean this one up manually.

slice.loc[130, 'lat'] = gc_lat( geolocator.geocode(slice.loc[130, 'address']) )
slice.loc[130, 'lon'] = gc_lon( geolocator.geocode(slice.loc[130, 'address']) )

#Some other addresses only state the street name in ALL CAPS rather than the address.
#Google Maps, based on the provided information, is still able to approximate these locations.

In [12]:
#Let's drop the added column "pseudoaddress" now that we no longer need it...
slice = slice.drop(columns=['geocode'])

In [13]:
#With that done, let's now join this content back into the main dataframe.
gtgarden.update(slice, overwrite=False, join='left', errors='ignore')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [14]:
#Just to double check... no null values!
gtgarden[gtgarden['lon'].isnull()]

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets


In [15]:
#But we're seeing a problem with 'CensusTract'.
gtgarden[gtgarden['CensusTract'].isnull()].sample(5)

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets
168,32,"12001 142nd Place, South Ozone Parks",Q,412,5,28,12001 142nd Place Community Garden,DPR,MULTIPOLYGON (((-73.79876804928313 40.67618682...,,...,113,10,Inactive (No Group),11436,4120350000.0,/,,40.676191,-73.798589,
142,30,"31-57 51st St, Woodside, New York 11377",Q,401,7,25,Moore Jackson Cemetery/Garden,PRI,MULTIPOLYGON (((-73.90799011529285 40.75621456...,,...,114,12,Not GreenThumb,11377,4011310000.0,/,,40.756035,-73.907658,
132,63,BROAD STREET,R,501,11,49,Broad Street Community Garden,PRI,MULTIPOLYGON (((-74.0761954432587 40.625767229...,,...,120,23,Not GreenThumb,10304,5005250000.0,/,,40.624343,-74.080305,Brownell
469,46,2871 Surf Avenue,B,313,8,47,Surfside Garden Multi-Cultural Coalition,DPR,MULTIPOLYGON (((-73.99459582875232 40.57306887...,2:30 p.m. - 6:00 p.m.,...,60,23,Active,11224,3070520000.0,/,,40.573371,-73.994704,At W. 29th Street between Surf and Mermaid
366,57,"228 York Street, Brooklyn, NY 11201",B,302,7,35,Pulse Garden (NYCHA - Farragut Houses ),NYCHA,MULTIPOLYGON (((-73.98086416411832 40.70105330...,,...,84,25,Active,11201,3000710000.0,/,,40.700646,-73.98158,York Street & Hudson Ave


In [16]:
#While geopy doesn't have native support for US Census Geocoder API...
#A small package called 'censusgeocode' does.

import censusgeocode as cg

In [17]:
#Let's make another slice.
slice = gtgarden[gtgarden['CensusTract'].isnull()].copy()
slice.sample(5)['CensusTract']

86    NaN
114   NaN
615   NaN
11    NaN
9     NaN
Name: CensusTract, dtype: float64

In [18]:
#defining a function that can be used with apply
def extractcensustract(row):
    inlat = row['lat']
    inlon = row['lon']
    resultobj = cg.coordinates(x=inlon, y=inlat, returntype='geographies')
    tract = resultobj['Census Tracts'][0]['TRACT']
    tract = float(tract[:4]+"."+tract[4:])
    return tract

#Census tracts can either be expressed as a 6 digit code or as a float:
    #That is, tract 57.02 can be written as 005702 and vice versa.
    #For the purposes of this cleaning, we're converting all tracts into floats.
    #Actually, this makes them easier to find: most public resources use their float identity.

#Example
extractcensustract(slice.sample(1))

595.01

In [19]:
#Applying the function to the slice
slice['CensusTract'] = slice.apply(extractcensustract, axis=1)
slice[['address','CensusTract','lat','lon']].sample(5)

Unnamed: 0,address,CensusTract,lat,lon
200,15 East 99th Street NY 10029,166.0,40.78815,-73.950761
615,"46 Columbia Pl, Brooklyn, NY 11201",7.0,40.692538,-73.999049
139,BAYVIEW TERRACE,176.0,40.52653,-74.164978
372,201 Myrtle Avenue Brooklyn NY 11201,15.01,40.693527,-73.98001
485,786 Livonia Ave. Brooklyn,1124.0,40.665511,-73.887822


In [20]:
#Return again to the main dataframe!
gtgarden.update(slice, overwrite=False, join='left', errors='ignore')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [21]:
#We see that some results still lack crossStreets: that is, intersections.
slice = gtgarden[gtgarden['crossStreets'].isna()].copy()
print(slice.sample(5))

#Unfortunately, Google's API doesn't support returning intersections.
#In some cases, identifiying intersections might be inappropriate.
#Given that we're cleaning this dataset for later visualization, this column isn't essential for user use.

#For now, we'll fill these with the string value 'N/A'.
#These can be updated with new values from an updated version of the sheet.

gtgarden['crossStreets'] = gtgarden['crossStreets'].fillna('N/A')

#We can, however, address some shorthand which might not show up well in our ultimate visualization.
gtgarden['crossStreets'] = gtgarden['crossStreets'].replace(r'[Bb][Tt][Ww][Nn]?', r'Between', regex=True)

     assemblydist                              address borough  \
143            37                   61-1 Tonsor Street       Q   
239            71                499 West 150th Street       M   
1              83  3601 Marolla Place, Bronx, NY 10466       X   
244            68                  326 Pleasant Avenue       M   
114            85      1225 Hoe Avenue Bronx, NY 10459       X   

     communityboard  congressionaldist  coundist  \
143             405                  7        30   
239             109                 13         7   
1               212                 14        12   
244             111                 13         8   
114             203                 15        17   

                                    gardenname juris  \
143                 Ridgewood Community Garden   DOE   
239  Lucille McClarey Wicked Friendship Garden  NYRP   
1            Ujamaa Northeast Community Garden   DOT   
244                     Los Amigos Garden NYRP  NYRP   
114       

In [22]:
#It still looks like we have some blank values here and there...
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [23]:
#Column indices [9,15] are all describing open hours.
#A bit confusingly, they go in the order of: [Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday].
#We can conver this to [Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday] at a later point.

slice = gtgarden.iloc[:,9:16].copy()
slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
188,,,,,,,
143,,,,,,,
310,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.
280,2:00 p.m. - 6:00 p.m.,,2:00 p.m. - 6:00 p.m.,,2:00 p.m. - 6:00 p.m.,2:00 p.m. - 6:00 p.m.,2:00 p.m. - 6:00 p.m.
166,10:00 a..m. - 3:00 p..m..,CLOSED,10:00 a..m. - 3:00 p..m..,CLOSED,10:00 a..m. - 3:00 p..m..,10:00 a..m. - 3:00 p..m..,CLOSED
508,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.,1100 a.m. - 4:00 p.m.,TBD - when garden gate is open,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.
338,CLOSED,CLOSED,CLOSED,CLOSED,CLOSED,CLOSED,CLOSED
494,,,10:00 a.m. - 3:00 p.m.,,,,
340,,,,,,,
243,,,,,,,


In [24]:
#There seem to be some tiny errors...
slice[slice=='close'].count()

#For example, instances of having written "close" or "Close" or even "Closed" as opposed to standard "CLOSED".
#This is an easy fix.

openhrsf     0
openhrsm     0
openhrssa    1
openhrssu    2
openhrsth    0
openhrstu    1
openhrsw     2
dtype: int64

In [25]:
#This regex searches for any variation on "CLOSED", D-optional, and replace them with "CLOSED".
slice = slice.replace(r'[Cc][Ll][Oo][Ss][Ee][dD]?','CLOSED', regex=True)

In [26]:
#There's still tiny inconsistencies like how some entries include "a" or "p" in place of "a.m." or "p.m."
#We can also fix that easily.

slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
188,,,,,,,
528,9:00a - 11:30a,9:00a - 11:30a,10:00p - 3:00p,,9:00a - 11:30a,9:00a - 11:30a,9:00a - 11:30a
168,,,,,,,
470,9:00 a.m. - 12:30 p.m. & 12:30 p.m. - 3:00 p.m.,9:00 a.m. - 12:30 p.m. & 12:30 p.m. - 3:00 p.m.,9:00 a.m. - 12:30 p.m. & 12:30 p.m. - 3:00 p.m.,,9:00 a.m. - 12:30 p.m. & 12:30 p.m. - 3:00 p.m.,9:00 a.m. - 12:30 p.m. & 12:30 p.m. - 3:00 p.m.,9:00 a.m. - 12:30 p.m. & 12:30 p.m. - 3:00 p.m.
263,,,9a.m. - dusk,9a.m. - dusk,,,
135,,,,,,,
50,11:00 a.m. - 5:00 p.m.,1:00 p.m. - 5:00 p.m.,11:00 a.m. - 5:00 p.m.,,11:00 a.m. - 5:00 p.m.,11:00 a.m. - 5:00 p.m.,11:00 a.m. - 5:00 p.m.
349,,,7:00 a.m. - 8:00 p.m.,7:00 a.m. - 8:00 p.m.,,,
281,3:00 p.m. - 7:00 p.m.,3:00 p.m. - 7:00 p.m.,3:00 p.m. - 7:00 p.m.,,3:00 p.m. - 7:00 p.m.,3:00 p.m. - 7:00 p.m.,3:00 p.m. - 7:00 p.m.
226,,,,,,,


In [27]:
#Correcting single character formatting
slice = slice.replace(r'(\d*:?\d*)([Aa])(\s)','\\1 a.m.\\3',regex=True)
slice = slice.replace(r'(\d*:?\d*)([Aa])(\s?$)','\\1 a.m.\\3',regex=True)
slice = slice.replace(r'(\d*:?\d*)([Pp])(\s)','\\1 p.m.\\3',regex=True)
slice = slice.replace(r'(\d*:?\d*)([Pp])(\s?$)','\\1 p.m.\\3',regex=True)

#Eliminating inconsistent spacing and stray numerals
slice = slice.replace(r'([1-9])(:)([Pp])','\\1:00 \\3',regex=True)
slice = slice.replace(r'([1-9])(:)([Aa])','\\1:00 \\3',regex=True)
slice = slice.replace(r'^([1-9]?[1-9])(:)?\s*([Pp])','\\1:00 \\3',regex=True)
slice = slice.replace(r'^([1-9]?[1-9])(:)?\s*([Aa])','\\1:00 \\3',regex=True)

#General consistency
slice = slice.replace(r'[Aa].?[Mm].?','a.m.', regex=True)
slice = slice.replace(r'[Pp].?[Mm].?','p.m.', regex=True)
slice = slice.replace(r'-','to', regex=True)
slice = slice.replace(r'Noon','12:00 p.m.', regex=True)
slice = slice.replace(r'.-', ' -', regex=True)
slice = slice.replace(r'\s?(:)\s?',':',regex=True)
slice = slice.replace(r'(\w)(to)(\w)',r'\1 to \3', regex=True)


#Dealing with lists
slice = slice.replace(r'\s?(,|&|;)(\s*)(\d)',r';\n\3', regex=True)


slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
288,CLOSED,CLOSED,9:00 a.m. to 1:00 p.m.,9:00 a.m. to 1:00 p.m.,CLOSED,CLOSED,3:00 p.m.to5:00 p.m.
352,9:00 a.m. to 12:00 p.m.;\n2:00 p.m. to 4:00 p.m.,10:00 a.m. to 12:00 p.m.,9:00 a.m. to 2:00 p.m.,9:00 a.m. to 12:00 p.m.;\n2:00 p.m. to 6:00 p.m.,10:30 a.m. to 12:30 p.m.;\n4:00 p.m. to 5:00 p.m.,10:00 a.m. to 12:00 p.m.;\n3:30 p.m. to 6:30 p.m.,2:00 p.m. to 4:00 p.m.;\n5:30 p.m. to 6:30 p.m.
330,9:00 a.m.to 3:00 p.m.,9:00 a.m.to 3:00 p.m.,9:00 a.m.to 3:00 p.m.,9:00 a.m.to 3:00 p.m.,9:00 a.m.to 3:00 p.m.,9:00 a.m.to 3:00 p.m.,9:00 a.m.to 3:00 p.m.
162,,,,,,,
489,12:00 p.m. to 5:00 p.m.,1:00 p.m. to 5:00 p.m.,,,,1:00 p.m. to 5:00 p.m.,1:00 p.m. to 5:00 p.m.
519,CLOSED,11:00 a.m. to 1:00 p.m.,11:00 a.m. to 1:00 p.m.,11:00 a.m. to 1:00 p.m.,11:00 a.m. to 1:00 p.m.,11:00 a.m. to 1:00 p.m.,11:00 a.m. to 1:00 p.m.
290,,,,9:00 a.m. to 7:00 p.m.,,,
574,10:00 a.m. to 5:00 p.m.,10:00 a.m. to 5:00 p.m.,10:00 a.m. to 5:00 p.m.,,10:00 a.m. to 5:00 p.m.,10:00 a.m. to 5:00 p.m.,10:00 a.m. to 5:00 p.m.
244,,,,,,,
375,8:00 a.m. to 3:00 p.m.,8:00 a.m. to 3:00 p.m.,8:00 a.m. to 3:00 p.m.,8:00 a.m. to 3:00 p.m.,8:00 a.m. to 3:00 p.m.,8:00 a.m. to 3:00 p.m.,8:00 a.m. to 3:00 p.m.


In [28]:
#Unlike latitude or longitude, we can't extrapolate other information to fill these times.
#It's probably not appropriate to assume that they're closed during unlisted times either...

#Pending further updates on the original sheet, we can fill these with a 'N/A' label.

#There's data-original oddities like 'Sunset to Sundown' at '955 Columbus Avenue'.
#It might best to leave these alone: again, there's not other information to extrapolate from for proper corrections.

slice = slice.fillna('N/A')
slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
355,CLOSED,CLOSED,12:00 p.m. to 4:00 p.m.,12:00 p.m. to 4:00 p.m.,CLOSED,5:30 p.m. to 7:30 p.m.,CLOSED
612,,,10:00 a.m.to 06:00 p.m.,,,,
320,11:00 a.m. to 2:00 p.m.,11:00 a.m. to 2:00 p.m.,11:00 a.m. to 4:00 p.m.,11:00 a.m. to 4:00 p.m.,11:00 a.m. to 2:00 p.m.,11:00 a.m. to 2:00 p.m.,11:00 a.m. to 2:00 p.m.
134,5:00 p.m. to 7:00 p.m.,5:00 p.m. to 7:00 p.m.,10:00 a.m. to 5:00 p.m.,10:00 a.m. to 5:00 p.m.,5:00 p.m. to 7:00 p.m.,5:00 p.m. to 7:00 p.m.,5:00 p.m. to 7:00 p.m.
623,,,,,,,
419,,,,,,,
485,12:00 p.m. to 2:00 p.m.,CLOSED,9:00 a.m. to 2:00 p.m.,CLOSED,12:00 p.m. to 2:00 p.m.,CLOSED,12:00 p.m. to 2:00 p.m.
490,4:00 p.m. to 8:00 p.m.,4:00 p.m. to 8:00 p.m.,4:00 p.m. to 8:00 p.m.,4:00 p.m. to 8:00 p.m.,4:00 p.m. to 8:00 p.m.,4:00 p.m. to 8:00 p.m.,4:00 p.m. to 8:00 p.m.
33,8:00 a.m. to 1:00 p.m.,8:00 a.m. to 1:00 p.m.,8:00 a.m. to 1:00 p.m.,8:00 a.m. to 1:00 p.m.,,,
57,,,12:00 p.m. to 5:00 p.m.,12:00 p.m. to 5:00 p.m.,,5:00 p.m. to 7:00 p.m.,


In [29]:
#Back to the main dataframe.
gtgarden.update(slice, overwrite=True, join='left', errors='ignore')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           624 non-null    object 
 10  openhrsm           624 non-null    object 
 11  openhrssa          624 non-null    object 
 12  openhrssu          624 non-null    object 
 13  openhrsth          624 non-null    object 
 14  openhrstu          624 non-null    object 
 15  openhrsw           624 non-null    object 
 16  parksid            624 non

In [30]:
#Nice! Now let's clear away non-necessities...
gtgarden = gtgarden.map(lambda x: x.strip() if isinstance(x, str) else x)
gtgarden.sample(5)

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets
615,52,"46 Columbia Pl, Brooklyn, NY 11201",B,302,10,33,Willowtown Community Garden,DPR,MULTIPOLYGON (((-73.99893442421437 40.69261977...,,...,84,26,Not GreenThumb,11201,3002580000.0,/,7.0,40.692538,-73.999049,
509,55,1833 Strauss Street,B,316,8,41,Fred McLeod Community Garden,DPR,MULTIPOLYGON (((-73.91633621882795 40.66922119...,CLOSED,...,73,25,Active,11212,3034940000.0,BK81 /,902.0,40.669343,-73.916462,Between East New York Ave. and Pitkin Ave.
612,56,742 Monroe Street,B,303,8,41,Citizens for a Better Community Garden,DPR,MULTIPOLYGON (((-73.92698678594263 40.68808606...,,...,81,25,Active,11221,3016430000.0,BK35 /,385.0,40.687841,-73.926698,Monroe St. & Madison St.
533,55,633 Powell Street,B,316,8,42,A Patch of Inspiration,DPR,MULTIPOLYGON (((-73.90070909283055 40.65854196...,CLOSED,...,73,19,Active,11212,3038580000.0,BK81 /,920.0,40.658494,-73.900895,New Lots Avenue and Powell St.
466,55,615 Saratoga Avenue,B,316,8,41,Isabahlia Community Garden,DPR,MULTIPOLYGON (((-73.91620033574875 40.66588236...,9:00 a.m. to 12:00 p.m.,...,73,25,Active,11212,3003537000.0,BK81 /,902.0,40.665838,-73.916236,Dumont & Sutter Avenues


In [31]:
#Let's sort the columns into a more logical order.
#We'll prioritize unique information, like name, address, and coordinates first.
#Status will also be prioritized.

#More categorical tags, like congressional districts, can be moved after them.
#We'll move open hours to the very back...

gtgarden = gtgarden[['parksid',
                    'gardenname',
                    'status',
                    'address',
                    'lat',
                    'lon',
                    'BBL',
                    'borough',
                    'crossStreets',
                    'zipcode',
                    'openhrsf',
                    'openhrsm',
                    'openhrstu',
                    'openhrsw',
                    'openhrsth',
                    'openhrsf',
                    'openhrssa',
                    'openhrssu',
                    'CensusTract',
                    'assemblydist',
                    'communityboard',
                    'NTA',
                    'congressionaldist',
                    'coundist',
                    'statesenatedist',
                    'policeprecinct',
                    'juris',
                    'multipolygon']]

gtgarden.sample(5)

Unnamed: 0,parksid,gardenname,status,address,lat,lon,BBL,borough,crossStreets,zipcode,...,CensusTract,assemblydist,communityboard,NTA,congressionaldist,coundist,statesenatedist,policeprecinct,juris,multipolygon
216,MGT039,Maggie's Garden,Active,564 West 149th Street,40.828532,-73.948126,1020800000.0,M,Amsterdam & Broadway,10031,...,233.0,71,109,MN04 /,13,7,30,30,NYRP,MULTIPOLYGON (((-73.94816375581138 40.82838622...
458,B572-GT001,Hattie Carthan Herban Farm,Active,49 Van Buren St,40.690255,-73.943578,3017910000.0,B,Tompkins Ave and Throop Ave,11221,...,263.0,56,303,BK75 /,8,36,25,79,DPR,MULTIPOLYGON (((-73.9436401176154 40.690390582...
48,XGT006,Fordham Bedford Lot-Busters,Active,2599 Bainbridge Avenue,40.864227,-73.892627,2032930000.0,X,At W. 193rd Street,10458,...,399.01,78,207,BX05 /,13,15,33,52,BLT,MULTIPOLYGON (((-73.89282045275597 40.86419331...
229,MGT024,Lower East Side People Care,Active,25 Rutgers Street,40.713006,-73.989824,1002710000.0,M,Henry & Madison,10002,...,6.0,65,103,MN28 /,10,1,27,7,MLT,MULTIPOLYGON (((-73.98997663563858 40.71303269...
198,Q045-GT001,Long Island City Community Garden,Active,5-30 49th Avenue,40.743432,-73.955078,4000320000.0,Q,Vernon Blvd. and 5th St.,11101,...,7.01,37,402,QN31 /,7,26,59,108,DPR,MULTIPOLYGON (((-73.95508212323573 40.74357343...


In [32]:
#One last thing! It's really weird that the boroughs are acronymized in this way...
#The good thing is that the creators of this dataset made every borough have a unique one-character symbol.
#We'll replace them with the function below:

def boroughsort(chara):
    if chara == 'M':
        return 'Manhattan'
    elif chara == 'X':
        return 'Bronx'
    elif chara == 'B':
        return 'Brooklyn'
    elif chara == 'Q':
        return 'Queens'
    else:
        return 'Staten Island'
    
gtgarden['borough'] = gtgarden['borough'].apply(boroughsort)
gtgarden['borough'].sample(5)

466     Brooklyn
32         Bronx
232    Manhattan
48         Bronx
301    Manhattan
Name: borough, dtype: object

In [33]:
#This seems good enough to go!
#Let's output our new, cleaned, upgraded dataset.

gtgarden_postclean = gtgarden

In [34]:
#Write cleaned dataframe to CSV
gtgarden_postclean.to_csv("greenthumb_garden_clean.csv", sep=',', encoding='utf-8', index=False)