In [70]:
#Basic dataframe & numerical libraries
import pandas as pd 

#Importing visualization libraries for exploratory analysis
import matplotlib.pyplot as plt 
import seaborn as sns 

#Importing to standardize formatting (geolocation)
import geopy

#Importing geocoder classes
from geopy.geocoders import GoogleV3

#This library (safetyfile) contains a Google Maps API key.
#It is excluded from the uploaded dataset in the interest of informational security.
import safetyfile
from safetyfile import googleapi

print(type(googleapi))

<class 'str'>


In [71]:
#Reading original CSV to dataframe
gtgarden = pd.read_csv('GreenThumb_Garden_Info_20240916.csv')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [72]:
#Looking at a limited sample of entries
gtgarden.sample(5)

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets
286,66,48 East 1st Street,M,103,10,2,First Street Garden,DPR,MULTIPOLYGON (((-73.98949720033305 40.72412798...,CLOSED,...,9,27,Active,10003,1004430000.0,MN22 /,36.02,40.724013,-73.989606,1st & 2nd Avenues
61,79,377 East 160th Street,X,203,15,17,Melrose New Generation Community Garden,DPR,MULTIPOLYGON (((-73.9151008011141 40.823940128...,1:00 p.m. to 4:00 p.m.,...,42,32,Active,10451,2024070000.0,BX35 /,141.0,40.82403,-73.914906,Melrose and Courtland Ave.
346,74,84 Avenue B,M,103,10,2,6th Street & Avenue B Garden,DPR,MULTIPOLYGON (((-73.98239309332045 40.72418156...,TBA Evening Events,...,9,27,Active,10009,1004010000.0,MN22 /,32.0,40.724308,-73.982041,At E. 6th Street
377,55,2176 FULTON STREET,B,316,8,37,Oak Grove Pentecostal Holiness Church,PRI,MULTIPOLYGON (((-73.90933386833268 40.67809240...,,...,73,25,Not GreenThumb,11233,3015520000.0,BK79 /,369.0,40.677959,-73.909494,
397,60,673 Sheffield Avenue,B,305,8,42,Sheffield Garden,BQLT,MULTIPOLYGON (((-73.89281344456302 40.66074906...,,...,75,19,Active,11207,3042980000.0,BK85 /,1128.0,40.660675,-73.892962,btw New Lots Ave. and Hegeman Ave.


In [73]:
#It looks like Pandas incorrectly read in ZIP Codes as floats...
#These function below should fix it.
def repairzip(textobj):
      return str(textobj).replace(',','')

In [74]:
#Let's put into action!
gtgarden['zipcode'] = gtgarden['zipcode'].apply(repairzip)
print(gtgarden['zipcode'].sample(5))

394    11205
180    11435
203    10280
570    11207
129    10453
Name: zipcode, dtype: object


In [75]:
#Finding coordinates problem entries, slicing into separate DataFrame
#We can use 'lat' as a proxy for both latitude and longitude: when one is absent, the other is absent

slice = gtgarden[pd.isnull(gtgarden['lat'])].copy()
slice.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59 entries, 1 to 615
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       59 non-null     int64  
 1   address            59 non-null     object 
 2   borough            59 non-null     object 
 3   communityboard     59 non-null     int64  
 4   congressionaldist  59 non-null     int64  
 5   coundist           59 non-null     int64  
 6   gardenname         59 non-null     object 
 7   juris              59 non-null     object 
 8   multipolygon       59 non-null     object 
 9   openhrsf           25 non-null     object 
 10  openhrsm           24 non-null     object 
 11  openhrssa          27 non-null     object 
 12  openhrssu          23 non-null     object 
 13  openhrsth          25 non-null     object 
 14  openhrstu          24 non-null     object 
 15  openhrsw           25 non-null     object 
 16  parksid            59 non-null  

In [76]:
#Creating GoogleV3 class, searches using Google Map API to identify submitted addresses
#The aforementioned API key is used here.

geolocator = GoogleV3(api_key=googleapi)

In [77]:
#Using .apply() to basically create a Google Maps query for the address
#Some addresses lack building numbers: adding in the garden name AND ZIP Code gets around this problem
slice['pseudoaddress'] = slice.apply(lambda row: f'{row['gardenname']} {row['address']} {row['zipcode']}', axis = 1)

In [78]:
#Extracting geocodes relevant to each item...
slice['geocode'] = slice['pseudoaddress'].apply(lambda x: geolocator.geocode(x))

#This returns a geocode inherently incorporating both latitude and longitude
#On the off-chance a location is not on Google Maps, however, it might return 'None' instead

In [79]:
#And applying back as necessary...
def gc_lat(geocode):
    try:
        return geocode.latitude
    except AttributeError as err:
        return None
    
def gc_lon(geocode):
    try:
        return geocode.longitude
    except AttributeError as err:
        return None

slice['lat'] = slice['geocode'].apply(gc_lat).astype('float')
slice['lon'] = slice['geocode'].apply(gc_lon).astype('float')

In [80]:
#It turns out that there's a singular row in which the Google API was unable to determine its location...
#At index 130 is the "South Beach community garden NYCHA" at 100 Kramer street 10306.
#It's entirely unindexed by Google Maps. We do still have a standard address.
#We can clean this one up manually.

slice.loc[130, 'lat'] = gc_lat( geolocator.geocode(slice.loc[130, 'address']) )
slice.loc[130, 'lon'] = gc_lon( geolocator.geocode(slice.loc[130, 'address']) )

#Some other addresses only state the street name in ALL CAPS rather than the address.
#Google Maps, based on the provided information, is still able to approximate these locations.

In [81]:
#Let's drop the added column "pseudoaddress" now that we no longer need it...
slice = slice.drop(columns=['geocode'])

In [82]:
#With that done, let's now join this content back into the main dataframe.
gtgarden.update(slice, overwrite=False, join='left', errors='ignore')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [83]:
#Just to double check... no null values!
gtgarden[gtgarden['lon'].isnull()]

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets


In [84]:
#But we're seeing a problem with 'CensusTract'.
gtgarden[gtgarden['CensusTract'].isnull()].sample(5)

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets
203,61,"200-218 Albany St, New York, NY 10280",M,101,10,1,Liberty Community Gardens (LCG),PRI,MULTIPOLYGON (((-74.01544203984173 40.70976340...,,...,1,27,Not GreenThumb,10280,,/,,40.70964,-74.015419,
368,51,"219 34th St. Brooklyn, NY 11232",B,307,10,38,Los Colibries Community Garden,PRI,MULTIPOLYGON (((-74.00166565366716 40.65492651...,,...,72,26,Not GreenThumb,11232,3006850000.0,/,,40.654731,-74.001848,
9,84,515 Jackson Avenue Bronx NY 10455,X,201,15,8,El Jardín de Los Amigos de Moore - Moore House...,NYCHA,MULTIPOLYGON (((-73.90952703795728 40.81233841...,9:00 A.M - 12:00 P.M,...,40,29,Active,10455,2025570000.0,/,,40.812506,-73.909657,Jackson Avenue & East 149th street
12,87,1960 PUGSLEY AVENUE,X,209,14,18,Adlai E. Stevenson High School-Ecological Garden,DOE,MULTIPOLYGON (((-73.85482157523771 40.82039347...,,...,43,34,Not GreenThumb,10473,2036040000.0,/,,40.821514,-73.855414,Lafayette & Stickball Avenue
8,82,2761 Sampson Avenue Bronx NY 10465,X,210,14,13,Love Community Garden - Throggs Neck houses (N...,NYCHA,MULTIPOLYGON (((-73.82293873721201 40.81895142...,,...,45,34,Not GreenThumb,10465,2055820000.0,/,,40.819272,-73.822838,Sampson Avenue & Swinton Avenue


In [85]:
#While geopy doesn't have native support for US Census Geocoder API...
#A small package called 'censusgeocode' does.

import censusgeocode as cg

In [86]:
#Let's make another slice.
slice = gtgarden[gtgarden['CensusTract'].isnull()].copy()
slice.sample(5)['CensusTract']

143   NaN
211   NaN
199   NaN
376   NaN
370   NaN
Name: CensusTract, dtype: float64

In [87]:
#defining a function that can be used with apply
def extractcensustract(row):
    inlat = row['lat']
    inlon = row['lon']
    resultobj = cg.coordinates(x=inlon, y=inlat, returntype='geographies')
    tract = resultobj['Census Tracts'][0]['TRACT']
    tract = float(tract[:4]+"."+tract[4:])
    return tract

#Census tracts can either be expressed as a 6 digit code or as a float:
    #That is, tract 57.02 can be written as 005702 and vice versa.
    #For the purposes of this cleaning, we're converting all tracts into floats.
    #Actually, this makes them easier to find: most public resources use their float identity.

#Example
extractcensustract(slice.sample(1))

289.0

In [88]:
#Applying the function to the slice
slice['CensusTract'] = slice.apply(extractcensustract, axis=1)
slice[['address','CensusTract','lat','lon']].sample(5)

Unnamed: 0,address,CensusTract,lat,lon
372,201 Myrtle Avenue Brooklyn NY 11201,15.01,40.693527,-73.98001
365,"736 E 8th St, Brooklyn, NY",482.0,40.632936,-73.969163
368,"219 34th St. Brooklyn, NY 11232",84.0,40.654731,-74.001848
148,"41-38 69th St, Flushing, NY 11377",265.02,40.743327,-73.89585
100,438-44 Clarmont Pkwy,169.0,40.838943,-73.90377


In [89]:
#Return again to the main dataframe!
gtgarden.update(slice, overwrite=False, join='left', errors='ignore')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [90]:
#We see that some results still lack crossStreets: that is, intersections.
slice = gtgarden[gtgarden['crossStreets'].isna()].copy()
print(slice.sample(5))

#Unfortunately, Google's API doesn't support returning intersections.
#In some cases, identifiying intersections might be inappropriate.
#Given that we're cleaning this dataset for later visualization, this column isn't essential for user use.

#For now, we'll fill these with the string value 'N/A'.
#These can be updated with new values from an updated version of the sheet.

gtgarden['crossStreets'] = gtgarden['crossStreets'].fillna('N/A')

#We can, however, address some shorthand which might not show up well in our ultimate visualization.
gtgarden['crossStreets'] = gtgarden['crossStreets'].replace(r'[Bb][Tt][Ww][Nn]?', r'Between', regex=True)

     assemblydist                              address borough  \
244            68                  326 Pleasant Avenue       M   
239            71                499 West 150th Street       M   
360            53  50 Humbold street Brooklyn NY 11206       B   
373            53                     131 Moore street       B   
256            70                227 West 115th Street       M   

     communityboard  congressionaldist  coundist  \
244             111                 13         8   
239             109                 13         7   
360             301                  7        34   
373             301                  7        34   
256             110                 13         9   

                                    gardenname  juris  \
244                     Los Amigos Garden NYRP   NYRP   
239  Lucille McClarey Wicked Friendship Garden   NYRP   
360   Bushwick Garden- Bushwick houses (NYCHA)  NYCHA   
373        Hylan Garden - Hylan Houses (NYCHA)  NYCHA   
256  

In [91]:
#It still looks like we have some blank values here and there...
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           369 non-null    object 
 10  openhrsm           343 non-null    object 
 11  openhrssa          412 non-null    object 
 12  openhrssu          368 non-null    object 
 13  openhrsth          359 non-null    object 
 14  openhrstu          362 non-null    object 
 15  openhrsw           371 non-null    object 
 16  parksid            624 non

In [92]:
#Column indices [9,15] are all describing open hours.
#A bit confusingly, they go in the order of: [Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday].
#We can conver this to [Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday] at a later point.

slice = gtgarden.iloc[:,9:16].copy()
slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
123,1:00 p.m. to 2:00 p.m.,10:00 a.m. - 11:00 a.m.,9:30 a.m. - 11:30 a.m.,10:00 a.m. - 1:00 p.m.,2:00 p.m. to 3:00 p.m.,9:30 a.m. - 10:30 a.m.,2:00 p.m. to 3:00 p.m.
522,CLOSED,8:30 a.m. - 10:30 a.m.,11:00 a.m. - 2:00 p.m.,11:00 a.m. - 2:00 p.m.,10:00 a.m. - 12:00 p.m.,9:00 a.m. - 11:00 a.m.,9:00 a.m. - 11:00 a.m.
208,,,,,,,
122,12:00 p.m. - 3:00 p.m.,12:00 p.m. - 3:00 p.m.,12:00 p.m. - 3:00 p.m.,12:00 p.m. - 3:00 p.m.,12:00 p.m. - 3:00 p.m.,12:00 p.m. - 3:00 p.m.,12:00 p.m. - 3:00 p.m.
401,,,,,,,
500,CLOSED,CLOSED,10:00 a.m. - 7:00 p.m.,10:00 a.m. - 7:00 p.m.,CLOSED,CLOSED,10:00 a.m. - 7:00 p.m.
234,,,,,,,
264,9:00 a.m. - 7:00 p.m.,9:00 a.m. - 7:00 p.m.,CLOSED,10:00 a.m. - 5:00 p.m.,9:00 a.m. - 7:00 p.m.,9:00 a.m. - 7:00 p.m.,9:00 a.m. - 7:00 p.m.
209,,,,,,,
114,10:00 a.m. - 6:00 p.m.,12:00 p.m. - 5:00 p.m.,8:00 a.m. - 2:00 p.m.,8:00 a.m. - 2:00 p.m.,10:00 a.m. - 6:00 p.m.,12:00 p.m. - 5:00 p.m.,10:00 a.m. - 3:00 p.m.


In [93]:
#There seem to be some tiny errors...
slice[slice=='close'].count()

#For example, instances of having written "close" or "Close" or even "Closed" as opposed to standard "CLOSED".
#This is an easy fix.

openhrsf     0
openhrsm     0
openhrssa    1
openhrssu    2
openhrsth    0
openhrstu    1
openhrsw     2
dtype: int64

In [94]:
#This regex searches for any variation on "CLOSED", D-optional, and replace them with "CLOSED".
slice = slice.replace(r'[Cc][Ll][Oo][Ss][Ee][dD]?','CLOSED', regex=True)

In [95]:
#There's still tiny inconsistencies like how some entries include "a" or "p" in place of "a.m." or "p.m."
#We can also fix that easily.

slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
241,,,,,,,
10,,,,,,,
278,9:00 a.m. - 5:00 p.m.,9:00 a.m. - 5:00 p.m.,9:00 a.m. - 5:00 p.m.,9:00 a.m. - 5:00 p.m.,9:00 a.m. - 5:00 p.m.,9:00 a.m. - 5:00 p.m.,9:00 a.m. - 5:00 p.m.
230,,,,,,,
435,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.,8:00 a.m. - 8:00 p.m.
523,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m.,1:00 p.m. - 4:00 p.m
530,4:00 p.m.- 7:00 p.m.,4:00 p.m.- 7:00 p.m.,9:00 a.m.- 2:00 p.m.,CLOSED,4:00 p.m.- 7:00 p.m.,4:00 p.m.- 7:00 p.m.,4:00 p.m.- 7:00 p.m.
265,4:00 p.m. - 8:00 p.m.,4:00 p.m. - 7:00 p.m.,12:00 a.m. - 5:00 p.m.,12:00 a.m. - 5:00 p.m.,4:00 p.m. - 7:00 p.m.,4:00 p.m. - 7:00 p.m.,4:00 p.m. - 7:00 p.m.
120,12:00 p.m. - 4:00 p.m.,,12:00 p.m. - 4:00 p.m.,,12:00 p.m. - 4:00 p.m.,12:00 p.m. - 4:00 p.m.,12:00 p.m. - 4:00 p.m.
342,9:00 a.m. - 3:00 p.m.,9:00 a.m. - 3:00 p.m.,9:00 a.m. - 3:00 p.m.,9:00 a.m. - 3:00 p.m.,9:00 a.m. - 3:00 p.m.,9:00 a.m. - 3:00 p.m.,9:00 a.m. - 3:00 p.m.


In [96]:
#Correcting single character formatting
slice = slice.replace(r'(\d*:?\d*)([Aa])(\s)','\\1 a.m.\\3',regex=True)
slice = slice.replace(r'(\d*:?\d*)([Aa])(\s?$)','\\1 a.m.\\3',regex=True)
slice = slice.replace(r'(\d*:?\d*)([Pp])(\s)','\\1 p.m.\\3',regex=True)
slice = slice.replace(r'(\d*:?\d*)([Pp])(\s?$)','\\1 p.m.\\3',regex=True)

#Eliminating inconsistent spacing and stray numerals
slice = slice.replace(r'([1-9])(:)([Pp])','\\1:00 \\3',regex=True)
slice = slice.replace(r'([1-9])(:)([Aa])','\\1:00 \\3',regex=True)
slice = slice.replace(r'^([1-9]?[1-9])(:)?\s*([Pp])','\\1:00 \\3',regex=True)
slice = slice.replace(r'^([1-9]?[1-9])(:)?\s*([Aa])','\\1:00 \\3',regex=True)

#General consistency
slice = slice.replace(r'[Aa].?[Mm].?','a.m.', regex=True)
slice = slice.replace(r'[Pp].?[Mm].?','p.m.', regex=True)
slice = slice.replace(r'-','to', regex=True)
slice = slice.replace(r'Noon','12:00 p.m.', regex=True)
slice = slice.replace(r'.-', ' -', regex=True)
slice = slice.replace(r'\s?(:)\s?',':',regex=True)
slice = slice.replace(r'(\w)(to)(\w)',r'\1 to \3', regex=True)


#Dealing with lists
slice = slice.replace(r'\s?(,|&|;)(\s*)(\d)',r';\n\3', regex=True)


slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
86,11:00 a.m.to 3:00 p.m.,11:00 a.m.to 3:00 p.m.,11:00 a.m.to 3:00 p.m.,,11:00 a.m.to 3:00 p.m.,11:00 a.m.to 3:00 p.m.,11:00 a.m.to 3:00 p.m.
382,,,,,,,
137,,,,,,,
571,12:00 p.m. to 3:00 p.m.,10:00 a.m. to 1:00 p.m.,3:00 p.m. to 4:30 p.m.,3:00 p.m. to 5:30 p.m.,10:00 a.m. to 1:00 p.m.,12:00 p.m. to 3:00 p.m.,3:00 p.m. to 6:00 p.m.
6,9:00 a.m. to 11:30 a.m.,9:00 a.m. to 11:30 a.m.,9:00 a.m. to 11:30 a.m.,CLOSED,9:00 a.m. to 11:30 a.m.,9:00 a.m. to 11:30 a.m.,9:00 a.m. to 11:30 a.m.
132,,,,,,,
286,CLOSED,CLOSED,11:00 a.m. to 4:00 p.m.,11:00 a.m. to 4:00 p.m.,CLOSED,CLOSED,CLOSED
107,10:00 a.m. to 6:00 p.m.,CLOSED,10:00 a.m. to 6:00 p.m.,10:00 a.m. to 6:00 p.m.,10:00 a.m. to 6:00 p.m.,CLOSED,10:00 a.m. to 6:00 p.m.
517,CLOSED,CLOSED,9:00 a.m. to 7:00 p.m.,9:00 a.m. to 7:00 p.m.,CLOSED,CLOSED,CLOSED
247,,,,,,,


In [97]:
#Unlike latitude or longitude, we can't extrapolate other information to fill these times.
#It's probably not appropriate to assume that they're closed during unlisted times either...

#Pending further updates on the original sheet, we can fill these with a 'N/A' label.

#There's data-original oddities like 'Sunset to Sundown' at '955 Columbus Avenue'.
#It might best to leave these alone: again, there's not other information to extrapolate from for proper corrections.

slice = slice.fillna('N/A')
slice.sample(15)

Unnamed: 0,openhrsf,openhrsm,openhrssa,openhrssu,openhrsth,openhrstu,openhrsw
438,,,,,,,
96,8:30 a.m. to 1:00 p.m.;\n2:00 p.m. to Sunset,8:30 a.m. to 1:00 p.m.;\n2:00 p.m. to Sunset,8:30 a.m. to 1:00 p.m.;\n2:00 p.m. to Sunset,CLOSED,8:30 a.m. to 1:00 p.m.;\n2:00 p.m. to Sunset,8:30 a.m. to 1:00 p.m.;\n2:00 p.m. to Sunset,8:30 a.m. to 1:00 p.m.;\n2:00 p.m. to Sunset
216,12:30 p.m. to 2:30 p.m.,12:00 p.m. to 4:00 p.m.,,2:00 p.m. to 4:00 p.m.,12:00 p.m. to 4:00 p.m.;\n6:00 p.m. to 8:00 p.m.,12:00 p.m. to 4:00 p.m.;\n6:00 p.m. to 8:00 p.m.,12:00 p.m. to 4:00 p.m.;\n6:00 p.m. to 8:00 p.m.
395,,,,,,,
73,12:00 p.m. to 6:00 p.m.,CLOSED,12:00 p.m. to 6:00 p.m.,12:00 p.m. to 6:00 p.m.,11:00 a.m. to 5:00 p.m.,11:00 a.m. to 5:00 p.m.,11:00 a.m. to 5:00 p.m.
199,,,,,,,
170,CLOSED,CLOSED,10:00 a.m. to 3:00 p.m.,10:00 a.m. to 3:00 p.m.,2:00 p.m. to 7:00 p.m.,10:00 a.m. to 3:00 p.m.,CLOSED
10,,,,,,,
586,12:00 p.m. to 4:00 p.m.,,2:00 p.m. to 6:00 p.m.,11:00 a.m. to 3:00 .p.m.,,,
204,11:00 a.m. 1:00 p.m.,11:00 a.m. 1:00 p.m.,11:00 a.m. 1:00 p.m.,CLOSED,11:00 a.m. 1:00 p.m.,11:00 a.m. 1:00 p.m.,11:00 a.m. 1:00 p.m.


In [98]:
#Back to the main dataframe.
gtgarden.update(slice, overwrite=True, join='left', errors='ignore')
gtgarden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   assemblydist       624 non-null    int64  
 1   address            624 non-null    object 
 2   borough            624 non-null    object 
 3   communityboard     624 non-null    int64  
 4   congressionaldist  624 non-null    int64  
 5   coundist           624 non-null    int64  
 6   gardenname         624 non-null    object 
 7   juris              624 non-null    object 
 8   multipolygon       624 non-null    object 
 9   openhrsf           624 non-null    object 
 10  openhrsm           624 non-null    object 
 11  openhrssa          624 non-null    object 
 12  openhrssu          624 non-null    object 
 13  openhrsth          624 non-null    object 
 14  openhrstu          624 non-null    object 
 15  openhrsw           624 non-null    object 
 16  parksid            624 non

In [99]:
#Nice! Now let's clear away non-necessities...
gtgarden = gtgarden.map(lambda x: x.strip() if isinstance(x, str) else x)
gtgarden.sample(5)

Unnamed: 0,assemblydist,address,borough,communityboard,congressionaldist,coundist,gardenname,juris,multipolygon,openhrsf,...,policeprecinct,statesenatedist,status,zipcode,BBL,NTA,CensusTract,lat,lon,crossStreets
78,77,949 Woodycrest Avenue,X,204,15,16,Woodycrest Community Garden,DPR,MULTIPOLYGON (((-73.92926159853674 40.83114340...,6:00 p.m. to 8:00 p.m.,...,44,29,Active,10452,2025110000.0,BX26 /,189.0,40.83127,-73.929398,At 162 Street
308,65,171 Stanton Street,M,103,10,1,Clinton Community Garden (LES),DPR,MULTIPOLYGON (((-73.98434445491532 40.72012974...,12:00 p.m. to 7:00 p.m.,...,7,27,Active,10002,1003440000.0,MN28 /,22.01,40.72001,-73.984459,At Clinton Street
87,77,110 East 176th Street,X,205,15,14,Green Patch on Walton,DPR,MULTIPOLYGON (((-73.9098651537377 40.847911991...,1:00 p.m. to 4:00 p.m.,...,46,33,Active,10453,2028260000.0,BX41 /,227.01,40.847916,-73.909331,At Walton Avenue
194,25,47-32 Colden Street,Q,407,6,20,Evergreen Community Garden,DPR,MULTIPOLYGON (((-73.82258060091881 40.74830265...,8:00 a.m. to 6:00 p.m.,...,109,16,Active,11355,4051560000.0,QN62 /,837.0,40.748464,-73.820845,Juniper and Colden
576,53,17 Ten Eyck Street,B,301,7,34,Ten Eyck Garden,DPR,MULTIPOLYGON (((-73.95016091966126 40.70993898...,9:00 a.m.to 11:00 a.m.;\n6:00 p.m.to 8:00 p.m.,...,90,18,Active,11206,3027910000.0,BK73 /,513.0,40.709804,-73.950097,Union & Lorimer


In [100]:
#Let's sort the columns into a more logical order.
#We'll prioritize unique information, like name, address, and coordinates first.
#Status will also be prioritized.

#More categorical tags, like congressional districts, can be moved after them.
#We'll move open hours to the very back...

gtgarden = gtgarden[['parksid',
                    'gardenname',
                    'status',
                    'address',
                    'lat',
                    'lon',
                    'BBL',
                    'borough',
                    'crossStreets',
                    'zipcode',
                    'openhrsm',
                    'openhrstu',
                    'openhrsw',
                    'openhrsth',
                    'openhrsf',
                    'openhrssa',
                    'openhrssu',
                    'CensusTract',
                    'assemblydist',
                    'communityboard',
                    'NTA',
                    'congressionaldist',
                    'coundist',
                    'statesenatedist',
                    'policeprecinct',
                    'juris',
                    'multipolygon']]

gtgarden.sample(5)

Unnamed: 0,parksid,gardenname,status,address,lat,lon,BBL,borough,crossStreets,zipcode,...,CensusTract,assemblydist,communityboard,NTA,congressionaldist,coundist,statesenatedist,policeprecinct,juris,multipolygon
397,BGT051,Sheffield Garden,Active,673 Sheffield Avenue,40.660675,-73.892962,3042980000.0,B,Between New Lots Ave. and Hegeman Ave.,11207,...,1128.0,60,305,BK85 /,8,42,19,75,BQLT,MULTIPOLYGON (((-73.89281344456302 40.66074906...
550,B470-GT001,Granite Street Community Garden,Active,30 Granite Street,40.68239,-73.907602,3034610000.0,B,Bushwick Ave & Broadway,11207,...,405.0,54,304,BK78 /,7,37,18,83,DPR,MULTIPOLYGON (((-73.9077750458321 40.682449217...
318,M326-GT001,Liz Christy Garden,Active,110 E Houston Street,40.724153,-73.991883,1004560000.0,M,The Bowery & 2nd Avenue,10003,...,36.02,66,103,MN22 /,10,2,27,9,DPR,MULTIPOLYGON (((-73.99154946616487 40.72406989...
391,BGT057,Westbrook Memorial Garden,Active,1233 Pacific Street,40.677979,-73.95156,3012000000.0,B,Bedford & Nostrand Avenues,11216,...,315.0,57,308,BK61 /,9,36,20,77,BQLT,MULTIPOLYGON (((-73.95145171965187 40.67810783...
94,X317-GT001,159th Street Community Garden,Active,379 East 159th Street,40.823333,-73.91503,2024060000.0,X,Courtland & Melrose Avenues,10451,...,141.0,79,203,BX35 /,15,17,32,42,DPR,MULTIPOLYGON (((-73.91480894628248 40.82340076...


In [101]:
#One last thing! It's really weird that the boroughs are acronymized in this way...
#The good thing is that the creators of this dataset made every borough have a unique one-character symbol.
#We'll replace them with the function below:

def boroughsort(chara):
    if chara == 'M':
        return 'Manhattan'
    elif chara == 'X':
        return 'Bronx'
    elif chara == 'B':
        return 'Brooklyn'
    elif chara == 'Q':
        return 'Queens'
    else:
        return 'Staten Island'
    
gtgarden['borough'] = gtgarden['borough'].apply(boroughsort)
gtgarden['borough'].sample(5)

420     Brooklyn
276    Manhattan
229    Manhattan
369     Brooklyn
193       Queens
Name: borough, dtype: object

In [102]:
#These are corrections to a few... small unique errors in the original dataset.
#For example, this garden in the Bronx being designated as Brooklyn.

print(gtgarden.loc[5,'gardenname'])
print(gtgarden.loc[5,'borough'])
gtgarden.loc[5, 'borough'] = 'Bronx'

#Easy fix.
#We've made this dataset usable, but it might take some more work than this to make it perfect.

People Garden- Patterson Houses (NYCHA)
Brooklyn


In [103]:
#Still, though. This seems good enough to go!
#Let's output our new, cleaned, upgraded dataset.

gtgarden_postclean = gtgarden

In [104]:
#Write cleaned dataframe to CSV!
gtgarden_postclean.to_csv("greenthumb_garden_clean.csv", sep=',', encoding='utf-8', index=False)