Source: https://catalog.data.gov/dataset/2015-street-tree-census-tree-data
<br>
Source: https://data.cityofnewyork.us/Environment/2015-Street-Tree-Census-Tree-Data/uvpi-gqnh

The 2015 Tree Census dataset for New York City is collected by volunteers and staff members from NYC Parks & Rec. <br>
Here, we clean and standardize the data.

In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
import matplotlib.pyplot as plt

In [2]:
# read the raw 2015 street tree census export
data = pd.read_csv('2015-tree-census.csv')

# work on an independent copy so that `data` keeps the raw, unmodified frame
# (a plain `tree = data` would only bind a second name to the same object)
tree = data.copy()

In [3]:
# summarize the frame: number of entries, column names, non-null counts and dtypes
tree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683788 entries, 0 to 683787
Data columns (total 45 columns):
tree_id             683788 non-null int64
block_id            683788 non-null int64
created_at          683788 non-null object
tree_dbh            683788 non-null int64
stump_diam          683788 non-null int64
curb_loc            683788 non-null object
status              683788 non-null object
health              652172 non-null object
spc_latin           652169 non-null object
spc_common          652169 non-null object
steward             652173 non-null object
guards              652172 non-null object
sidewalk            652172 non-null object
user_type           683788 non-null object
problems            652124 non-null object
root_stone          683788 non-null object
root_grate          683788 non-null object
root_other          683788 non-null object
trunk_wire          683788 non-null object
trnk_light          683788 non-null object
trnk_other          683788 non-nu

In [4]:
# (number of rows, number of columns)
tree.shape

(683788, 45)

In [5]:
# preview the first five rows of the frame

tree.head(n=5)

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,boro_ct,state,latitude,longitude,x_sp,y_sp,council district,census tract,bin,bbl
0,180683,348711,08/27/2015,3,0,OnCurb,Alive,Fair,Acer rubrum,red maple,...,4073900,New York,40.723092,-73.844215,1027431.148,202756.7687,29.0,739.0,4052307.0,4022210000.0
1,200540,315986,09/03/2015,21,0,OnCurb,Alive,Fair,Quercus palustris,pin oak,...,4097300,New York,40.794111,-73.818679,1034455.701,228644.8374,19.0,973.0,4101931.0,4044750000.0
2,204026,218365,09/05/2015,3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.717581,-73.936608,1001822.831,200716.8913,34.0,449.0,3338310.0,3028870000.0
3,204337,217969,09/05/2015,10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.713537,-73.934456,1002420.358,199244.2531,34.0,449.0,3338342.0,3029250000.0
4,189565,223043,08/30/2015,21,0,OnCurb,Alive,Good,Tilia americana,American linden,...,3016500,New York,40.666778,-73.975979,990913.775,182202.426,39.0,165.0,3025654.0,3010850000.0


In [6]:
# preview the last three rows of the frame

tree.tail(n=3)

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,boro_ct,state,latitude,longitude,x_sp,y_sp,council district,census tract,bin,bbl
683785,166161,401670,08/22/2015,12,0,OnCurb,Alive,Good,Acer rubrum,red maple,...,5020100,New York,40.620762,-74.136517,946351.4,165466.0763,50.0,201.0,5011657.0,5004080000.0
683786,184028,504204,08/29/2015,9,0,OnCurb,Alive,Good,Acer rubrum,red maple,...,2023502,New York,40.850828,-73.903115,1011054.0,249271.9507,15.0,23502.0,2007757.0,2028120000.0
683787,200607,306527,09/03/2015,23,0,OnCurb,Alive,Fair,Acer rubrum,red maple,...,4134100,New York,40.732165,-73.787526,1043136.0,206095.5383,24.0,1341.0,4153657.0,4071360000.0


In [7]:
# display the column labels of the frame

tree.columns

Index(['tree_id', 'block_id', 'created_at', 'tree_dbh', 'stump_diam',
       'curb_loc', 'status', 'health', 'spc_latin', 'spc_common', 'steward',
       'guards', 'sidewalk', 'user_type', 'problems', 'root_stone',
       'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other',
       'brch_light', 'brch_shoe', 'brch_other', 'address', 'postcode',
       'zip_city', 'community board', 'borocode', 'borough', 'cncldist',
       'st_assem', 'st_senate', 'nta', 'nta_name', 'boro_ct', 'state',
       'latitude', 'longitude', 'x_sp', 'y_sp', 'council district',
       'census tract', 'bin', 'bbl'],
      dtype='object')

In [8]:
# remove rows that are exact duplicates across every column, keeping the first occurrence

tree = tree.drop_duplicates(keep='first')

# the row count is unchanged, so no duplicate rows were found
tree.shape

(683788, 45)

In [9]:
# count the missing values in each column

tree.isna().sum()

tree_id                 0
block_id                0
created_at              0
tree_dbh                0
stump_diam              0
curb_loc                0
status                  0
health              31616
spc_latin           31619
spc_common          31619
steward             31615
guards              31616
sidewalk            31616
user_type               0
problems            31664
root_stone              0
root_grate              0
root_other              0
trunk_wire              0
trnk_light              0
trnk_other              0
brch_light              0
brch_shoe               0
brch_other              0
address                 0
postcode                0
zip_city                0
community board         0
borocode                0
borough                 0
cncldist                0
st_assem                0
st_senate               0
nta                     0
nta_name                0
boro_ct                 0
state                   0
latitude                0
longitude   

In [10]:
# columns to remove because they are inconsistent or not useful for analysis:
#   state             - all trees are in NY
#   borocode          - borough is already listed
#   council district, census tract, bin, bbl - not used for analysis
#   x_sp, y_sp        - NOTE(review): no reason is recorded for dropping these; confirm

drop_list = [
    'state',
    'borocode',
    'council district',
    'census tract',
    'bin',
    'bbl',
    'x_sp',
    'y_sp',
]

In [11]:
# remove the columns named in drop_list; reassign instead of mutating in place,
# matching how rows are dropped elsewhere in this notebook
tree = tree.drop(columns=drop_list)

In [12]:
# count the missing values in each of the remaining columns
tree.isna().sum()

tree_id                0
block_id               0
created_at             0
tree_dbh               0
stump_diam             0
curb_loc               0
status                 0
health             31616
spc_latin          31619
spc_common         31619
steward            31615
guards             31616
sidewalk           31616
user_type              0
problems           31664
root_stone             0
root_grate             0
root_other             0
trunk_wire             0
trnk_light             0
trnk_other             0
brch_light             0
brch_shoe              0
brch_other             0
address                0
postcode               0
zip_city               0
community board        0
borough                0
cncldist               0
st_assem               0
st_senate              0
nta                    0
nta_name               0
boro_ct                0
latitude               0
longitude              0
dtype: int64

In [13]:
# since there are 683788 rows and (after dropping the unused columns) 37 columns
# of tree data available, it is okay to remove rows that are missing values

# drop rows that contain missing values
# NOTE: this step is left disabled; the subset of columns was never filled in

# tree.dropna(subset=[''], inplace=True)

# count the remaining missing values in each column

# tree.isnull().sum()

In [14]:
# look at the data type of each column, since the dtype determines
# which operations can be performed on that column

tree.dtypes

tree_id              int64
block_id             int64
created_at          object
tree_dbh             int64
stump_diam           int64
curb_loc            object
status              object
health              object
spc_latin           object
spc_common          object
steward             object
guards              object
sidewalk            object
user_type           object
problems            object
root_stone          object
root_grate          object
root_other          object
trunk_wire          object
trnk_light          object
trnk_other          object
brch_light          object
brch_shoe           object
brch_other          object
address             object
postcode             int64
zip_city            object
community board      int64
borough             object
cncldist             int64
st_assem             int64
st_senate            int64
nta                 object
nta_name            object
boro_ct              int64
latitude           float64
longitude          float64
d

In [15]:
# tally each status value; dropna=False so that missing entries would be counted too
tree['status'].value_counts(dropna=False)

Alive    652173
Stump     17654
Dead      13961
Name: status, dtype: int64

Observe that Stump and Dead together total 31615, which is close to the number of rows with missing data for <br>
health (31616) <br>
spc_latin (31619) <br>
spc_common (31619) <br>
steward (31615) <br>
guards (31616) <br>
sidewalk (31616)<br>
and problems (31664).
<br>
If the tree is a stump or dead, then there is no way to discern the health of the tree. There is also no way to know the Latin/Scientific or common name of the tree (also irrelevant). Likewise, stewards are not indicated for dead trees nor are there guards and sidewalk flags.
<br>
Since the numbers missing for the above columns are roughly the same, the next step is to investigate the small discrepancies.

In [16]:
# tally each health value; dropna=False keeps the NaN bucket visible
tree['health'].value_counts(dropna=False)

Good    528850
Fair     96504
NaN      31616
Poor     26818
Name: health, dtype: int64

Since 31616 rows lack 'health' data and 31615 rows are classified as Stump or Dead in terms of 'status', there may be a row that does not contain 'health' data and is marked as Alive.
<br >
<br >
Let's replace the health column rows that lack data with Dead since a tree that is a stump is probably deceased.
<br >
Note: this is only for rows that have a status labeled as Stump or Dead (since status is not missing any values).

In [17]:
# filter rows that are marked Alive but have missing health (NaN)

miss_health = tree.loc[tree['health'].isna() & tree['status'].eq('Alive')]
miss_health

# this is the one row that is Alive yet has no health value
# it can be removed since the health is unknown, despite the tree being alive

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude
32864,245041,413012,09/21/2015,16,0,OnCurb,Alive,,Fraxinus pennsylvanica,green ash,...,503,Staten Island,51,62,24,SI32,Rossville-Woodrow,5020801,40.548597,-74.216412


In [18]:
# removing the row

# Select the row by its condition (Alive with missing health) rather than by
# position: tree.index[32864] only points at the intended row while the index
# is an untouched RangeIndex, and re-running the cell would silently drop
# whichever row had moved into that position.
alive_without_health = tree.health.isnull() & (tree.status == 'Alive')
tree = tree.drop(index=tree.index[alive_without_health])

tree[tree['tree_id'] == 245041] # confirmed

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude


The City of New York mentions that dead trees and stumps do not have data recorded for the following fields:
<br >
steward (31615)
<br >
guards (31616)
<br >
sidewalk (31616). 
<br >
Let's investigate and locate any rows that lack data or may be incorrectly labeled.

In [19]:
# rows where both guards and sidewalk are missing
guards_sidewalk = tree.loc[tree['guards'].isna() & tree['sidewalk'].isna()]
guards_sidewalk # 31615 rows × 37 columns

# the row count matches the number of missing values for 'steward'

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude
37,211205,302080,09/09/2015,0,16,OnCurb,Stump,,,,...,401,Queens,22,36,12,QN72,Steinway,4009500,40.774993,-73.922037
56,187807,506266,08/29/2015,0,0,OnCurb,Dead,,,,...,207,Bronx,11,80,36,BX43,Norwood,2042500,40.871927,-73.882349
196,208322,222858,09/07/2015,10,0,OnCurb,Dead,,,,...,304,Brooklyn,37,53,18,BK77,Bushwick North,3044500,40.705742,-73.917849
209,209058,415850,09/08/2015,6,0,OnCurb,Dead,,,,...,503,Staten Island,50,64,24,SI25,Oakwood-Oakwood Beach,5012806,40.559061,-74.106038
265,210544,216081,09/08/2015,6,0,OnCurb,Dead,,,,...,301,Brooklyn,33,50,26,BK76,Greenpoint,3057100,40.726271,-73.947287
284,188609,107627,08/30/2015,9,0,OnCurb,Dead,,,,...,108,Manhattan,5,76,28,MN32,Yorkville,1013600,40.771992,-73.951657
429,189412,108374,08/30/2015,7,0,OnCurb,Dead,,,,...,109,Manhattan,7,70,30,MN09,Morningside Heights,1020701,40.807979,-73.959649
437,189526,223109,08/30/2015,4,0,OnCurb,Dead,,,,...,306,Brooklyn,39,44,21,BK37,Park Slope-Gowanus,3015500,40.669389,-73.979367
590,179748,348463,08/27/2015,20,0,OnCurb,Dead,,,,...,406,Queens,29,27,14,QN17,Forest Hills,4075701,40.721514,-73.836307
639,192569,302968,08/31/2015,2,0,OnCurb,Dead,,,,...,402,Queens,26,37,12,QN31,Hunters Point-Sunnyside-West Maspeth,4018501,40.738044,-73.921552


In [20]:
# rows where guards is missing but sidewalk is recorded

guards_null = tree.loc[tree['guards'].isna() & tree['sidewalk'].notna()]
guards_null

# the matching tree is alive, in poor health, and has no species identified
# this row will be deleted

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude
427537,630814,323764,07/18/2016,11,0,OnCurb,Alive,Poor,,,...,411,Queens,19,26,11,QN45,Douglas Manor-Douglaston-Little Neck,4148300,40.771945,-73.750414


In [21]:
# remove the row with index label 427537

tree = tree.drop(labels=[427537], axis='index')

tree.loc[tree['tree_id'] == 630814] # an empty result confirms the row is gone

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude


In [22]:
# rows where guards is recorded but sidewalk is missing

sidewalk_null = tree.loc[tree['guards'].notna() & tree['sidewalk'].isna()]
sidewalk_null

# the matching tree is alive, in good health, and has its species identified
# this row will be deleted

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude
346236,540677,202468,12/29/2015,7,0,OffsetFromCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,305,Brooklyn,37,54,19,QN56,Ozone Park,3118800,40.67909,-73.864029


In [23]:
# remove the row with index label 346236

tree = tree.drop(labels=[346236], axis='index')

tree.loc[tree['tree_id'] == 540677] # an empty result confirms the row is gone

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude


It looks like there was one row without 'guards' data and another row without 'sidewalk' data. With those two rows removed, guards and sidewalk each have 31,615 rows that lack data, the same as steward.

In [24]:
# look for rows that have steward data but are missing sidewalk data

steward_sidewalk_null = tree.loc[tree['steward'].notna() & tree['sidewalk'].isna()]

steward_sidewalk_null # none

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude


In [25]:
# look for rows that have steward data but are missing guards data

guards_guards_null = tree.loc[tree['steward'].notna() & tree['guards'].isna()]
guards_guards_null # none

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,community board,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,latitude,longitude


In [26]:
# look for rows that are missing steward data but still have sidewalk or guards data

steward_null_sidewalk = tree[(tree.steward.isnull()) & (~tree.sidewalk.isnull())]
steward_null_guards = tree[(tree.steward.isnull()) & (~tree.guards.isnull())]

# show both row counts so the "none" result is visible in the cell output
len(steward_null_sidewalk), len(steward_null_guards) # expect (0, 0)

In [27]:
# recount the missing values per column after removing the inconsistent rows
tree.isna().sum()

tree_id                0
block_id               0
created_at             0
tree_dbh               0
stump_diam             0
curb_loc               0
status                 0
health             31615
spc_latin          31618
spc_common         31618
steward            31615
guards             31615
sidewalk           31615
user_type              0
problems           31663
root_stone             0
root_grate             0
root_other             0
trunk_wire             0
trnk_light             0
trnk_other             0
brch_light             0
brch_shoe              0
brch_other             0
address                0
postcode               0
zip_city               0
community board        0
borough                0
cncldist               0
st_assem               0
st_senate              0
nta                    0
nta_name               0
boro_ct                0
latitude               0
longitude              0
dtype: int64

In [28]:
# (number of rows, number of columns) after the cleaning steps so far
tree.shape

(683785, 37)