# Data Exploration

This notebook explores data prior to the 2015 tree census

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gp

%matplotlib inline
plt.style.use('fivethirtyeight')

In [21]:
#Data reading
trees95 = pd.read_csv('../data/1995_Street_Tree_Census.csv') 
trees05 = pd.read_csv('../data/2005_Street_Tree_Census.csv')
trees15 = pd.read_csv('../data/2015_Street_Tree_Census_-_Tree_Data.csv')

### Exploring '95 data

In [3]:
trees95.head()

Unnamed: 0,RecordId,Address,House_Number,Street,Zip_Original,CB_Original,Site,Species,Diameter,Condition,...,Latitude,CB_New,Zip_New,CensusTract_2010,CensusBlock_2010,NTA_2010,SegmentID,Spc_Common,Spc_Latin,Location
0,1,245 E 17 ST,245,E 17 ST,10003,106,Front,PLAC,8,Unknown,...,40.734551,106,10003,48,2000,MN21,33134,LONDON PLANETREE,PLATANUS ACERIFOLIA,"(40.734551, -73.984235)"
1,2,80 N MOORE ST,80,N MOORE ST,10013,101,Side,ACPL,7,Good,...,40.720159,101,10013,39,2001,MN24,31567,"MAPLE, NORWAY",ACER PLATANOIDES,"(40.720159, -74.010532)"
2,3,80 N MOORE ST,80,N MOORE ST,10013,101,Side,ACPL,6,Good,...,40.720159,101,10013,39,2001,MN24,31567,"MAPLE, NORWAY",ACER PLATANOIDES,"(40.720159, -74.010532)"
3,4,80 N MOORE ST,80,N MOORE ST,10013,101,Side,ACPL,7,Excellent,...,40.720159,101,10013,39,2001,MN24,31567,"MAPLE, NORWAY",ACER PLATANOIDES,"(40.720159, -74.010532)"
4,5,80 N MOORE ST,80,N MOORE ST,10013,101,Side,ACPL,6,Good,...,40.720159,101,10013,39,2001,MN24,31567,"MAPLE, NORWAY",ACER PLATANOIDES,"(40.720159, -74.010532)"


In [4]:
trees95.columns

Index([u'RecordId', u'Address', u'House_Number', u'Street', u'Zip_Original',
       u'CB_Original', u'Site', u'Species', u'Diameter', u'Condition',
       u'Wires', u'Sidewalk_Condition', u'Support_Structure', u'Borough', u'X',
       u'Y', u'Longitude', u'Latitude', u'CB_New', u'Zip_New',
       u'CensusTract_2010', u'CensusBlock_2010', u'NTA_2010', u'SegmentID',
       u'Spc_Common', u'Spc_Latin', u'Location'],
      dtype='object')

In [5]:
trees95.shape

(516989, 27)

In [6]:
pd.unique(trees95['Condition'])

array(['Unknown', 'Good', 'Excellent', 'Poor', 'Dead', 'Stump',
       'Planting Space', 'Shaft', 'Fair', 'Critical'], dtype=object)

In [12]:
print len(trees95[trees95['Condition'] == 'Fair'])
print len(trees95[trees95['Condition'] == 'Poor'])
print len(trees95[trees95['Condition'] == 'Good'])
print len(trees95[trees95['Condition'] == 'Excellent'])

327
38571
332562
100286


In [35]:
trees95.loc[trees95.Condition == 'Unknown'] = ''
trees95.loc[trees95.Condition == 'Dead'] = ''
trees95.loc[trees95.Condition == 'Stump'] = ''
trees95.loc[trees95.Condition == 'Planting Space'] = ''
trees95.loc[trees95.Condition == 'Shaft'] = ''
pd.unique(trees95['Condition'])

array(['', 'Good', 'Excellent', 'Poor', 'Fair', 'Critical'], dtype=object)

In [36]:
trees95.loc[trees95.Condition == 'Critical'] = 'Poor'


In [37]:
pd.unique(trees95['Condition'])

array(['', 'Good', 'Excellent', 'Poor', 'Fair'], dtype=object)

In [38]:
trees95.loc[trees95.Condition == 'Fair'] = 'Poor'
trees95.loc[trees95.Condition == 'Good'] = 'Fair'
trees95.loc[trees95.Condition == 'Excellent'] = 'Good'


In [39]:
pd.unique(trees95['Condition'])

array(['', 'Fair', 'Good', 'Poor'], dtype=object)

In [40]:
trees95.rename(index=str, columns={"Condition": "health"}, inplace=True)

In [43]:
pd.unique(trees95['health'])

array(['', 'Fair', 'Good', 'Poor'], dtype=object)

In [41]:
trees95.to_csv('../data/trees95.csv')

In [45]:
trees95.drop('Location', axis = 1, inplace=True)

In [51]:
trees95.to_csv('../data/trees95.csv', index_label=False)

### Exploring '05 Data

In [22]:
trees05.head()

Unnamed: 0,OBJECTID,cen_year,tree_dbh,tree_loc,pit_type,soil_lvl,status,spc_latin,spc_common,vert_other,...,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,x_sp,y_sp,objectid_1,Location 1
0,592373,2005,6,Front,Sidewalk Pit,Level,Good,PYRUS CALLERYANA,"PEAR, CALLERY",No,...,44,48,17,BK88,Borough Park,3021600.0,984182,169769,0,"1139 57 STREET\nNew York\n(40.632653207600001,..."
1,592374,2005,6,Across,Sidewalk Pit,Level,Good,PLATANUS ACERIFOLIA,LONDON PLANETREE,No,...,46,59,19,BK45,Georgetown-Marine Park-Bergen Beach-Mill Basin,3070600.0,1011608,165205,1,2220 BERGEN AVENUE\nNew York\n(40.620083746799...
2,592375,2005,13,Front,Continuous Pit,Level,Good,ACER PLATANOIDES CRIMSON KING,"MAPLE, NORWAY-CR KNG",No,...,46,59,19,BK45,Georgetown-Marine Park-Bergen Beach-Mill Basin,3070600.0,1012259,164445,2,2360 BERGEN AVENUE\nNew York\n(40.617995671700...
3,592376,2005,13,Across,Sidewalk Pit,Level,Good,PLATANUS ACERIFOLIA,LONDON PLANETREE,No,...,46,59,19,BK45,Georgetown-Marine Park-Bergen Beach-Mill Basin,3070600.0,1011733,165063,3,2254 BERGEN AVENUE\nNew York\n(40.619693599599...
4,592377,2005,15,Across,Sidewalk Pit,Level,Good,PLATANUS ACERIFOLIA,LONDON PLANETREE,No,...,46,59,19,BK45,Georgetown-Marine Park-Bergen Beach-Mill Basin,3070600.0,1012160,164564,4,2332 BERGEN AVENUE\nNew York\n(40.618322614400...


In [23]:
trees05.columns

Index([u'OBJECTID', u'cen_year', u'tree_dbh', u'tree_loc', u'pit_type',
       u'soil_lvl', u'status', u'spc_latin', u'spc_common', u'vert_other',
       u'vert_pgrd', u'vert_tgrd', u'vert_wall', u'horz_blck', u'horz_grate',
       u'horz_plant', u'horz_other', u'sidw_crack', u'sidw_raise',
       u'wire_htap', u'wire_prime', u'wire_2nd', u'wire_other', u'inf_canopy',
       u'inf_guard', u'inf_wires', u'inf_paving', u'inf_outlet', u'inf_shoes',
       u'inf_lights', u'inf_other', u'trunk_dmg', u'zipcode', u'zip_city',
       u'cb_num', u'borocode', u'boroname', u'cncldist', u'st_assem',
       u'st_senate', u'nta', u'nta_name', u'boro_ct', u'x_sp', u'y_sp',
       u'objectid_1', u'Location 1'],
      dtype='object')

In [53]:
pd.unique(trees05.objectid_1)

array(['Fair', 5, 'Good', ..., 592240, 592244, 592254], dtype=object)

In [24]:
pd.unique(trees05['status'])

array(['Good', 'Poor', 'Excellent', 'Dead'], dtype=object)

In [25]:
print len(trees05[trees05['status'] == 'Poor'])
print len(trees05[trees05['status'] == 'Good'])
print len(trees05[trees05['status'] == 'Excellent'])


49131
393464
141657


In [26]:
trees05.loc[trees05.status == 'Good'] = 'Fair'
trees05.loc[trees05.status == 'Excellent'] = 'Good'

print len(trees05[trees05['status'] == 'Fair'])
print len(trees05[trees05['status'] == 'Poor'])
print len(trees05[trees05['status'] == 'Good'])
print len(trees05[trees05['status'] == 'Excellent'])


393464
49131
141657
0


In [28]:
pd.unique(trees05['status'])

array(['Fair', 'Poor', 'Good', 'Dead'], dtype=object)

In [29]:
trees05.loc[trees05.status == 'Dead'] = ''


In [30]:
pd.unique(trees05['status'])

array(['Fair', 'Poor', 'Good', ''], dtype=object)

In [33]:
trees05.rename(index=str, columns={"status": "health"}, inplace=True)

In [48]:
trees05.drop('Location 1', axis=1, inplace=True)

In [50]:
trees05.to_csv('../data/trees05.csv', index_label=False)

### Exploring '15 

In [None]:
trees15.head()

In [None]:
trees15.columns

In [11]:
pd.unique(trees15.health)

array(['Good', ' ', 'Poor', 'Fair'], dtype=object)

### Attaching to Census Block

In [None]:
shp = gp.GeoDataFrame.from_file('../data/nycb2010_16b/nycb2010.shp')
