#### Initial Set Up

In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime

# Remove dataFrame display size restrictions
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Create path
path = "resources/trees.csv"

# Read in csv
df = pd.read_csv(path)

# Display first 5 records
df.head()

# Grab original dimensions before clean
original_dimensions = df.shape
print(f'The original dimensions of the trees dataset (rows/columns): {original_dimensions}') 

The original dimensions of the trees dataset (rows/columns): (683788, 41)


#### Clean Up

In [3]:
# Ensure date planted only reflects the year planted
df['year_planted'] = pd.DatetimeIndex(df['created_at']).year

In [4]:
# Remove irrelevant columns
df = df[df.columns.difference(['tree_dbh', 'created_at', 'stump_diam', 'curb_loc', 'spc_latin', 'steward', 'guards', 'sidewalk', 'user_type', 'problems', 
                              'root_stone', 'root_grate', 'cb_num', 'cncldist', 'st_senate', 'zip_city', 'st_assem', 'address', 'block_id', 'root_other', 'trunk_wire', 'created_at' 'trnk_light', 'state', 'x_sp', 'y_sp', 'trnk_other',
       'brch_light', 'brch_shoe', 'brch_other'])]

columns_removed = df.shape
print(f'The dimensions of the tree dataset after removing irrelevant columns: {columns_removed}') 

The dimensions of the tree dataset after removing irrelevant columns: (683788, 14)


In [5]:
# Rename column names
df = df.rename(columns={'spc_common': 'species', "boroname": "borough", 'nta_name': 'neighborhood'})

df

Unnamed: 0,boro_ct,borocode,borough,health,latitude,longitude,nta,neighborhood,species,status,tree_id,trnk_light,year_planted,zipcode
0,4125700,4,Queens,Good,40.724339,-73.805180,QN37,Kew Gardens Hills,green ash,Alive,606945,No,2016,11366
1,4030902,4,Queens,Good,40.756626,-73.894167,QN28,Jackson Heights,honeylocust,Alive,160321,No,2015,11370
2,4028800,4,Queens,Good,40.679777,-73.788463,QN76,Baisley Park,Callery pear,Alive,541347,No,2015,11434
3,3005000,3,Brooklyn,Good,40.622743,-74.037543,BK31,Bay Ridge,Callery pear,Alive,613930,No,2016,11209
4,4095400,4,Queens,Good,40.596514,-73.797622,QN12,Hammels-Arverne-Edgemere,'Schubert' chokecherry,Alive,18353,No,2015,11692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683783,3005300,3,Brooklyn,Poor,40.672566,-74.011473,BK33,Carroll Gardens-Columbia Street-Red Hook,purple-leaf plum,Alive,237788,No,2015,11231
683784,4157903,4,Queens,,40.730434,-73.710600,QN44,Glen Oaks-Floral Park-New Hyde Park,,Dead,249489,No,2015,11001
683785,3048200,3,Brooklyn,,40.633890,-73.969779,BK42,Flatbush,,Dead,230261,No,2015,11230
683786,4017800,4,Queens,Good,40.676190,-73.813135,QN55,South Ozone Park,northern red oak,Alive,623784,No,2016,11420


In [6]:
# Fill blank values of health column with dead or stump, based on value of status column
df['health'].fillna(df['status'], inplace=True)

#### Add Categorical Encoding & Binary Values

In [7]:
# Convert type of columns to 'category'
df['status'] = df['status'].astype('category')

# # Assigning numerical values and store in another column
df['status_tier'] = df['status'].cat.codes


In [8]:
# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')
# passing bridge-types-cat column (label encoded values of bridge_types)
enc_df = pd.DataFrame(enc.fit_transform(df[['status_tier']]).toarray())


# merge with main df on key values
df = df.join(enc_df)
df

Unnamed: 0,boro_ct,borocode,borough,health,latitude,longitude,nta,neighborhood,species,status,tree_id,trnk_light,year_planted,zipcode,status_tier,0,1,2
0,4125700,4,Queens,Good,40.724339,-73.805180,QN37,Kew Gardens Hills,green ash,Alive,606945,No,2016,11366,0,1.0,0.0,0.0
1,4030902,4,Queens,Good,40.756626,-73.894167,QN28,Jackson Heights,honeylocust,Alive,160321,No,2015,11370,0,1.0,0.0,0.0
2,4028800,4,Queens,Good,40.679777,-73.788463,QN76,Baisley Park,Callery pear,Alive,541347,No,2015,11434,0,1.0,0.0,0.0
3,3005000,3,Brooklyn,Good,40.622743,-74.037543,BK31,Bay Ridge,Callery pear,Alive,613930,No,2016,11209,0,1.0,0.0,0.0
4,4095400,4,Queens,Good,40.596514,-73.797622,QN12,Hammels-Arverne-Edgemere,'Schubert' chokecherry,Alive,18353,No,2015,11692,0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683783,3005300,3,Brooklyn,Poor,40.672566,-74.011473,BK33,Carroll Gardens-Columbia Street-Red Hook,purple-leaf plum,Alive,237788,No,2015,11231,0,1.0,0.0,0.0
683784,4157903,4,Queens,Dead,40.730434,-73.710600,QN44,Glen Oaks-Floral Park-New Hyde Park,,Dead,249489,No,2015,11001,1,0.0,1.0,0.0
683785,3048200,3,Brooklyn,Dead,40.633890,-73.969779,BK42,Flatbush,,Dead,230261,No,2015,11230,1,0.0,1.0,0.0
683786,4017800,4,Queens,Good,40.676190,-73.813135,QN55,South Ozone Park,northern red oak,Alive,623784,No,2016,11420,0,1.0,0.0,0.0


In [9]:
df.dtypes

boro_ct            int64
borocode           int64
borough           object
health            object
latitude         float64
longitude        float64
nta               object
neighborhood      object
species           object
status          category
tree_id            int64
trnk_light        object
year_planted       int64
zipcode            int64
status_tier         int8
0                float64
1                float64
2                float64
dtype: object

In [None]:
# Export Clean DataFrame to CSV
df.to_csv("output/tree_data_clean.csv")