#### Initial Set Up

In [1]:
# Import Dependencies
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Remove DataFrame display size restrictions (uncomment to show every row/column)
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Create path
path = "Resources/trees.csv"

# Read in csv
df = pd.read_csv(path)

# Grab original dimensions before clean
original_dimensions = df.shape
print(f'The original dimensions of the trees dataset (rows/columns): {original_dimensions}')

# Display first 5 records (kept as the cell's last expression so the notebook renders it;
# a bare expression in the middle of a cell is evaluated and silently discarded)
df.head()

The original dimensions of the trees dataset (rows/columns): (683788, 41)


#### Clean Up

In [2]:
# Parse the creation timestamps and keep only their year component as year_planted
created_index = pd.DatetimeIndex(df['created_at'])
df['year_planted'] = created_index.year

In [3]:
# Remove irrelevant columns
# Each label is listed exactly once and every entry is comma-separated: adjacent
# string literals without a comma are silently concatenated into one bogus label.
irrelevant_columns = [
    'tree_dbh', 'created_at', 'stump_diam', 'curb_loc', 'spc_latin', 'steward',
    'guards', 'sidewalk', 'user_type', 'problems', 'root_stone', 'root_grate',
    'root_other', 'trunk_wire', 'trnk_light', 'trnk_other', 'brch_light',
    'brch_shoe', 'brch_other', 'cb_num', 'cncldist', 'st_senate', 'st_assem',
    'zip_city', 'address', 'block_id', 'state', 'x_sp', 'y_sp',
]

# Index.difference keeps the labels that are NOT in the list and returns them
# sorted, so the surviving columns come out in alphabetical order.
df = df[df.columns.difference(irrelevant_columns)]

columns_removed = df.shape
print(f'The dimensions of the tree dataset after removing irrelevant columns: {columns_removed}')

The dimensions of the tree dataset after removing irrelevant columns: (683788, 13)


In [4]:
# Replace the abbreviated source column names with readable ones
readable_names = {
    'spc_common': 'species',
    'boroname': 'borough',
    'nta_name': 'neighborhood',
}
df = df.rename(columns=readable_names)

df

Unnamed: 0,boro_ct,borocode,borough,health,latitude,longitude,nta,neighborhood,species,status,tree_id,year_planted,zipcode
0,4125700,4,Queens,Good,40.724339,-73.805180,QN37,Kew Gardens Hills,green ash,Alive,606945,2016,11366
1,4030902,4,Queens,Good,40.756626,-73.894167,QN28,Jackson Heights,honeylocust,Alive,160321,2015,11370
2,4028800,4,Queens,Good,40.679777,-73.788463,QN76,Baisley Park,Callery pear,Alive,541347,2015,11434
3,3005000,3,Brooklyn,Good,40.622743,-74.037543,BK31,Bay Ridge,Callery pear,Alive,613930,2016,11209
4,4095400,4,Queens,Good,40.596514,-73.797622,QN12,Hammels-Arverne-Edgemere,'Schubert' chokecherry,Alive,18353,2015,11692
...,...,...,...,...,...,...,...,...,...,...,...,...,...
683783,3005300,3,Brooklyn,Poor,40.672566,-74.011473,BK33,Carroll Gardens-Columbia Street-Red Hook,purple-leaf plum,Alive,237788,2015,11231
683784,4157903,4,Queens,,40.730434,-73.710600,QN44,Glen Oaks-Floral Park-New Hyde Park,,Dead,249489,2015,11001
683785,3048200,3,Brooklyn,,40.633890,-73.969779,BK42,Flatbush,,Dead,230261,2015,11230
683786,4017800,4,Queens,Good,40.676190,-73.813135,QN55,South Ozone Park,northern red oak,Alive,623784,2016,11420


In [5]:
# Fill blank values of health column with dead or stump, based on value of status column.
# The result is assigned back to the column rather than using fillna(inplace=True) on
# the df['health'] selection: the in-place form mutates an intermediate object, which
# pandas warns about and which leaves df unchanged when Copy-on-Write is enabled.
df['health'] = df['health'].fillna(df['status'])

In [6]:
# Lowercase the borough, health, status and neighborhood text columns
# (other text columns, e.g. species, are left untouched)
for text_column in ("borough", "health", "status", "neighborhood"):
    df[text_column] = df[text_column].str.lower()

In [7]:
# Clean health column: values copied in from status are folded into the health
# vocabulary ('alive' becomes 'good', 'stump' becomes 'dead')
df['health'] = (
    df['health']
    .str.replace('alive', 'good')
    .str.replace('stump', 'dead')
)

#### Add Categorical Encoding & Binary Values - Status (alive, dead, stump)

In [8]:
# Store status as a pandas categorical column
df = df.assign(status=df['status'].astype('category'))

# Keep each row's integer category code in a separate status_tier column
df = df.assign(status_tier=df['status'].cat.codes)


In [9]:
# Build a one-hot encoder and fit it on the integer status codes
enc = OneHotEncoder(handle_unknown='ignore')
status_matrix = enc.fit_transform(df[['status_tier']])

# Convert the encoder output to a dense array and wrap it in a DataFrame;
# its columns get default integer labels (0, 1, 2 in the output below)
enc_df = pd.DataFrame(status_matrix.toarray())
enc_df.dtypes


0    float64
1    float64
2    float64
dtype: object

In [10]:
# Attach the one-hot status columns to the main frame, matching rows on the index
df = df.join(other=enc_df, how='left')
df

Unnamed: 0,boro_ct,borocode,borough,health,latitude,longitude,nta,neighborhood,species,status,tree_id,year_planted,zipcode,status_tier,0,1,2
0,4125700,4,queens,good,40.724339,-73.805180,QN37,kew gardens hills,green ash,alive,606945,2016,11366,0,1.0,0.0,0.0
1,4030902,4,queens,good,40.756626,-73.894167,QN28,jackson heights,honeylocust,alive,160321,2015,11370,0,1.0,0.0,0.0
2,4028800,4,queens,good,40.679777,-73.788463,QN76,baisley park,Callery pear,alive,541347,2015,11434,0,1.0,0.0,0.0
3,3005000,3,brooklyn,good,40.622743,-74.037543,BK31,bay ridge,Callery pear,alive,613930,2016,11209,0,1.0,0.0,0.0
4,4095400,4,queens,good,40.596514,-73.797622,QN12,hammels-arverne-edgemere,'Schubert' chokecherry,alive,18353,2015,11692,0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683783,3005300,3,brooklyn,poor,40.672566,-74.011473,BK33,carroll gardens-columbia street-red hook,purple-leaf plum,alive,237788,2015,11231,0,1.0,0.0,0.0
683784,4157903,4,queens,dead,40.730434,-73.710600,QN44,glen oaks-floral park-new hyde park,,dead,249489,2015,11001,1,0.0,1.0,0.0
683785,3048200,3,brooklyn,dead,40.633890,-73.969779,BK42,flatbush,,dead,230261,2015,11230,1,0.0,1.0,0.0
683786,4017800,4,queens,good,40.676190,-73.813135,QN55,south ozone park,northern red oak,alive,623784,2016,11420,0,1.0,0.0,0.0


In [11]:
# Cast the integer-labelled encoder columns to strings, then give them descriptive names
status_labels = {0: 'status_alive', 1: 'status_dead', 2: 'status_stump'}
df = df.astype({column: str for column in status_labels}).rename(columns=status_labels)

#### Add Categorical Encoding & Binary Values - Health (good, fair, poor, dead)

In [12]:
# Store health as a pandas categorical column
df = df.assign(health=df['health'].astype('category'))

# Keep each row's integer category code in a separate health_level column
df = df.assign(health_level=df['health'].cat.codes)


In [13]:
# Build a second one-hot encoder and fit it on the integer health codes
enc1 = OneHotEncoder(handle_unknown='ignore')
health_matrix = enc1.fit_transform(df[['health_level']])

# Dense DataFrame of the encoder output; columns get default integer labels
enc_df1 = pd.DataFrame(health_matrix.toarray())

# Attach the encoded columns on the index, cast them to strings and
# give them descriptive names
health_labels = {
    0: 'health_level_dead',
    1: 'health_level_fair',
    2: 'health_level_good',
    3: 'health_level_poor',
}
df = (
    df.join(enc_df1)
    .astype({column: str for column in health_labels})
    .rename(columns=health_labels)
)

In [14]:
# Label the row index 'index'
df = df.rename_axis('index')

In [15]:
# Display the frame after cleaning and encoding
df

Unnamed: 0_level_0,boro_ct,borocode,borough,health,latitude,longitude,nta,neighborhood,species,status,...,zipcode,status_tier,status_alive,status_dead,status_stump,health_level,health_level_dead,health_level_fair,health_level_good,health_level_poor
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4125700,4,queens,good,40.724339,-73.805180,QN37,kew gardens hills,green ash,alive,...,11366,0,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0
1,4030902,4,queens,good,40.756626,-73.894167,QN28,jackson heights,honeylocust,alive,...,11370,0,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0
2,4028800,4,queens,good,40.679777,-73.788463,QN76,baisley park,Callery pear,alive,...,11434,0,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0
3,3005000,3,brooklyn,good,40.622743,-74.037543,BK31,bay ridge,Callery pear,alive,...,11209,0,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0
4,4095400,4,queens,good,40.596514,-73.797622,QN12,hammels-arverne-edgemere,'Schubert' chokecherry,alive,...,11692,0,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683783,3005300,3,brooklyn,poor,40.672566,-74.011473,BK33,carroll gardens-columbia street-red hook,purple-leaf plum,alive,...,11231,0,1.0,0.0,0.0,3,0.0,0.0,0.0,1.0
683784,4157903,4,queens,dead,40.730434,-73.710600,QN44,glen oaks-floral park-new hyde park,,dead,...,11001,1,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0
683785,3048200,3,brooklyn,dead,40.633890,-73.969779,BK42,flatbush,,dead,...,11230,1,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0
683786,4017800,4,queens,good,40.676190,-73.813135,QN55,south ozone park,northern red oak,alive,...,11420,0,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0


In [16]:
# Export Clean DataFrame to CSV
output_path = Path("output/tree_data_clean.csv")

# Create the destination folder first: to_csv does not create missing directories
# and would otherwise fail when the notebook runs in a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)