In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input file name; being a relative path, it is resolved against the current working directory.
RAW_CENSUS_CSV = '2015-street-tree-census-tree-data.csv'
df = pd.read_csv(RAW_CENSUS_CSV)

#### Take a peek.

In [3]:
# Size of the raw table as (rows, columns).
df.shape

(683788, 45)

In [5]:
# Count the missing values in each column of the raw table.
df.isna().sum()

tree_id                 0
block_id                0
created_at              0
tree_dbh                0
stump_diam              0
curb_loc                0
status                  0
health              31616
spc_latin           31619
spc_common          31619
steward             31615
guards              31616
sidewalk            31616
user_type               0
problems            31664
root_stone              0
root_grate              0
root_other              0
trunk_wire              0
trnk_light              0
trnk_other              0
brch_light              0
brch_shoe               0
brch_other              0
address                 0
postcode                0
zip_city                0
community board         0
borocode                0
borough                 0
cncldist                0
st_assem                0
st_senate               0
nta                     0
nta_name                0
boro_ct                 0
state                   0
latitude                0
longitude   

#### Drop columns that will not be used for analysis.

In [39]:
# Columns removed up front because the analysis below does not use them.
unused_columns = ['council district', 'census tract', 'bin', 'bbl']
df1 = df.drop(columns=unused_columns)

#### NAs!
There are a lot of NAs in *spc_latin*, *spc_common*, *health*, *steward*, *problems*, *guards*, and *sidewalk*. A noticeable thing is that these fields have a similar number of NAs. My initial guess is that only live trees (*status* = 'Alive') have this information. To test the theory, I split **df1** into two dataframes: **test_df**, which only has live trees, and **test_df2**, which only has dead trees or stumps.

In [12]:
# List the distinct values that appear in the status column.
df1['status'].unique()

array(['Alive', 'Stump', 'Dead'], dtype=object)

In [63]:
# Live trees only: keep the rows whose status is exactly 'Alive'.
alive_mask = df1['status'] == 'Alive'
test_df = df1.loc[alive_mask]

In [83]:
# Everything that is not a live tree (the 'Stump' and 'Dead' statuses).
# Take an explicit copy: this frame is modified later with .loc assignments,
# and writing into a frame derived from df1 by filtering would otherwise
# raise SettingWithCopyWarning and leave it unclear which object is changed.
test_df2 = df1[df1['status'] != 'Alive'].copy()

Check if the split is correct.

In [109]:
# The two subsets must account for every row of df1, with none lost or duplicated.
test_df.shape[0] + test_df2.shape[0] == df1.shape[0]

True

Check NAs in **test_df** and **test_df2**.

In [64]:
# Per-column missing-value counts for the live-tree subset.
test_df.isna().sum()

tree_id             0
block_id            0
created_at          0
tree_dbh            0
stump_diam          0
curb_loc            0
status              0
health              1
spc_latin           5
spc_common          5
steward             0
guards              1
sidewalk            1
user_type           0
problems           49
root_stone          0
root_grate          0
root_other          0
trunk_wire          0
trnk_light          0
trnk_other          0
brch_light          0
brch_shoe           0
brch_other          0
address             0
postcode            0
zip_city            0
community board     0
borocode            0
borough             0
cncldist            0
st_assem            0
st_senate           0
nta                 0
nta_name            0
boro_ct             0
state               0
latitude            0
longitude           0
x_sp                0
y_sp                0
dtype: int64

In [85]:
# Per-column missing-value counts for the non-alive subset.
test_df2.isna().sum()

tree_id                0
block_id               0
created_at             0
tree_dbh               0
stump_diam             0
curb_loc               0
status                 0
health             31615
spc_latin          31614
spc_common         31614
steward            31615
guards             31615
sidewalk           31615
user_type              0
problems           31615
root_stone             0
root_grate             0
root_other             0
trunk_wire             0
trnk_light             0
trnk_other             0
brch_light             0
brch_shoe              0
brch_other             0
address                0
postcode               0
zip_city               0
community board        0
borocode               0
borough                0
cncldist               0
st_assem               0
st_senate              0
nta                    0
nta_name               0
boro_ct                0
state                  0
latitude               0
longitude              0
x_sp                   0


In [110]:
# Number of non-alive records, to compare against the NA counts above.
len(test_df2)

31615

There are 31615 records in total in **test_df2**, and none of these records has a *health*, *steward*, *guards*, *sidewalk*, or *problems* value. All records, except for 1, also lack *spc_latin* and *spc_common* values. Let's take a look at this record.

In [86]:
# NOTE: despite its name, spc_null holds the rows where spc_common is NOT null,
# i.e. the non-alive records that still have a species recorded.
has_species = test_df2['spc_common'].notnull()
spc_null = test_df2.loc[has_species]

In [87]:
# spc_null comes from test_df2, which was already restricted to status != 'Alive',
# so this filter repeats that condition rather than narrowing the rows further.
not_alive = spc_null['status'] != 'Alive'
dead_but_has_spc = spc_null.loc[not_alive]

In [88]:
# Display the non-alive record(s) that have a species recorded.
dead_but_has_spc

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,st_assem,st_senate,nta,nta_name,boro_ct,state,latitude,longitude,x_sp,y_sp
263498,473703,102385,2015-11-19T00:00:00.000,12,0,OffsetFromCurb,Dead,,,,...,65,26,MN28,Lower East Side,1000600,New York,40.710384,-73.988208,987519.0479,198088.8568


As far as the research is concerned, only the species of live trees matter. Therefore, replace *spc_common* and *spc_latin* of the record with *tree_id* = 473703 with Null.

In [101]:
# Blank out both species fields for tree 473703 (modifies test_df2 in place).
is_target_tree = test_df2['tree_id'] == 473703
for species_col in ('spc_latin', 'spc_common'):
    test_df2.loc[is_target_tree, species_col] = None

In [102]:
# Re-display the record with tree_id 473703 to inspect its species fields.
test_df2[test_df2['tree_id'] == 473703]

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,st_assem,st_senate,nta,nta_name,boro_ct,state,latitude,longitude,x_sp,y_sp
263498,473703,102385,2015-11-19T00:00:00.000,12,0,OffsetFromCurb,Dead,,,,...,65,26,MN28,Lower East Side,1000600,New York,40.710384,-73.988208,987519.0479,198088.8568


In [103]:
# Recount the missing values per column in the non-alive subset.
test_df2.isna().sum()

tree_id                0
block_id               0
created_at             0
tree_dbh               0
stump_diam             0
curb_loc               0
status                 0
health             31615
spc_latin          31615
spc_common         31615
steward            31615
guards             31615
sidewalk           31615
user_type              0
problems           31615
root_stone             0
root_grate             0
root_other             0
trunk_wire             0
trnk_light             0
trnk_other             0
brch_light             0
brch_shoe              0
brch_other             0
address                0
postcode               0
zip_city               0
community board        0
borocode               0
borough                0
cncldist               0
st_assem               0
st_senate              0
nta                    0
nta_name               0
boro_ct                0
state                  0
latitude               0
longitude              0
x_sp                   0


Now that we are done with dead/stump trees, let's look at live trees. For live trees, no column should have missing values. Drop the row if there is an NA in one or more columns.

In [66]:
# Discard every live-tree row that has a missing value in any column.
test_df_cleaned = test_df.dropna(axis=0, how='any')

In [67]:
# (rows, columns) of the live-tree subset after dropping incomplete rows.
test_df_cleaned.shape

(652118, 41)

Check how many rows were dropped.

In [68]:
# Rows removed by dropna(), computed from the frames themselves rather than
# from hand-copied literals, so the figure stays correct if the input changes.
len(test_df) - len(test_df_cleaned)

55

In [69]:
# Per-column missing-value counts for the cleaned live-tree subset.
test_df_cleaned.isna().sum()

tree_id            0
block_id           0
created_at         0
tree_dbh           0
stump_diam         0
curb_loc           0
status             0
health             0
spc_latin          0
spc_common         0
steward            0
guards             0
sidewalk           0
user_type          0
problems           0
root_stone         0
root_grate         0
root_other         0
trunk_wire         0
trnk_light         0
trnk_other         0
brch_light         0
brch_shoe          0
brch_other         0
address            0
postcode           0
zip_city           0
community board    0
borocode           0
borough            0
cncldist           0
st_assem           0
st_senate          0
nta                0
nta_name           0
boro_ct            0
state              0
latitude           0
longitude          0
x_sp               0
y_sp               0
dtype: int64

Finally, after cleaning **test_df** and **test_df2**, union the two dataframes and reset the index.

In [104]:
# Stack the cleaned live trees on top of the non-alive records;
# ignore_index=True renumbers the rows 0..n-1 instead of keeping the old labels.
cleaned_parts = [test_df_cleaned, test_df2]
final_df = pd.concat(cleaned_parts, ignore_index=True)

In [105]:
# (rows, columns) of the combined table.
final_df.shape

(683733, 41)

In [112]:
# The combined table must contain every row of df1 except the live-tree rows
# removed by dropna(). Derived from the frames instead of hard-coded counts.
len(df1) - (len(test_df) - len(test_df_cleaned)) == len(final_df)

True

In [107]:
# Per-column missing-value counts for the combined table.
final_df.isna().sum()

tree_id                0
block_id               0
created_at             0
tree_dbh               0
stump_diam             0
curb_loc               0
status                 0
health             31615
spc_latin          31615
spc_common         31615
steward            31615
guards             31615
sidewalk           31615
user_type              0
problems           31615
root_stone             0
root_grate             0
root_other             0
trunk_wire             0
trnk_light             0
trnk_other             0
brch_light             0
brch_shoe              0
brch_other             0
address                0
postcode               0
zip_city               0
community board        0
borocode               0
borough                0
cncldist               0
st_assem               0
st_senate              0
nta                    0
nta_name               0
boro_ct                0
state                  0
latitude               0
longitude              0
x_sp                   0


#### Export and Done:)

In [113]:
# Write the cleaned table to disk; index=False keeps the row index out of the file.
CLEANED_CSV = 'NY Street Tree Cleaned.csv'
final_df.to_csv(CLEANED_CSV, index=False)