# NYC Trees

### Importing the necessary libraries

In [254]:
import pandas as pd
import numpy as np

### Importing the csv file

In [255]:
df = pd.read_csv('assets/data_100000.csv')
df.head()

Unnamed: 0,created_at,tree_id,block_id,the_geom,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,...,st_assem,st_senate,nta,nta_name,boro_ct,state,latitude,longitude,x_sp,y_sp
0,08/27/2015,180683,348711,POINT (-73.84421521958048 40.723091773924274),3,0,OnCurb,Alive,Fair,Acer rubrum,...,28,16,QN17,Forest Hills,4073900,New York,40.723092,-73.844215,1027431.0,202756.768749
1,09/03/2015,200540,315986,POINT (-73.81867945834878 40.79411066708779),21,0,OnCurb,Alive,Fair,Quercus palustris,...,27,11,QN49,Whitestone,4097300,New York,40.794111,-73.818679,1034456.0,228644.837379
2,09/05/2015,204026,218365,POINT (-73.93660770459083 40.717580740099116),3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,...,50,18,BK90,East Williamsburg,3044900,New York,40.717581,-73.936608,1001823.0,200716.891267
3,09/05/2015,204337,217969,POINT (-73.93445615919741 40.713537494833226),10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,...,53,18,BK90,East Williamsburg,3044900,New York,40.713537,-73.934456,1002420.0,199244.253136
4,08/30/2015,189565,223043,POINT (-73.97597938483258 40.66677775537875),21,0,OnCurb,Alive,Good,Tilia americana,...,44,21,BK37,Park Slope-Gowanus,3016500,New York,40.666778,-73.975979,990913.8,182202.425999


### Changing the `created_at` column to the correct date type

In [256]:
df['created_at'] = pd.to_datetime(df['created_at'])
df['day'] = df['created_at'].dt.day
df['month'] = df['created_at'].dt.month
df['year'] = df['created_at'].dt.year
df[['created_at', 'day', 'month', 'year']].head()

Unnamed: 0,created_at,day,month,year
0,2015-08-27,27,8,2015
1,2015-09-03,3,9,2015
2,2015-09-05,5,9,2015
3,2015-09-05,5,9,2015
4,2015-08-30,30,8,2015


### Dropping the ID columns

In [258]:
df = df.drop(columns = ['tree_id', 'block_id'])

### Changing Curb Loc to binary (0,1) 
    0 - OnCurb
    1 - OffsetFromCurb

In [262]:
df['curb_loc'].replace({ "OnCurb" : 0, "OffsetFromCurb" : 1} , inplace = True)

In [263]:
df['curb_loc'].value_counts()

0    96129
1     3871
Name: curb_loc, dtype: int64

### Changing Status to numerical
    0 - Alive
    1 - Stump
    2 - Dead

In [267]:
df['status'].replace({ "Alive" : 0, "Stump" : 1, "Dead": 2 } , inplace = True)

In [268]:
df['status'].value_counts()

0    95008
1     2831
2     2161
Name: status, dtype: int64

### Changing Health to numerical
    0 - Good
    1 - Fair
    2 - Poor

In [276]:
df['health'].replace({ "Good" : 0, "Fair" : 1, "Poor": 2 } , inplace = True)

In [279]:
df['health'].value_counts()

0.0    72340
1.0    17266
2.0     5401
Name: health, dtype: int64

### Checking the SPC latin column and creating a feature SPC latin family


In [282]:
def latin_family(i):
    plant = str(i)
    if plant == "nan": 
        return i
    else:
        family = plant.split(" ")[0]
        return family

In [283]:
df['spc_latin_family'] = df['spc_latin'].apply(latin_family)
df['spc_latin_family'].value_counts().head(20)

Acer             12526
Quercus          12259
Platanus         11976
Gleditsia        10648
Pyrus             8886
Tilia             7671
Prunus            4758
Zelkova           4379
Ginkgo            3629
Styphnolobium     3557
Fraxinus          2809
Ulmus             2394
Liquidambar       1274
Malus              622
Koelreuteria       611
Syringa            541
Carpinus           486
Cornus             440
Metasequoia        418
Cercis             417
Name: spc_latin_family, dtype: int64

### Checking the `SPC common` column and creating a feature SPC latin family


In [284]:
def plant_family(i):
    plant = str(i)
    if plant == "nan": 
        return i
    else:
        family = plant.split(" ")[-1]
        return family

In [285]:
df['spc_common_family'] = df['spc_common'].apply(plant_family)
df['spc_common_family'].value_counts()

maple           12512
oak             12259
planetree       11976
honeylocust     10648
pear             8886
                ...  
buckeye             9
larch               9
smoketree           7
alder               5
Osage-orange        4
Name: spc_common_family, Length: 66, dtype: int64

### Changing the steward  column to numerical
        0 = None  (not recorded for stump or dead trees)
        1 = 1or2
        2 = 2or4        
        3 = 4orMore

In [286]:
df['steward'].replace({ "None" : 0, "1or2" : 1, "3or4" : 2, "4orMore" : 3} , inplace = True)
df['steward'].tail()

99995    1.0
99996    1.0
99997    1.0
99998    0.0
99999    1.0
Name: steward, dtype: float64

### Changing the guards column to numerical
        0 = None  (not recorded for stump or dead trees)
        1 = Helpful
        2 = Harmful     
        3 = Unsure

In [287]:
df['guards'].replace({ "None" : 0, "Helpful" : 1, "Harmful" : 2, "Unsure" : 3} , inplace = True)
df['guards'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: guards, dtype: float64

### Changing the sidewalk column to numerical
        0 = NoDamage
        1 = Damage

In [288]:
df['sidewalk'].replace({ "NoDamage" : 0, "Damage" : 1} , inplace = True)
df['sidewalk'].head()

0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: sidewalk, dtype: float64

### Changing the sidewalk column to numerical
        0 = Volunteer
        1 = TreesCount Staff
        2 = NYC Parks Staff

In [289]:
df['user_type'].replace({ "Volunteer" : 0, "TreesCount Staff" : 1, "NYC Parks Staff" : 2} , inplace = True)
df['user_type'].head()

0    1
1    1
2    0
3    0
4    0
Name: user_type, dtype: int64

### Checking problems column

In [290]:
df['problems'].value_counts()

None                                             55672
Stones                                           16121
BranchLights                                      4398
Stones,BranchLights                               2949
RootOther                                         2250
                                                 ...  
RootOther,WiresRope,TrunkLights,BranchLights         1
Sneakers,BranchOther                                 1
MetalGrates,WiresRope,BranchLights                   1
MetalGrates,RootOther,TrunkOther,BranchLights        1
BranchLights,Sneakers,BranchOther                    1
Name: problems, Length: 151, dtype: int64

### Checking the `root_stone, root_grate, root_other` and combining them into `root_problems` column

In [291]:
root_columns = ['root_stone', 'root_grate', 'root_other']

df['root_problems'] = df[root_columns].any(axis=1)
df['root_problems'] = df['root_problems'].apply(lambda x : "yes" if x else "no")
df['root_problems'].tail()

99995    yes
99996    yes
99997    yes
99998    yes
99999    yes
Name: root_problems, dtype: object

Now changing the value to numerical:
    
    0 = no
    1 = yes

In [292]:
df['root_problems'].replace({ "no" : 0, "yes" : 1} , inplace = True)
df['root_problems'].head()

0    1
1    1
2    1
3    1
4    1
Name: root_problems, dtype: int64

Now we can delete the 3 root columns we dont need.

In [293]:
df = df.drop(columns = ['root_stone', 'root_grate', 'root_other'])

### Checking the `trnk_wire, trnk_light, trnk_other` and combining them into `trunk_problems` column

In [294]:
trunk_columns = ['trnk_wire', 'trnk_light', 'trnk_other']

df['trunk_problems'] = df[trunk_columns].any(axis=1)
df['trunk_problems'] = df['trunk_problems'].apply(lambda x : "yes" if x else "no")
df['trunk_problems'].head()

0    yes
1    yes
2    yes
3    yes
4    yes
Name: trunk_problems, dtype: object

Now changing the value to numerical:
    
    0 = no
    1 = yes

In [295]:
df['trunk_problems'].replace({ "no" : 0, "yes" : 1} , inplace = True)
df['trunk_problems'].head()

0    1
1    1
2    1
3    1
4    1
Name: trunk_problems, dtype: int64

Now we can delete the 3 trunk columns we dont need.

In [296]:
df = df.drop(columns = ['trnk_wire', 'trnk_light', 'trnk_other'])

### Checking the `'brnch_ligh', 'brnch_shoe', 'brnch_other'` and combining them into `branch_problems` column

In [298]:
branch_columns = ['brnch_ligh', 'brnch_shoe', 'brnch_othe']

df['branch_problems'] = df[branch_columns].any(axis=1)
df['branch_problems'] = df['branch_problems'].apply(lambda x : "yes" if x else "no")
df['branch_problems'].head()

0    yes
1    yes
2    yes
3    yes
4    yes
Name: branch_problems, dtype: object

Now changing the value to numerical:
    
    0 = no
    1 = yes

In [299]:
df['branch_problems'].replace({ "no" : 0, "yes" : 1} , inplace = True)
df['branch_problems'].head()

0    1
1    1
2    1
3    1
4    1
Name: branch_problems, dtype: int64

Now we can delete the 3 branch columns we dont need.

In [300]:
df = df.drop(columns =  ['brnch_ligh', 'brnch_shoe', 'brnch_othe'])

### Checking the address columns

In [301]:
df.columns

Index(['created_at', 'the_geom', 'tree_dbh', 'stump_diam', 'curb_loc',
       'status', 'health', 'spc_latin', 'spc_common', 'steward', 'guards',
       'sidewalk', 'user_type', 'problems', 'address', 'zipcode', 'zip_city',
       'cb_num', 'borocode', 'boroname', 'cncldist', 'st_assem', 'st_senate',
       'nta', 'nta_name', 'boro_ct', 'state', 'latitude', 'longitude', 'x_sp',
       'y_sp', 'day', 'month', 'year', 'spc_latin_family', 'spc_common_family',
       'root_problems', 'trunk_problems', 'branch_problems'],
      dtype='object')

In [302]:
df = df.drop(columns =  ['state'])

In [305]:
df.dtypes

created_at           datetime64[ns]
the_geom                     object
tree_dbh                      int64
stump_diam                    int64
curb_loc                      int64
status                        int64
health                      float64
spc_latin                    object
spc_common                   object
steward                     float64
guards                      float64
sidewalk                    float64
user_type                     int64
problems                     object
address                      object
zipcode                       int64
zip_city                     object
cb_num                        int64
borocode                      int64
boroname                     object
cncldist                      int64
st_assem                      int64
st_senate                     int64
nta                          object
nta_name                     object
boro_ct                       int64
latitude                    float64
longitude                   

### Saving 1st version to CSV

In [308]:
df.to_csv(r'datasets\NYC-Trees-1st-version', index = False)

### Changing the problems column

In [309]:
df.columns

Index(['created_at', 'the_geom', 'tree_dbh', 'stump_diam', 'curb_loc',
       'status', 'health', 'spc_latin', 'spc_common', 'steward', 'guards',
       'sidewalk', 'user_type', 'problems', 'address', 'zipcode', 'zip_city',
       'cb_num', 'borocode', 'boroname', 'cncldist', 'st_assem', 'st_senate',
       'nta', 'nta_name', 'boro_ct', 'latitude', 'longitude', 'x_sp', 'y_sp',
       'day', 'month', 'year', 'spc_latin_family', 'spc_common_family',
       'root_problems', 'trunk_problems', 'branch_problems'],
      dtype='object')

In [311]:
def check_prob(i):
    prob = str(i)
    if prob == "nan": 
        return i
    elif prob == "None":
        return i
    else:
        problems = []
        li = prob.split(",")
        for i in li:
            if "stone" in i.lower():
                problems.append("Stones")
            elif "trunk" in i.lower():
                problems.append("Trunk")
            elif "branch" in i.lower():
                problems.append("Branch")
            elif "root" in i.lower():
                problems.append("Root")
            elif "wires" in i.lower():
                problems.append("Wires")
            elif "metal" in i.lower():
                problems.append("Metal")
            elif "sneaker" in i.lower():
                problems.append("Sneakers")
            else:
                problems.append(i)
        
        prob1 = []
        for i in problems:
            if i not in prob1:
                prob1.append(i)
                
        sorted_prob = sorted(prob1)
     
        probs = ', '.join(map(str, sorted_prob))
        return probs

In [312]:
df['problem(s)'] = df['problems'].apply(check_prob)
df['problem(s)'].value_counts().head(20)

None                           55672
Stones                         16121
Branch                          6189
Branch, Stones                  3954
Root                            2250
Trunk                           1914
Stones, Trunk                   1002
Root, Stones                     941
Branch, Root                     836
Branch, Trunk                    757
Wires                            658
Branch, Stones, Trunk            575
Metal                            574
Branch, Root, Trunk              522
Branch, Root, Stones             427
Branch, Wires                    394
Root, Trunk                      345
Branch, Stones, Wires            336
Stones, Wires                    306
Branch, Root, Stones, Trunk      214
Name: problem(s), dtype: int64

Changing the values to numerical values:

    0 = None                           
    1 = Stones                         
    2 = Branch                         
    3 = Branch, Stones                 
    4 = Root                           
    5 = Trunk                          
    6 = Stones, Trunk                  
    7 = Root, Stones                   
    8 = Branch, Root                   
    9 = Branch, Trunk                  
    10 = Wires                         
    11 = Branch, Stones, Trunk         
    12 = Others

### Updating the null values using Imputer median

In [135]:
df.isnull().sum().sort_values(ascending=False)

health               4993
sidewalk             4992
spc_common_family    4992
spc_latin_family     4992
spc_latin            4992
spc_common           4992
steward              4992
guards               4992
problems             4992
day                     0
boro_ct                 0
latitude                0
longitude               0
x_sp                    0
y_sp                    0
created_at              0
month                   0
year                    0
nta                     0
root_problems           0
trunk_problems          0
nta_name                0
boroname                0
st_senate               0
st_assem                0
cncldist                0
the_geom                0
borocode                0
cb_num                  0
zip_city                0
zipcode                 0
address                 0
user_type               0
status                  0
curb_loc                0
stump_diam              0
tree_dbh                0
branch_problems         0
dtype: int64

In [137]:
from sklearn.impute import SimpleImputer

In [160]:
df3 = df.filter(['Health','sidewalk', 'spc_common_family',
                 'spc_latin_family',
                 'steward', 'guards'], axis = 1)
df3.isnull().sum()

sidewalk             4992
spc_common_family    4992
spc_latin_family     4992
steward              4992
guards               4992
dtype: int64

In [164]:
imr = SimpleImputer(missing_values=np.nan, strategy='median')
print("1")
#imr.fit(df3)
imr.fit(df3)
print("2")
imputed_data = imr.transform(df3.values)

1


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'apple'

In [140]:
df.isnull().sum().sort_values(ascending=False)

health               4993
sidewalk             4992
spc_common_family    4992
spc_latin_family     4992
spc_latin            4992
spc_common           4992
steward              4992
guards               4992
problems             4992
day                     0
boro_ct                 0
latitude                0
longitude               0
x_sp                    0
y_sp                    0
created_at              0
month                   0
year                    0
nta                     0
root_problems           0
trunk_problems          0
nta_name                0
boroname                0
st_senate               0
st_assem                0
cncldist                0
the_geom                0
borocode                0
cb_num                  0
zip_city                0
zipcode                 0
address                 0
user_type               0
status                  0
curb_loc                0
stump_diam              0
tree_dbh                0
branch_problems         0
dtype: int64