### Cleaning data gathered from `plants.PlantRecommender.get_all_native_plants()`

In [123]:
import pandas as pd
import numpy as np
import seaborn as sns
from plants import PlantRecommender
import warnings
warnings.filterwarnings('ignore')

In [158]:
# Load the native-plant table from its CSV export into the working frame.
NATIVE_PLANTS_CSV = 'all_native_plants.csv'
df = pd.read_csv(NATIVE_PLANTS_CSV)

In [36]:
# Parse the free-text 'Plant Height' column into (min, max, unit) pieces.
#
# The pattern is anchored at both ends and has exactly three capture groups,
# so a matching string splits into five parts: the (empty) text before the
# match, the three groups, and the (empty) text after it.  Columns 1-3 are
# therefore min / max / unit; a row that does not match leaves them empty.
#
# Group 1: first number (required)
# Group 2: second number (optional -> '' when the text gives a single figure)
# Group 3: unit token, one of feet / ft / inches / in
#
# Each number may carry a decimal part.  With integer-only digit runs a value
# such as "1.5 feet" is read as min=1, max=5, and a range such as
# "2-3.5 feet" fails to match at all.
# NOTE(review): mixed-unit text such as "6 inches to 2 feet" still yields one
# unit (the last one) for both numbers.
height_pattern = (
    r'^\D*([0-9]+(?:\.[0-9]+)?)\D*((?:[0-9]+(?:\.[0-9]+)?)?)'
    r' *(feet\b|ft\b|inches\b|in\b).*$'
)
height_cols = df['Plant Height'].str.split(height_pattern, expand=True)

In [38]:
# Columns 0 and 4 are the split remainders on either side of the captured
# groups; only columns 1-3 are kept.
height_cols = height_cols.drop(columns=[0, 4])

In [41]:
# Give the three captured groups readable labels.
height_labels = {1: 'Min Height', 2: 'Max Height', 3: 'Units'}
height_cols = height_cols.rename(columns=height_labels)

In [42]:
# Parse the free-text 'Plant Spread' column into (min, max, unit) pieces.
#
# The pattern is anchored at both ends and has exactly three capture groups,
# so a matching string splits into five parts: the (empty) text before the
# match, the three groups, and the (empty) text after it.  Columns 1-3 are
# therefore min / max / unit; a row that does not match leaves them empty.
#
# Group 1: first number (required)
# Group 2: second number (optional -> '' when the text gives a single figure)
# Group 3: unit token, one of feet / ft / inches / in
#
# Each number may carry a decimal part.  With integer-only digit runs a value
# such as "1.5 feet" is read as min=1, max=5, and a range such as
# "2-3.5 feet" fails to match at all.
# NOTE(review): mixed-unit text such as "6 inches to 2 feet" still yields one
# unit (the last one) for both numbers.
spread_pattern = (
    r'^\D*([0-9]+(?:\.[0-9]+)?)\D*((?:[0-9]+(?:\.[0-9]+)?)?)'
    r' *(feet\b|ft\b|inches\b|in\b).*$'
)
spread_cols = df['Plant Spread'].str.split(spread_pattern, expand=True)

In [43]:
# Columns 0 and 4 are the split remainders on either side of the captured
# groups; only columns 1-3 are kept.
spread_cols = spread_cols.drop(columns=[0, 4])

In [44]:
# Give the three captured groups readable labels.
spread_labels = {1: 'Min Spread', 2: 'Max Spread', 3: 'Units'}
spread_cols = spread_cols.rename(columns=spread_labels)

In [45]:
# Preview the parsed spread columns; rows with no parsed value show up empty.
spread_cols

Unnamed: 0,Min Spread,Max Spread,Units
0,,,
1,,,
2,15,25,feet
3,15,25,feet
4,,,
...,...,...,...
5591,,,
5592,,,
5593,,,
5594,,,


In [159]:
# Swap the two free-text size columns for their parsed min / max / unit
# counterparts, aligned on the row index.
# NOTE(review): height_cols and spread_cols each carry a column labelled
# 'Units', so the combined frame ends up with a duplicated 'Units' label.
df = pd.concat(
    [
        df.drop(columns=['Plant Height', 'Plant Spread']),
        height_cols,
        spread_cols,
    ],
    axis=1,
)

In [100]:
# Column labels collected for removal (going by the name; the list is not
# used by any of the cells visible here).
# NOTE(review): 'Fruit_Edible to birds', 'Fruit_Showy' and 'Leaves_Fragrant'
# also appear in the recommender's boolean attribute list shown further down
# -- confirm the intended order before dropping them from the frame.
to_drop = [
    '1"-2"',
    'Bees',
    'Birds',
    'Blue',
    'Bog gardening',
    'Bumblebees',
    'Butterflies',
    'Cuttings: Stem',
    'Deer Resistant',
    'Flower Time_Late spring or early summer',
    'Flower Time_Late summer or early fall',
    'Flower Time_Late winter or early spring',
    'Flower Time_Spring',
    'Flower Time_Summer',
    'Flowers_Fragrant',
    'Flowers_Inconspicuous',
    'Flowers_Showy',
    'Fruit_Edible to birds',
    'Fruit_Showy',
    'Layering',
    'Leaves_Fragrant',
    'Needs excellent drainage in pots',
    'Pink',
    'Purple',
    'Salt tolerant',
    'Suitable in 3 gallon or larger',
    'Under 1"',
    'Various insects',
    'White',
    'Xeriscapic',
]

In [160]:
# Convert the parsed height strings to floats.
min_height = df['Min Height'].astype(float)
raw_max_height = df['Max Height']

# An empty-string maximum means only one figure was captured, so the minimum
# doubles as the maximum for that row.
filled_max_height = np.where(raw_max_height == '', min_height, raw_max_height)

df['Min Height'] = min_height
df['Max Height'] = filled_max_height
df['Max Height'] = df['Max Height'].astype(float)

In [161]:
# Convert the parsed spread strings to floats.
min_spread = df['Min Spread'].astype(float)
raw_max_spread = df['Max Spread']

# An empty-string maximum means only one figure was captured, so the minimum
# doubles as the maximum for that row.
filled_max_spread = np.where(raw_max_spread == '', min_spread, raw_max_spread)

df['Min Spread'] = min_spread
df['Max Spread'] = filled_max_spread
df['Max Spread'] = df['Max Spread'].astype(float)

In [124]:
# Instantiate the recommender; the following cells read its attribute-name lists.
rec = PlantRecommender()

In [125]:
# List exposed by the recommender as `categorical_attributes` (displayed below).
cat = rec.categorical_attributes
cat

['Genus', 'Species', 'Varieties', 'Life cycle']

In [126]:
# List exposed by the recommender as `boolean_attributes`; later cells use it
# to select the flag columns of df.
boolean = rec.boolean_attributes
boolean

['Coarse Soil',
 'Medium Soil',
 'Fine Soil',
 'Herb/Forb',
 'Shrub',
 'Tree',
 'Cactus/Succulent',
 'Grass/Grass-like',
 'Fern',
 'Vine',
 'Full Sun',
 'Full Sun to Partial Shade',
 'Partial or Dappled Shade',
 'Partial Shade to Full Shade',
 'Full Shade',
 'In Water',
 'Wet',
 'Wet Mesic',
 'Mesic',
 'Dry Mesic',
 'Dry',
 'Extremely acid (3.5 – 4.4)',
 'Very strongly acid (4.5 – 5.0)',
 'Strongly acid (5.1 – 5.5)',
 'Moderately acid (5.6 – 6.0)',
 'Slightly acid (6.1 – 6.5)',
 'Neutral (6.6 – 7.3)',
 'Slightly alkaline (7.4 – 7.8)',
 'Moderately alkaline (7.9 – 8.4)',
 'Strongly alkaline (8.5 – 9.0)',
 'Leaves_Good fall color',
 'Leaves_Glaucous',
 'Leaves_Unusual foliage color',
 'Leaves_Evergreen',
 'Leaves_Semi-evergreen',
 'Leaves_Deciduous',
 'Leaves_Fragrant',
 'Leaves_Malodorous',
 'Leaves_Variegated',
 'Leaves_Spring ephemeral',
 'Leaves_Needled',
 'Leaves_Broadleaf',
 'Leaves_Other',
 'Fruit_Showy',
 'Fruit_Edible to birds',
 'Fruit_Dehiscent',
 'Fruit_Indehiscent',
 'Fruit_

In [127]:
# List exposed by the recommender as `numeric_attributes` (displayed below).
# NOTE(review): per the displayed output this list includes 'Plant Height' and
# 'Plant Spread', which an earlier cell drops from df -- selecting df[num]
# as-is would presumably raise a KeyError; confirm before using it on df.
num = rec.numeric_attributes
num

['Minimum cold hardiness',
 'Maximum recommended zone',
 'Plant Height',
 'Plant Spread',
 'Inflorescence Height',
 'Foliage Mound Height']

In [163]:
# Treat a missing flag as "not set".  The fill value is the *string* 'False';
# a later cell collapses strings and real booleans into proper bools.
flag_columns = df[boolean]
df[boolean] = flag_columns.fillna('False')

# Spot-check two of the flag columns.
df[['Tree', 'Shrub']].head(20)

Unnamed: 0,Tree,Shrub
0,False,1.0
1,True,False
2,True,False
3,True,False
4,False,True
5,True,False
6,True,False
7,True,False
8,False,False
9,False,True


In [164]:
# Some flags arrive as the strings '0.0' / '1.0' (see the 'Shrub' value in
# row 0 of the previous preview); map those onto real booleans.
numeric_flag_strings = {'0.0': False, '1.0': True}
df[boolean] = df[boolean].replace(numeric_flag_strings)

# Spot-check two of the flag columns.
df[['Tree', 'Shrub']].head(20)

Unnamed: 0,Tree,Shrub
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True
5,True,False
6,True,False
7,True,False
8,False,False
9,False,True


In [172]:
# thing = df[['Tree', 'Shrub']]
# df[['Tree', 'Shrub']] = thing

In [173]:
def _flag_is_set(value):
    """Return True when `value` is the string 'True' or compares equal to True."""
    # `== True` (not `is True`) is deliberate: it also accepts values that
    # merely compare equal to True.
    return value == 'True' or value == True


# Collapse every flag cell to a plain boolean.
df[boolean] = df[boolean].applymap(_flag_is_set)

# Spot-check two of the flag columns.
df[['Tree', 'Shrub']].head(20)

Unnamed: 0,Tree,Shrub
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True
5,True,False
6,True,False
7,True,False
8,False,False
9,False,True


In [174]:
# Rows flagged as a shrub but not as a tree.
not_tree = df['Tree'].eq(False)
is_shrub = df['Shrub'].eq(True)
shrubs = df.loc[not_tree & is_shrub]