### Cleaning data gathered from plants.PlantRecommender.get_all_native_plants()

In [123]:
import pandas as pd
import numpy as np
import seaborn as sns
from plants import PlantRecommender
import warnings
# Suppress every Python warning for the remainder of the session.
# NOTE(review): this blanket filter also hides pandas warnings (for example
# index-realignment or chained-assignment warnings) — consider narrowing it.
warnings.filterwarnings('ignore')

In [329]:
# Load the untouched scrape; every later cell derives its columns from this frame.
raw_csv_path = 'all_native_plants_original.csv'
df = pd.read_csv(raw_csv_path)

In [254]:
# Splitting on a pattern that contains capture groups keeps the captured text,
# so each matching row yields: text before the match, first number, second number,
# unit, text after the match (columns 0-4). Rows that do not match stay whole in
# column 0.
height_pattern = r'^\D*([0-9]*\.*[0-9]*)\D*([0-9]*\.*[0-9]*) *(feet\b|ft\b|inches\b|in\b).*$'
height_cols = df['Plant Height'].str.split(height_pattern, expand=True)

In [255]:
# Columns 0 and 4 are the leftovers around the regex match; only the three
# captured groups (columns 1-3) are kept.
height_cols = height_cols.drop(columns=[0, 4])

In [256]:
# Give the positional capture-group columns descriptive names.
height_names = {1: 'Min Height', 2: 'Max Height', 3: 'Height Units'}
height_cols = height_cols.rename(columns=height_names)

In [258]:
# Same capture-group split as used for the height column: each matching row yields
# text before the match, first number, second number, unit, text after the match
# (columns 0-4).
spread_pattern = r'^\D*([0-9]*\.*[0-9]*)\D*([0-9]*\.*[0-9]*) *(feet\b|ft\b|inches\b|in\b).*$'
spread_cols = df['Plant Spread'].str.split(spread_pattern, expand=True)

In [259]:
# Discard the pieces outside the regex match (columns 0 and 4); the three captured
# groups remain.
spread_cols = spread_cols.drop(columns=[0, 4])

In [260]:
# Replace the positional capture-group labels with descriptive column names.
spread_names = {1: 'Min Spread', 2: 'Max Spread', 3: 'Spread Units'}
spread_cols = spread_cols.rename(columns=spread_names)

In [330]:
# Swap the two free-text dimension columns for their parsed min / max / unit columns.
# NOTE: this cell is not re-runnable on its own — once the raw columns are gone,
# dropping them again raises; reload `df` first.
df_without_raw_dims = df.drop(columns=['Plant Height', 'Plant Spread'])
df = pd.concat([df_without_raw_dims, height_cols, spread_cols], axis=1)

In [331]:
# to_drop = ['1"-2"', 'Bees', 'Birds', 'Blue', 'Bog gardening', 'Bumblebees',
#        'Butterflies','Cuttings: Stem', 'Deer Resistant', 
#        'Flower Time_Late spring or early summer',
#        'Flower Time_Late summer or early fall',
#        'Flower Time_Late winter or early spring', 'Flower Time_Spring',
#        'Flower Time_Summer', 'Flowers_Fragrant', 'Flowers_Inconspicuous',
#        'Flowers_Showy', 'Fruit_Edible to birds','Fruit_Showy','Layering', 
#         'Leaves_Fragrant','Needs excellent drainage in pots', 'Pink','Purple',
#         'Salt tolerant','Suitable in 3 gallon or larger', 'Under 1"', 'Various insects',
#         'White','Xeriscapic']

In [332]:
# Convert the regex-captured height strings to floats.
# The number groups in the pattern may match an empty string; map '' to NaN first
# so astype(float) cannot fail on it (the spread columns get the same treatment).
df['Min Height'] = np.where(df['Min Height']=='', np.nan, df['Min Height'])
df['Min Height'] = df['Min Height'].astype(float)
# When only one number was captured the second group is empty: reuse the minimum
# as the maximum.
df['Max Height'] = np.where(df['Max Height']=='', df['Min Height'], df['Max Height'])
df['Max Height'] = df['Max Height'].astype(float)

In [333]:
# Parse the captured spread strings into floats.
# A blank minimum capture becomes NaN ...
blank_min_spread = df['Min Spread'] == ''
df['Min Spread'] = df['Min Spread'].mask(blank_min_spread, np.nan).astype(float)
# ... and a blank maximum inherits the (already numeric) minimum.
blank_max_spread = df['Max Spread'] == ''
df['Max Spread'] = df['Max Spread'].mask(blank_max_spread, df['Min Spread']).astype(float)

In [265]:
# Instantiate the project's recommender; the following cells read its attribute lists.
# NOTE(review): the saved output of this cell is a TimeoutException
# ("connection refused"), so construction appears to need a reachable
# backend/driver — confirm before relying on Restart & Run All.
rec = PlantRecommender()

TimeoutException: Message: connection refused


In [125]:
# Fetch the recommender's `categorical_attributes` list and display it.
cat = rec.categorical_attributes
cat

['Genus', 'Species', 'Varieties', 'Life cycle']

In [126]:
# Fetch the recommender's `boolean_attributes` list and display it; a later cell
# uses it to select the columns that are coerced to True/False.
boolean = rec.boolean_attributes
boolean

['Coarse Soil',
 'Medium Soil',
 'Fine Soil',
 'Herb/Forb',
 'Shrub',
 'Tree',
 'Cactus/Succulent',
 'Grass/Grass-like',
 'Fern',
 'Vine',
 'Full Sun',
 'Full Sun to Partial Shade',
 'Partial or Dappled Shade',
 'Partial Shade to Full Shade',
 'Full Shade',
 'In Water',
 'Wet',
 'Wet Mesic',
 'Mesic',
 'Dry Mesic',
 'Dry',
 'Extremely acid (3.5 – 4.4)',
 'Very strongly acid (4.5 – 5.0)',
 'Strongly acid (5.1 – 5.5)',
 'Moderately acid (5.6 – 6.0)',
 'Slightly acid (6.1 – 6.5)',
 'Neutral (6.6 – 7.3)',
 'Slightly alkaline (7.4 – 7.8)',
 'Moderately alkaline (7.9 – 8.4)',
 'Strongly alkaline (8.5 – 9.0)',
 'Leaves_Good fall color',
 'Leaves_Glaucous',
 'Leaves_Unusual foliage color',
 'Leaves_Evergreen',
 'Leaves_Semi-evergreen',
 'Leaves_Deciduous',
 'Leaves_Fragrant',
 'Leaves_Malodorous',
 'Leaves_Variegated',
 'Leaves_Spring ephemeral',
 'Leaves_Needled',
 'Leaves_Broadleaf',
 'Leaves_Other',
 'Fruit_Showy',
 'Fruit_Edible to birds',
 'Fruit_Dehiscent',
 'Fruit_Indehiscent',
 'Fruit_

In [127]:
# Fetch the recommender's `numeric_attributes` list and display it.
num = rec.numeric_attributes
num

['Minimum cold hardiness',
 'Maximum recommended zone',
 'Plant Height',
 'Plant Spread',
 'Inflorescence Height',
 'Foliage Mound Height']

In [334]:
# Missing flags are treated as 'False', then the textual flags are mapped to real
# booleans.
# NOTE(review): the mapping keys are strings; presumably the CSV stores these flags
# as text — values held as numeric 0.0/1.0 would not be matched. Confirm.
flag_lookup = {'0.0': False, '1.0': True, 'False': False, 'True': True}
df[boolean] = df[boolean].fillna('False').replace(flag_lookup)

In [335]:
# Rescale minimum heights recorded in inches to feet (divide by 12); other rows are
# left unchanged.
# NOTE: not idempotent — re-running this cell divides the same rows by 12 again.
height_in_inches = df['Height Units'].isin(['inches', 'in'])
df['Min Height'] = np.where(height_in_inches, df['Min Height'] / 12.0, df['Min Height'])

In [336]:
# Rescale maximum heights recorded in inches to feet (divide by 12); other rows are
# left unchanged.
# NOTE: not idempotent — re-running this cell divides the same rows by 12 again.
height_in_inches = df['Height Units'].isin(['inches', 'in'])
df['Max Height'] = np.where(height_in_inches, df['Max Height'] / 12.0, df['Max Height'])

In [337]:
# Rescale minimum spreads recorded in inches to feet (divide by 12); other rows are
# left unchanged.
# NOTE: not idempotent — re-running this cell divides the same rows by 12 again.
spread_in_inches = df['Spread Units'].isin(['inches', 'in'])
df['Min Spread'] = np.where(spread_in_inches, df['Min Spread'] / 12.0, df['Min Spread'])

In [338]:
# Rescale maximum spreads recorded in inches to feet (divide by 12); other rows are
# left unchanged.
# The mask must come from 'Spread Units': a plant's height and spread can be given
# in different units, so keying on 'Height Units' converts the wrong rows.
# NOTE: not idempotent — re-running this cell divides the same rows by 12 again.
spread_in_inches = (df['Spread Units']=='inches') | (df['Spread Units']=='in')
df['Max Spread'] = np.where(spread_in_inches, df['Max Spread']/12.0, df['Max Spread'])

Note: all heights and spreads are now in feet.

In [339]:
# Peek at the first ten raw hardiness strings before the zone number is parsed out.
df['Minimum cold hardiness'].head(10)

0    Zone 5a -28.9 °C (-20 °F) to -26.1 °C (-15 °F)
1                                               NaN
2          Zone 3 -40 °C (-40 °F) to -37.2 °C (-35)
3          Zone 3 -40 °C (-40 °F) to -37.2 °C (-35)
4                                               NaN
5                                               NaN
6                                               NaN
7    Zone 4a -34.4 °C (-30 °F) to -31.7 °C (-25 °F)
8                                               NaN
9          Zone 7a -17.8 °C (0 °F) to -15 °C (5 °F)
Name: Minimum cold hardiness, dtype: object

In [350]:
# Keep only the first run of digits in each string (the zone number in values such
# as "Zone 5a ..."); the letter suffix and temperature range are discarded.
raw_min_hardiness = df['Minimum cold hardiness']
df['Minimum cold hardiness'] = raw_min_hardiness.str.extract(r'([0-9]+)')

In [355]:
# Keep only the first run of digits found in each maximum-zone string.
raw_max_zone = df['Maximum recommended zone']
df['Maximum recommended zone'] = raw_max_zone.str.extract(r'([0-9]+)')

In [359]:
# Cast both extracted zone columns to float so they can be compared numerically
# (float rather than int so missing values can stay NaN).
for zone_col in ('Minimum cold hardiness', 'Maximum recommended zone'):
    df[zone_col] = df[zone_col].astype(float)

In [360]:
# Persist the cleaned frame without the index column.
clean_csv_path = 'all_native_plants.csv'
df.to_csv(clean_csv_path, index=False)

In [368]:
# Select plants suited to the given hardiness zone: the minimum zone must not exceed
# it, and the maximum zone must either be unknown or reach it.
zone = 7
max_zone = df['Maximum recommended zone']
# NaN never compares equal to anything (including itself), so `== np.nan` is always
# False; isna() is required to actually keep rows with no recorded maximum zone.
plants = df[(df['Minimum cold hardiness']<=zone) &
            (max_zone.isna() | (max_zone>=zone))]

In [369]:
# Show (rows, columns) of the filtered selection.
plants.shape

(942, 265)

In [365]:
# Keep plants whose maximum recommended zone is unknown or at least `zone`.
# The mask is built from `plants` itself (not the full `df`) so its index matches
# the frame being filtered, and isna() replaces `== np.nan`, which is always False
# because NaN never compares equal.
plants_max_zone = plants['Maximum recommended zone']
plants = plants[plants_max_zone.isna() | (plants_max_zone>=zone)]

In [366]:
# Show (rows, columns) of the selection after the maximum-zone filter.
plants.shape

(942, 265)