### Clean data from PFAF (Plants For A Future)

In [20]:
import pandas as pd
import re
import numpy as np
from collections import Counter

In [118]:
# Load the PFAF table from CSV.
# NOTE(review): a later cell writes the processed frame back to this same
# path ('pfaf_data.csv'), so a re-run would read already-processed data.
pfaf = pd.read_csv('pfaf_data.csv')

In [3]:
# Save a copy of the frame under a separate filename, without the index column.
pfaf.to_csv('pfaf_data_original.csv', index=False)

In [119]:
# Remove rows that are exact duplicates across all columns.
pfaf = pfaf.drop_duplicates()

In [89]:
# List the column names of the loaded table.
pfaf.columns

Index(['Care', 'Common Name', 'Cultivation details', 'Edibility Rating',
       'Edible Uses', 'Family', 'Found In', 'Genus', 'Habitats',
       'Known Hazards', 'Medicinal Rating', 'Medicinal Uses', 'Other Names',
       'Other Uses', 'Physical Characteristics', 'Propagation', 'Range',
       'Species', 'Summary', 'Synonyms', 'USDA hardiness', 'Weed Potential',
       'Unnamed: 22'],
      dtype='object')

In [90]:
# Print one "<column> <missing count>" line per column.
for column_name, n_missing in pfaf.isnull().sum().items():
    print(column_name, n_missing)

Care 0
Common Name 4009
Cultivation details 4007
Edibility Rating 4005
Edible Uses 4007
Family 4005
Found In 5432
Genus 0
Habitats 4112
Known Hazards 4005
Medicinal Rating 4005
Medicinal Uses 4005
Other Names 5435
Other Uses 4005
Physical Characteristics 4005
Propagation 4027
Range 4006
Species 0
Summary 5145
Synonyms 4823
USDA hardiness 4005
Weed Potential 5449
Unnamed: 22 5577


In [91]:
# Inspect the distinct raw values of the 'Edibility Rating' column.
pfaf['Edibility Rating'].unique()

array([nan, '   (1 of 5)', '   (3 of 5)', '   (0 of 5)', '   (2 of 5)',
       '   (4 of 5)', '   (5 of 5)'], dtype=object)

In [120]:
# Replace every missing value with an empty string.
pfaf = pfaf.fillna('')
# Blank ratings become '0'. Assign the result back to the column: calling
# replace(..., inplace=True) on `pfaf[col]` acts on a temporary selection and
# does not reach the frame under pandas Copy-on-Write.
pfaf['Edibility Rating'] = pfaf['Edibility Rating'].replace('', '0')

In [121]:
# Turn rating strings such as '   (3 of 5)' into the integer 3.
pfaf['Edibility Rating'] = (
    pfaf['Edibility Rating']
    .astype(str)                             # tolerate values that are already numeric
    .str.extract(r'([0-9])', expand=False)   # expand=False -> Series, not a 1-column frame
    .fillna('0')                             # values with no digit count as rating 0
    .astype(int)
)

In [122]:
# Look at one raw 'Edible Uses' entry (second row by position) to see its layout.
pfaf['Edible Uses'].iloc[1]

'Edible Parts: Inner bark\nEdible Uses: Gum  Tea\n\nYoung shoot tips are used as a substitute for tea[177, 183]. The pitch obtained from the bark can be hardened (probably by immersing it in cold water[K]) and used as a chewing gum[257]. Inner bark[257]. No further information is given, but inner bark can be dried, ground into a powder and then used with grain flours etc to make bread and other preparations[257].'

In [123]:
# Split each 'Edible Uses' entry on newlines into positional columns, carry the
# Genus/Species labels along, and blank out the cells a shorter entry leaves empty.
edible_cols = (
    pfaf['Edible Uses']
    .str.split('\n', expand=True)
    .assign(Genus=pfaf['Genus'], Species=pfaf['Species'])
    .fillna('')
)

In [143]:
# Count whitespace-separated tokens on the 'Edible Parts' line (column 0) and
# on the 'Edible Uses' line (column 1). The header words are counted as well.
edible_parts_counts = Counter()
edible_uses_counts = Counter()

parts_lines = edible_cols.loc[edible_cols[0].str.contains('Edible Parts'), 0]
uses_lines = edible_cols.loc[edible_cols[1].str.contains('Edible Uses'), 1]

for tokens in parts_lines.str.split():
    edible_parts_counts.update(tokens)
for tokens in uses_lines.str.split():
    edible_uses_counts.update(tokens)

edible_parts_counts

Counter({'Edible': 1188,
         'Parts:': 1188,
         'Inner': 59,
         'bark': 59,
         'Root': 282,
         'Seed': 361,
         'Seedpod': 42,
         'Sap': 35,
         'Leaves': 400,
         'Flowers': 155,
         'Fruit': 331,
         'Stem': 86,
         'Nectar': 5,
         'Apical': 1,
         'bud': 1,
         'Oil': 80,
         'Shoots': 1,
         'Manna': 3,
         'Pollen': 3})

In [134]:
# Show the rows whose first split line mentions 'Nectar'.
edible_cols[edible_cols[0].str.contains('Nectar')]

Unnamed: 0,0,1,2,3,Genus,Species
106,Edible Parts: Nectar Seed,Edible Uses:,,Seed - cooked. Said to be as sweet as a chestn...,Aesculus,flava
1658,Edible Parts: Fruit Nectar,Edible Uses:,,"Fruit - raw[60, 105]. A sweet flavour[61]. The...",Comandra,umbellata
2858,Edible Parts: Nectar,Edible Uses: Tea,,The plant has been boiled up as a tea[257]. Th...,Ipomopsis,aggregata
3132,Edible Parts: Fruit Nectar,Edible Uses:,,"Fruit - raw or cooked[105, 177, 212]. Not tast...",Lonicera,ciliosa
3326,Edible Parts: Fruit Nectar,Edible Uses:,,"Fruit - fresh or dried[61, 105, 183]. The frui...",Menziesia,ferruginea


In [135]:
# Full text of split column 3 for the row labelled 2858.
edible_cols[3].loc[2858]

'The plant has been boiled up as a tea[257]. The nectar is sucked from the flowers by children[257].'

In [136]:
# Show the rows whose first split line mentions 'Manna'.
edible_cols[edible_cols[0].str.contains('Manna')]

Unnamed: 0,0,1,2,3,Genus,Species
2987,Edible Parts: Manna Sap,Edible Uses: Gum Sweetener,,"A gum, or resin, is produced under the bark. I...",Larix,occidentalis
4303,Edible Parts: Inner bark Manna,Edible Uses: Coffee Condiment Gum Tea,,Young shoot tips - used as a flavouring in coo...,Pseudotsuga,menziesii
4653,Edible Parts: Flowers Inner bark Manna,Edible Uses: Tea,,A honeydew can be obtained from the cut branch...,Salix,gooddingii


In [137]:
# Show the rows whose first split line mentions 'Pollen'.
edible_cols[edible_cols[0].str.contains('Pollen')]

Unnamed: 0,0,1,2,3,Genus,Species
4761,Edible Parts: Pollen Root Seed Stem,Edible Uses:,,Root - raw or cooked[172]. Rich in starch. Ste...,Scirpus,microcarpus
5378,Edible Parts: Flowers Leaves Oil Pollen Ro...,Edible Uses: Oil,,Roots - raw or cooked[145]. Rich in starch[105...,Typha,domingensis
5379,Edible Parts: Flowers Leaves Oil Pollen Ro...,Edible Uses: Oil,,"Roots - raw or cooked[2, 12]. They can be boil...",Typha,latifolia


In [138]:
# Preview rows whose first line is neither an 'Edible Parts:' header nor one of
# the placeholder values, i.e. it already holds free-text description.
first_line = edible_cols[0]
has_parts_header = first_line.str.contains('Edible Parts:')
is_placeholder = first_line.isin(['', 'None known', '0'])
first_line[~(has_parts_header | is_placeholder)].head()

76     Leaves - cooked[105]. Some caution is advised,...
101         Seeds[257]. No further information is given.
192    The heart of the plant is very rich in sacchar...
278        Catkins - raw or cooked. A bitter taste[172].
441    Young leaves - cooked[177]. Used as a potherb[...
Name: 0, dtype: object

In [139]:
# Where the first line is free text (no 'Edible Parts:' header and not a
# placeholder), copy it into column 3; all other rows keep their column 3 value.
first_line = edible_cols[0]
free_text_rows = ~(
    first_line.str.contains('Edible Parts:')
    | first_line.isin(['', 'None known', '0'])
)
edible_cols[3] = edible_cols[3].mask(free_text_rows, first_line)

In [140]:
# Spot-check the row labelled 76 (one of the free-text rows previewed earlier).
edible_cols.loc[76]

0          Leaves - cooked[105]. Some caution is advised,...
1                                                           
2                                                           
3          Leaves - cooked[105]. Some caution is advised,...
Genus                                                 Actaea
Species                                             racemosa
Name: 76, dtype: object

In [146]:
# Boolean flags for each edible plant part, based on case-insensitive substring
# matches against the first split line of 'Edible Uses' (column 0).
parts_text = edible_cols[0].str.lower()

pfaf['Edible inner bark'] = parts_text.str.contains('inner bark')
pfaf['Edible roots'] = parts_text.str.contains('root')
# A plain 'seed' match also fires on 'seedpod', which would flag seedpod-only
# plants as having edible seeds; the negative lookahead keeps the flags distinct.
pfaf['Edible seeds'] = parts_text.str.contains(r'seed(?!pod)')
pfaf['Edible seedpods'] = parts_text.str.contains('seedpod')
# 'Manna' is folded into the sap flag.
pfaf['Edible sap'] = parts_text.str.contains('sap|manna')
pfaf['Edible leaves'] = parts_text.str.contains('leaves')
pfaf['Edible flowers'] = parts_text.str.contains('flowers')
pfaf['Edible fruit'] = parts_text.str.contains('fruit')
pfaf['Edible stems'] = parts_text.str.contains('stem')
pfaf['Edible oil'] = parts_text.str.contains('oil')
# Shoots are detected from the free-text description (column 3) instead of column 0.
pfaf['Edible shoots'] = edible_cols[3].str.lower().str.contains('shoots')

In [144]:
# Display the token counts gathered from the 'Edible Uses' lines.
edible_uses_counts

Counter({'Edible': 1188,
         'Uses:': 1188,
         'Gum': 61,
         'Tea': 174,
         'Condiment': 120,
         'Drink': 38,
         'Sweetener': 36,
         'Oil': 80,
         'Pectin': 7,
         'Colouring': 7,
         'Coffee': 58,
         'Chocolate': 4,
         'Milk': 2,
         'Curdling': 3,
         'agent': 3,
         'Rutin': 1,
         'Salt': 1,
         'Stabilizer': 1,
         'Gelatine': 1})

In [149]:
# Boolean flags for the edible-use categories, matched case-insensitively
# against the second split line of 'Edible Uses' (column 1).
uses_text = edible_cols[1].str.lower()

pfaf['Edible use gum'] = uses_text.str.contains('gum')
# Tea, drink and coffee share a single 'drink' flag.
pfaf['Edible use drink'] = uses_text.str.contains('tea|drink|coffee')
pfaf['Edible use condiment'] = uses_text.str.contains('condiment')
pfaf['Edible use sweetener'] = uses_text.str.contains('sweetener')
pfaf['Edible use oil'] = uses_text.str.contains('oil')

In [164]:
# Store split column 3 as its own column; in the rows inspected above it holds
# the free-text part of 'Edible Uses'.
pfaf['Edible description'] = edible_cols[3]

In [160]:
# Remove the raw 'Edible Uses' column.
pfaf = pfaf.drop(columns='Edible Uses')

In [150]:
# Write the frame to CSV without the index column.
# NOTE(review): this is the same path the notebook loads from, so the input
# file is overwritten with the processed table.
pfaf.to_csv('pfaf_data.csv', index=False)

In [151]:
# Inspect the distinct raw values of the 'Medicinal Rating' column.
pfaf['Medicinal Rating'].unique()

array(['', '   (2 of 5)', '   (5 of 5)', '   (3 of 5)', '   (1 of 5)',
       '   (0 of 5)', '   (4 of 5)'], dtype=object)

In [153]:
# Turn rating strings such as '   (2 of 5)' into the integer 2; blanks become 0.
# The result is assigned back to the column: a chained
# `pfaf[col].replace(..., inplace=True)` acts on a temporary selection and does
# not reach the frame under pandas Copy-on-Write.
pfaf['Medicinal Rating'] = (
    pfaf['Medicinal Rating']
    .astype(str)                             # tolerate values that are already numeric
    .replace('', '0')
    .str.extract(r'([0-9])', expand=False)   # expand=False -> Series, not a 1-column frame
    .fillna('0')                             # values with no digit count as rating 0
    .astype(int)
)

In [158]:
# Count how often each ';'-separated habitat entry occurs. Entries are counted
# verbatim, so leading spaces are part of the key.
habitats = pfaf['Habitats'].copy()
habitats_counts = Counter()
for entries in habitats.str.split(';'):
    habitats_counts.update(entries)
habitats_counts

Counter({'': 5497,
         'Woodland Garden Canopy': 194,
         ' not Deep Shade': 82,
         ' Cultivated Beds': 545,
         ' East Wall. By. South Wall. By. West Wall. By.': 11,
         ' South Wall. By. West Wall. By.': 24,
         'Woodland Garden Secondary': 54,
         ' Dappled Shade': 335,
         ' Sunny Edge': 62,
         ' Ground Cover': 91,
         ' Lawn': 8,
         ' Meadow': 28,
         'Woodland Garden Dappled Shade': 193,
         ' Secondary': 35,
         ' Shady Edge': 272,
         'Woodland Garden Sunny Edge': 446,
         ' South Wall. By.': 2,
         ' Bog Garden': 181,
         ' Hedge': 39,
         ' Hedgerow': 8,
         'Woodland Garden Cultivated Beds': 1,
         ' Pond': 73,
         ' East Wall. In. South Wall. In. West Wall. In.': 8,
         'Woodland Garden Shady Edge': 1,
         ' North Wall. In. East Wall. In. West Wall. In.': 4,
         ' East Wall. In. West Wall. In.': 1,
         ' East Wall. By. South Wall. By.': 22,
  

In [159]:
# New boolean column -> substring searched for in the habitat text.
# NOTE(review): 'Hedge' also matches 'Hedgerow' entries - confirm that is intended.
habitat_keywords = {
    'Canopy': 'Canopy',
    'Understory': 'Secondary',
    'Cultivated Beds': 'Cultivated Beds',
    'Dappled Shade': 'Dappled Shade',
    'Sunny Edge': 'Sunny Edge',
    'Ground Cover': 'Ground Cover',
    'Meadow': 'Meadow',
    'Shade Edge': 'Shady Edge',
    'Bog': 'Bog',
    'Hedge': 'Hedge',
    'Pond': 'Pond',
}
for flag_column, keyword in habitat_keywords.items():
    pfaf[flag_column] = habitats.str.contains(keyword)

In [161]:
# Remove the raw 'Habitats' column.
pfaf = pfaf.drop(columns='Habitats')

In [163]:
# Look at one raw 'Medicinal Uses' entry (second row by position) to see its layout.
pfaf['Medicinal Uses'].iloc[1]

"Plants For A Future can not take any responsibility for any adverse effects from the use of plants. Always seek advice from a professional before using a plant medicinally.\n\n\nThis plant was used quite widely by native North American Indians. An infusion of the bark was used as a tonic and to treat stomach ailments, TB, haemorrhoids and various minor complaints[257]. The pitch, or resin, was also used to treat colds, sore throats etc[257]. The bark of this tree contains blisters that are filled with a resin called 'Canadian Balsam'[226]. Although the report does not mention the uses of this resin, it can almost certainly be used in the same ways as the resin of A. balsamea, as detailed below:- The resin obtained from this tree (see 'Uses notes' below) has been used throughout the world and is a very effective antiseptic and healing agent. It is used as a healing and analgesic protective covering for burns, bruises, wounds and sores[213, 222, 226]. It is also used to treat sore nippl

In [165]:
# Split each 'Medicinal Uses' entry on newlines into positional columns, carry
# the Genus/Species labels along, and blank out the cells left empty.
medicinal_cols = (
    pfaf['Medicinal Uses']
    .str.split('\n', expand=True)
    .assign(Genus=pfaf['Genus'], Species=pfaf['Species'])
    .fillna('')
)

In [166]:
# Preview the first rows of the split medicinal text.
medicinal_cols.head()

Unnamed: 0,0,1,2,3,4,5,Genus,Species
0,,,,,,,Abutilon,abutiloides
1,Plants For A Future can not take any responsib...,,,This plant was used quite widely by native Nor...,,,Abies,amabilis
2,Plants For A Future can not take any responsib...,Analgesic Antiscorbutic Antiseptic Diuretic...,,,The resin obtained from the balsam fir (see 'U...,,Abies,balsamea
6,,,,,,,Abutilon,berlandieri
7,,,,,,,Abies,bracteata


In [167]:
# Distinct values found in the first split line.
medicinal_cols[0].unique()

array(['',
       'Plants For A Future can not take any responsibility for any adverse effects from the use of plants. Always seek advice from a professional before using a plant medicinally.'],
      dtype=object)

In [172]:
# Count whitespace-separated tokens in split columns 1 and 2 of the medicinal text.
medicinal_counter = Counter()
for column in (1, 2):
    for tokens in medicinal_cols[column].str.split():
        medicinal_counter.update(tokens)
medicinal_counter

Counter({'Analgesic': 86,
         'Antiscorbutic': 19,
         'Antiseptic': 93,
         'Diuretic': 224,
         'Poultice': 221,
         'Stimulant': 65,
         'Tonic': 174,
         'VD': 52,
         'Antirheumatic': 85,
         'Pectoral': 71,
         'TB': 40,
         'Cathartic': 40,
         'Diaphoretic': 130,
         'Emetic': 89,
         'Laxative': 91,
         'Ophthalmic': 95,
         'Skin': 72,
         'Stomachic': 132,
         'Antihalitosis': 1,
         'Deodorant': 4,
         'Foot': 6,
         'care': 6,
         'Miscellany': 72,
         'Astringent': 248,
         'Antiemetic': 16,
         'Birthing': 44,
         'aid': 44,
         'Galactogogue': 25,
         'Antidiarrhoeal': 17,
         'Antiinflammatory': 30,
         'Antispasmodic': 76,
         'Appetizer': 35,
         'Aromatic': 14,
         'Carminative': 49,
         'Antipruritic': 15,
         'Emmenagogue': 47,
         'Hypnotic': 19,
         'Oxytoxic': 14,
         'Kidne

In [173]:
# Tokens to leave out of the medicinal counter; a later cell filters its keys
# against this list.
to_remove = """
    Alterative Aromatherapy Bach Homeopathy
    Antiscorbutic Balsamic Tonic
    tonic Blood purifier Cardiotonic
    Antibilous Cholagogue Uterine complaints
    Antihalitosis care Miscellany aid
    Aromatic Deobstruent Acrid Enuresis
    Nutritive
""".split()

In [175]:
# Fold the counts of synonymous tokens into one target key and queue the
# synonyms for removal. Each entry is (target, source tokens, accumulate):
#   accumulate=True  -> the sources are added to the target's existing count
#   accumulate=False -> the target is set to the sum of the sources
# NOTE: this cell is not idempotent - re-running it adds the counts and the
# removal entries a second time.
synonym_merges = [
    ('Laxative', ['Aperient', 'Cathartic'], True),
    ('Purgative', ['Hydrogogue'], True),
    ('Cancer', ['Antitumor', 'Cytostatic', 'Cytotoxic', 'Resolvent'], True),
    ('Fever', ['Antipyretic', 'Febrifuge'], False),
    ('Stimulant', ['Stimulants'], True),
    ('Birthing', ['Oxytoxic'], True),
    ('Female misc', ["Women's"], False),
]
for target, sources, accumulate in synonym_merges:
    merged_count = sum(medicinal_counter[token] for token in sources)
    if accumulate:
        medicinal_counter[target] += merged_count
    else:
        medicinal_counter[target] = merged_count
    to_remove.extend(sources)

In [176]:
# Keep only the tokens that are not queued for removal. The result is a plain
# dict, no longer a Counter.
excluded_tokens = set(to_remove)
medicinal_counter = {
    token: count
    for token, count in medicinal_counter.items()
    if token not in excluded_tokens
}

In [178]:
# One boolean column per remaining medicinal use. A plant is flagged when the
# use name, or any token whose count was merged into it, appears in split
# column 1 or 2 of its medicinal text. Matching the target name alone would
# leave merged-only keys such as 'Fever' and 'Female misc' all False and would
# miss e.g. 'Aperient' plants under 'Laxative'.
merged_synonyms = {
    'Laxative': ['Aperient', 'Cathartic'],
    'Purgative': ['Hydrogogue'],
    'Cancer': ['Antitumor', 'Cytostatic', 'Cytotoxic', 'Resolvent'],
    'Fever': ['Antipyretic', 'Febrifuge'],
    'Stimulant': ['Stimulants'],
    'Birthing': ['Oxytoxic'],
    'Female misc': ["Women's"],
}
for use in medicinal_counter.keys():
    terms = [use] + merged_synonyms.get(use, [])
    # Escape each term so it is matched literally rather than as a regex.
    pattern = '|'.join(re.escape(term) for term in terms)
    pfaf[use] = (medicinal_cols[1].str.contains(pattern) |
                 medicinal_cols[2].str.contains(pattern))

In [179]:
# Concatenate split columns 3-5 into a single description string. The
# parentheses let the expression span two lines; a bare trailing '+' followed
# by a newline is a SyntaxError.
pfaf['Medicinal description'] = (
    medicinal_cols[3] + medicinal_cols[4] + medicinal_cols[5]
)

False    5491
True       86
Name: Analgesic, dtype: int64

In [181]:
# Frequency of each distinct value in split column 5.
medicinal_cols[5].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        