In [109]:
import pandas as pd

import os
import glob

import re

### Construct full 'national' Dataframe from all state CSV files 

In [110]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of the combined frame reproducible from run to run.
files = sorted(glob.glob('data/state_files/*.csv'))

In [111]:
# Collect one DataFrame per state file; the first line of each CSV is its header.
li = []

for file in files:
    df = pd.read_csv(file, header=0, index_col=None)
    li += [df]

In [112]:
# Stack the per-file frames row-wise and replace their indexes with a fresh 0..n-1 range.
frame = pd.concat(li, ignore_index=True)

### Extract the Genus
Functionally: the first capitalized 'word' in the name (an uppercase letter followed by lowercase letters)

In [113]:
def genus_extract(row):
    """Return the genus found in a row's 'Scientific Name with Author' value.

    The genus is taken to be the first capitalised word in the string: an
    uppercase ASCII letter followed by one or more lowercase ASCII letters.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (a DataFrame row or a plain dict).

    Returns
    -------
    str or None
        The matched genus, or None when the column is absent, the value is
        not a string (e.g. NaN), or no capitalised word is present.
    """
    try:
        name = row['Scientific Name with Author']
    except KeyError:
        return None
    # Missing values arrive as float NaN / None, which re.search cannot scan.
    if not isinstance(name, str):
        return None
    match = re.search('([A-Z][a-z]+)', name)
    return match[0] if match else None

In [114]:
# Run genus_extract once per row (axis=1) and store the result as a new column.
frame['Genus'] = frame.apply(genus_extract, axis=1)

### Extract full scientific names 
functionally: the Genus + all the lower case 'words' of the "scientific name" (Eliminating the author names)

In [32]:
# Preview the frame to spot-check the extracted Genus column.
frame

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,State Common Name,Family,State,Genus
0,ABFR,,Abies fraseri (Pursh) Poir.,Fraser fir,Pinaceae,Georgia,Abies
1,ABFR,PIFR,Pinus fraseri Pursh,,Pinaceae,Georgia,Pinus
2,ABIES,,Abies Mill.,fir,Pinaceae,Georgia,Abies
3,ABPR3,,Abrus precatorius L.,rosarypea,Fabaceae,Georgia,Abrus
4,ABPR3,ABAB2,"Abrus abrus (L.) W. Wight, nom. inval.",,Fabaceae,Georgia,Abrus
...,...,...,...,...,...,...,...
684951,ZIVEG,TOGR3,Toxicoscordion gramineum (Rydb.) Rydb.,,Liliaceae,South Dakota,Toxicoscordion
684952,ZIVEG,ZIGR2,Zigadenus gramineus Rydb.,,Liliaceae,South Dakota,Zigadenus
684953,ZIVEG,ZIIN,Zigadenus intermedius Rydb.,,Liliaceae,South Dakota,Zigadenus
684954,ZIZAN,,Zizania L.,wildrice,Poaceae,South Dakota,Zizania


In [115]:
def sci_names(row):
    """Build the author-free scientific name for a row.

    The result is the row's 'Genus' followed by every all-lowercase token
    (optionally ending in dots, e.g. 'var.' or 'nom.') that is preceded by
    whitespace in 'Scientific Name with Author', joined by single spaces.

    Parameters
    ----------
    row : mapping-like
        Must provide 'Scientific Name with Author' and 'Genus'.

    Returns
    -------
    str or None
        The assembled name, or None when either column is absent or either
        value is not a string (e.g. NaN / None).
    """
    try:
        name = row['Scientific Name with Author']
        genus = row['Genus']
    except KeyError:
        return None
    if not isinstance(name, str) or not isinstance(genus, str):
        return None
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    epithets = re.findall(r'\s([a-z]+\.*)', name)
    return ' '.join([genus, *epithets])

In [116]:
# Run sci_names once per row (axis=1) and store the result as a new column.
frame['Scientific name'] = frame.apply(sci_names, axis=1)

### Extract species names 

In [117]:
def species(row):
    """Return the second whitespace-separated word of 'Scientific name'.

    Parameters
    ----------
    row : mapping-like
        Must provide 'Scientific name'.

    Returns
    -------
    str or None
        The second word, or None when the column is absent, the value is not
        a string, or the name has fewer than two words.
    """
    try:
        sci_name = row['Scientific name']
    except KeyError:
        return None
    if not isinstance(sci_name, str):
        return None
    words = sci_name.split()
    return words[1] if len(words) > 1 else None

In [118]:
# Run species once per row (axis=1) and store the result as a new column.
frame['Species'] = frame.apply(species, axis=1)

### Extract infraspecific names 
functionally: the remainder of the "scientific name" after *Genus* and *species*

In [119]:
def infra(row):
    """Return everything after the first two words of 'Scientific name'.

    Parameters
    ----------
    row : mapping-like
        Must provide 'Scientific name'.

    Returns
    -------
    str or None
        The third and later words joined by single spaces, or None when the
        column is absent, the value is not a string, or there are no words
        beyond the second.
    """
    try:
        sci_name = row['Scientific name']
    except KeyError:
        return None
    if not isinstance(sci_name, str):
        return None
    extras = sci_name.split()[2:]
    # An empty join yields '', which is reported as None.
    return ' '.join(extras) or None

In [120]:
# Run infra once per row (axis=1) and store the result as a new column.
frame['Infraspecies name'] = frame.apply(infra, axis=1)

Infraspecific names could take the form of: 

"var.", "ssp.", "subsp.", "forma", "f.", or "nom. inval."  

and a regular expression to match them could look like: 

In [121]:
# gs_match = re.search('(var\.|ssp\.|subsp\.|f\.\svar\.|f\.|nom\.\sinval\.)\s[a-z]+', row['Scientific Name with Author'])

### Rearrange columns

In [127]:
# List the current column labels before reordering them.
frame.columns

Index(['Symbol', 'Synonym Symbol', 'Scientific Name with Author',
       'State Common Name', 'Family', 'State', 'Genus', 'Scientific name',
       'Species', 'Infraspecies name'],
      dtype='object')

In [130]:
# Preview the frame with the derived name columns in place.
frame

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,State Common Name,Family,State,Genus,Scientific name,Species,Infraspecies name
0,ABFR,,Abies fraseri (Pursh) Poir.,Fraser fir,Pinaceae,Georgia,Abies,Abies fraseri,fraseri,
1,ABFR,PIFR,Pinus fraseri Pursh,,Pinaceae,Georgia,Pinus,Pinus fraseri,fraseri,
2,ABIES,,Abies Mill.,fir,Pinaceae,Georgia,Abies,Abies,,
3,ABPR3,,Abrus precatorius L.,rosarypea,Fabaceae,Georgia,Abrus,Abrus precatorius,precatorius,
4,ABPR3,ABAB2,"Abrus abrus (L.) W. Wight, nom. inval.",,Fabaceae,Georgia,Abrus,Abrus abrus nom. inval.,abrus,nom. inval.
...,...,...,...,...,...,...,...,...,...,...
684951,ZIVEG,TOGR3,Toxicoscordion gramineum (Rydb.) Rydb.,,Liliaceae,South Dakota,Toxicoscordion,Toxicoscordion gramineum,gramineum,
684952,ZIVEG,ZIGR2,Zigadenus gramineus Rydb.,,Liliaceae,South Dakota,Zigadenus,Zigadenus gramineus,gramineus,
684953,ZIVEG,ZIIN,Zigadenus intermedius Rydb.,,Liliaceae,South Dakota,Zigadenus,Zigadenus intermedius,intermedius,
684954,ZIZAN,,Zizania L.,wildrice,Poaceae,South Dakota,Zizania,Zizania,,


In [150]:
# Desired left-to-right column layout: identifiers, then taxonomy, then
# the common name and state.
column_order = [
    'Symbol',
    'Synonym Symbol',
    'Family',
    'Scientific Name with Author',
    'Scientific name',
    'Genus',
    'Species',
    'Infraspecies name',
    'State Common Name',
    'State',
]
frame = frame[column_order]
frame

Unnamed: 0,Symbol,Synonym Symbol,Family,Scientific Name with Author,Scientific name,Genus,Species,Infraspecies name,State Common Name,State
0,ABFR,,Pinaceae,Abies fraseri (Pursh) Poir.,Abies fraseri,Abies,fraseri,,Fraser fir,Georgia
1,ABFR,PIFR,Pinaceae,Pinus fraseri Pursh,Pinus fraseri,Pinus,fraseri,,,Georgia
2,ABIES,,Pinaceae,Abies Mill.,Abies,Abies,,,fir,Georgia
3,ABPR3,,Fabaceae,Abrus precatorius L.,Abrus precatorius,Abrus,precatorius,,rosarypea,Georgia
4,ABPR3,ABAB2,Fabaceae,"Abrus abrus (L.) W. Wight, nom. inval.",Abrus abrus nom. inval.,Abrus,abrus,nom. inval.,,Georgia
...,...,...,...,...,...,...,...,...,...,...
684951,ZIVEG,TOGR3,Liliaceae,Toxicoscordion gramineum (Rydb.) Rydb.,Toxicoscordion gramineum,Toxicoscordion,gramineum,,,South Dakota
684952,ZIVEG,ZIGR2,Liliaceae,Zigadenus gramineus Rydb.,Zigadenus gramineus,Zigadenus,gramineus,,,South Dakota
684953,ZIVEG,ZIIN,Liliaceae,Zigadenus intermedius Rydb.,Zigadenus intermedius,Zigadenus,intermedius,,,South Dakota
684954,ZIZAN,,Poaceae,Zizania L.,Zizania,Zizania,,,wildrice,South Dakota
