# Feature Engineering

In [1]:
import pandas as pd

### Load Data from U.S. Fish & Wildlife Service

In [2]:
# Read the pickled FWS table into `species` and preview the first rows.
# NOTE: unpickling can execute arbitrary code, so only load a pickle
# that comes from a trusted source.
fws_pickle_path = "../Data/fws.pkl"
species = pd.read_pickle(fws_pickle_path)
species.head()

Unnamed: 0,Scientific Name,Common Name,Group,Region,Federal Listing Status,VIP,State
0,Accipiter gentilis,Northern goshawk,Birds,2,Not Listed,V,AL
1,Acipenser fulvescens,Lake sturgeon,Fishes,3,Not Listed,V,AL
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Fishes,4,Threatened,V,AL
3,Agarodes alabamensis,[Unnamed] caddisfly,Insects,4,Not Listed,I,AL
4,Agrimonia incisa,Incised groovebur,Flowering Plants,4,Not Listed,P,AL


In [3]:
# Remove the Region column; per the original note it is multicollinear
# with State, so it is not kept as a feature.
species = species.drop(columns=['Region'])
species.head()

Unnamed: 0,Scientific Name,Common Name,Group,Federal Listing Status,VIP,State
0,Accipiter gentilis,Northern goshawk,Birds,Not Listed,V,AL
1,Acipenser fulvescens,Lake sturgeon,Fishes,Not Listed,V,AL
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Fishes,Threatened,V,AL
3,Agarodes alabamensis,[Unnamed] caddisfly,Insects,Not Listed,I,AL
4,Agrimonia incisa,Incised groovebur,Flowering Plants,Not Listed,P,AL


### Create Dummy Variables for Categorical Data

In [4]:
# One-hot encode the categorical columns Group, VIP and State.
# dtype is pinned to uint8 so the indicators are always 0/1 integers:
# pandas >= 2.0 defaults to bool, while earlier versions defaulted to uint8,
# so pinning it keeps the saved frame identical across pandas versions.
species = pd.get_dummies(
    data=species,
    columns=['Group', 'VIP', 'State'],
    dtype="uint8",
)
print(species.shape)
species.head()

(11737, 76)


Unnamed: 0,Scientific Name,Common Name,Federal Listing Status,Group_Amphibians,Group_Annelid Worms,Group_Arachnids,Group_Birds,Group_Clams,Group_Conifers and Cycads,Group_Corals,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,Accipiter gentilis,Northern goshawk,Not Listed,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Acipenser fulvescens,Lake sturgeon,Not Listed,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Threatened,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Agarodes alabamensis,[Unnamed] caddisfly,Not Listed,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agrimonia incisa,Incised groovebur,Not Listed,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Indicator columns for species groups with representation below 1%
# (per the original note). Only these columns are removed; the rows
# themselves are kept.
rare_group_columns = [
    'Group_Annelid Worms',
    'Group_Arachnids',
    'Group_Conifers and Cycads',
    'Group_Corals',
    'Group_Flatworms and Roundworms',
    'Group_Hydroids',
    'Group_Lichens',
    'Group_Millipedes',
    'Group_Sponges',
]
species = species.drop(columns=rare_group_columns)
species.head()

Unnamed: 0,Scientific Name,Common Name,Federal Listing Status,Group_Amphibians,Group_Birds,Group_Clams,Group_Crustaceans,Group_Ferns and Allies,Group_Fishes,Group_Flowering Plants,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,Accipiter gentilis,Northern goshawk,Not Listed,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Acipenser fulvescens,Lake sturgeon,Not Listed,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Threatened,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Agarodes alabamensis,[Unnamed] caddisfly,Not Listed,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agrimonia incisa,Incised groovebur,Not Listed,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Indicator columns for states with representation below 1%
# (per the original note). Only these columns are removed; the rows
# themselves are kept.
rare_state_columns = [
    'State_AK',
    'State_CT',
    'State_DE',
    'State_IA',
    'State_KS',
    'State_LA',
    'State_MA',
    'State_MD',
    'State_ME',
    'State_MI',
    'State_MN',
    'State_MT',
    'State_ND',
    'State_NE',
    'State_NH',
    'State_NJ',
    'State_OH',
    'State_OK',
    'State_RI',
    'State_SD',
    'State_VT',
    'State_WI',
]
species = species.drop(columns=rare_state_columns)
species.head()

Unnamed: 0,Scientific Name,Common Name,Federal Listing Status,Group_Amphibians,Group_Birds,Group_Clams,Group_Crustaceans,Group_Ferns and Allies,Group_Fishes,Group_Flowering Plants,...,State_OR,State_PA,State_SC,State_TN,State_TX,State_UT,State_VA,State_WA,State_WV,State_WY
0,Accipiter gentilis,Northern goshawk,Not Listed,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Acipenser fulvescens,Lake sturgeon,Not Listed,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Threatened,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Agarodes alabamensis,[Unnamed] caddisfly,Not Listed,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agrimonia incisa,Incised groovebur,Not Listed,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Save Engineered Features

In [5]:
# Persist the engineered feature table to disk as a pickle.
species_pickle_path = "../Data/species.pkl"
species.to_pickle(species_pickle_path)