# Feature Engineering

In [1]:
import pandas as pd

### Load Data from U.S. Fish & Wildlife Service

In [2]:
# Read the pickled U.S. Fish & Wildlife Service species table and preview it.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by a trusted source.
fws_pickle = "../Data/fws.pkl"
species = pd.read_pickle(fws_pickle)
species.head(n=5)

Unnamed: 0,Scientific Name,Common Name,Group,Region,Federal Listing Status,VIP,State
0,Accipiter gentilis,Northern goshawk,Birds,2,Not Listed,V,AL
1,Acipenser fulvescens,Lake sturgeon,Fishes,3,Not Listed,V,AL
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Fishes,4,Threatened,V,AL
3,Agarodes alabamensis,[Unnamed] caddisfly,Insects,4,Not Listed,I,AL
4,Agrimonia incisa,Incised groovebur,Flowering Plants,4,Not Listed,P,AL


In [3]:
# Remove the 'Region' column from the species table, then preview the result.
species = species.drop(columns=['Region'])
species.head(n=5)

Unnamed: 0,Scientific Name,Common Name,Group,Federal Listing Status,VIP,State
0,Accipiter gentilis,Northern goshawk,Birds,Not Listed,V,AL
1,Acipenser fulvescens,Lake sturgeon,Fishes,Not Listed,V,AL
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Fishes,Threatened,V,AL
3,Agarodes alabamensis,[Unnamed] caddisfly,Insects,Not Listed,I,AL
4,Agrimonia incisa,Incised groovebur,Flowering Plants,Not Listed,P,AL


### Load Data from U.S. Forest Service

In [4]:
# Read the pickled U.S. Forest Service land-area table and preview it.
forests_pickle = "../Data/forests.pkl"
forests = pd.read_pickle(forests_pickle)
forests.head(n=5)

Unnamed: 0,State,Total Land Area (Thousands of Acres),Forest Land Area (Thousands of Acres)
0,CT,3099,1712
1,DE,1247,340
2,IL,35532,4848
3,IN,22929,4830
4,IA,35749,3014


In [5]:
# Attach the land-area columns to every species row by matching on 'State'.
# This is an inner join (the pandas default): species rows whose State has no
# entry in `forests` are dropped from the result.
species = pd.merge(species, forests, how='inner', on='State')
species.head(n=5)

Unnamed: 0,Scientific Name,Common Name,Group,Federal Listing Status,VIP,State,Total Land Area (Thousands of Acres),Forest Land Area (Thousands of Acres)
0,Accipiter gentilis,Northern goshawk,Birds,Not Listed,V,AL,32413,22877
1,Acipenser fulvescens,Lake sturgeon,Fishes,Not Listed,V,AL,32413,22877
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Fishes,Threatened,V,AL,32413,22877
3,Agarodes alabamensis,[Unnamed] caddisfly,Insects,Not Listed,I,AL,32413,22877
4,Agrimonia incisa,Incised groovebur,Flowering Plants,Not Listed,P,AL,32413,22877


### Load Data from U.S. Environmental Protection Agency

In [6]:
# Read the pickled U.S. Environmental Protection Agency air-quality table and preview it.
aqi_pickle = "../Data/aqi.pkl"
aqi = pd.read_pickle(aqi_pickle)
aqi.head(n=5)

Unnamed: 0_level_0,Days with AQI,Good Days
State ID,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,357,282
AL,324,251
AR,362,261
AZ,365,261
CA,365,203


In [7]:
# Attach the air-quality columns by matching species 'State' against the index of `aqi`.
# A left join keeps every species row; states missing from `aqi` get NaN in the new columns.
species = pd.merge(species, aqi, how='left', left_on='State', right_index=True)
species.head(n=5)

Unnamed: 0,Scientific Name,Common Name,Group,Federal Listing Status,VIP,State,Total Land Area (Thousands of Acres),Forest Land Area (Thousands of Acres),Days with AQI,Good Days
0,Accipiter gentilis,Northern goshawk,Birds,Not Listed,V,AL,32413,22877,324,251
1,Acipenser fulvescens,Lake sturgeon,Fishes,Not Listed,V,AL,32413,22877,324,251
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Fishes,Threatened,V,AL,32413,22877,324,251
3,Agarodes alabamensis,[Unnamed] caddisfly,Insects,Not Listed,I,AL,32413,22877,324,251
4,Agrimonia incisa,Incised groovebur,Flowering Plants,Not Listed,P,AL,32413,22877,324,251


### Create Dummy Variables for Categorical Data

In [8]:
# One-hot encode the categorical 'Group', 'VIP' and 'State' columns.
# dtype is pinned to uint8 (the default before pandas 2.0) so the indicator
# columns stay 0/1 on every pandas version; from pandas 2.0 the default is
# bool, which would produce True/False columns instead.
species = pd.get_dummies(data=species, columns=['Group', 'VIP', 'State'], dtype='uint8')
# Report the (rows, columns) shape after encoding, then preview the frame.
print(species.shape)
species.head()

(11347, 79)


Unnamed: 0,Scientific Name,Common Name,Federal Listing Status,Total Land Area (Thousands of Acres),Forest Land Area (Thousands of Acres),Days with AQI,Good Days,Group_Amphibians,Group_Annelid Worms,Group_Arachnids,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,Accipiter gentilis,Northern goshawk,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Acipenser fulvescens,Lake sturgeon,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Threatened,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Agarodes alabamensis,[Unnamed] caddisfly,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agrimonia incisa,Incised groovebur,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Remove the dummy columns of the 9 species groups whose representation is below 1%.
rare_group_columns = [
    'Group_Annelid Worms',
    'Group_Arachnids',
    'Group_Conifers and Cycads',
    'Group_Corals',
    'Group_Flatworms and Roundworms',
    'Group_Hydroids',
    'Group_Lichens',
    'Group_Millipedes',
    'Group_Sponges',
]
species = species.drop(columns=rare_group_columns)
species.head(n=5)

Unnamed: 0,Scientific Name,Common Name,Federal Listing Status,Total Land Area (Thousands of Acres),Forest Land Area (Thousands of Acres),Days with AQI,Good Days,Group_Amphibians,Group_Birds,Group_Clams,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,Accipiter gentilis,Northern goshawk,Not Listed,32413,22877,324,251,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Acipenser fulvescens,Lake sturgeon,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Threatened,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Agarodes alabamensis,[Unnamed] caddisfly,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agrimonia incisa,Incised groovebur,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Remove the dummy columns of the 22 states whose representation is below 1%.
rare_state_columns = [
    'State_AK', 'State_CT', 'State_DE', 'State_IA', 'State_KS',
    'State_LA', 'State_MA', 'State_MD', 'State_ME', 'State_MI',
    'State_MN', 'State_MT', 'State_ND', 'State_NE', 'State_NH',
    'State_NJ', 'State_OH', 'State_OK', 'State_RI', 'State_SD',
    'State_VT', 'State_WI',
]
species = species.drop(columns=rare_state_columns)
species.head(n=5)

Unnamed: 0,Scientific Name,Common Name,Federal Listing Status,Total Land Area (Thousands of Acres),Forest Land Area (Thousands of Acres),Days with AQI,Good Days,Group_Amphibians,Group_Birds,Group_Clams,...,State_OR,State_PA,State_SC,State_TN,State_TX,State_UT,State_VA,State_WA,State_WV,State_WY
0,Accipiter gentilis,Northern goshawk,Not Listed,32413,22877,324,251,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Acipenser fulvescens,Lake sturgeon,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Acipenser oxyrinchus (=oxyrhynchus) desotoi,Atlantic sturgeon (Gulf subspecies),Threatened,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Agarodes alabamensis,[Unnamed] caddisfly,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agrimonia incisa,Incised groovebur,Not Listed,32413,22877,324,251,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Save Engineered Data

In [11]:
# Persist the engineered species table as a pickle in the project's data directory.
species_pickle = "../Data/species.pkl"
species.to_pickle(species_pickle)