# IMPORT Dependencies


In [1]:
import pandas as pd
from sqlalchemy import create_engine
import difflib as dl

In [2]:
# Make PyMySQL importable under the MySQLdb name (this is what install_as_MySQLdb does
# per the PyMySQL docs). Presumably needed by a SQLAlchemy MySQL engine created later
# in the notebook — TODO confirm against the cell that calls create_engine.
import pymysql
pymysql.install_as_MySQLdb()

# Create Species Key

In [3]:
# Load the per-state population estimate table; the species key is derived from it
csv_file = "Data/Pop.EstByState.csv"
spec_df = pd.read_csv(filepath_or_buffer=csv_file)

In [4]:
# Reduce the table to the three species-describing columns, collapse repeated rows,
# and expose the sequence number under the name SPECIES_ID
spec_df = (
    spec_df[['Sequence AOS 59', 'English Name', 'Scientific Name']]
    .drop_duplicates()
    .rename(columns={'Sequence AOS 59': 'SPECIES_ID'})
)
# The species key is the same table indexed by SPECIES_ID
species_key = spec_df.set_index('SPECIES_ID')
species_key.head()


Unnamed: 0_level_0,English Name,Scientific Name
SPECIES_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
81,Plain Chachalaca,Ortalis vetula
94,Mountain Quail,Oreortyx pictus
99,Northern Bobwhite,Colinus virginianus
102,Scaled Quail,Callipepla squamata
104,California Quail,Callipepla californica


In [5]:
# Load the population estimate table again, this time as the source for the
# Illinois population figures
csv_file = "Data/Pop.EstByState.csv"
pop_df = pd.read_csv(filepath_or_buffer=csv_file)

In [6]:
# One pass over the raw table:
#   1. keep Illinois rows and only the necessary columns
#   2. drop species that have no population estimate
#   3. rename the sequence column to SPECIES_ID
pop_df = (
    pop_df.loc[
        pop_df['Province / State / Territory'] == 'IL',
        [
            'Sequence AOS 59',
            'Province / State / Territory',
            'Population Estimate',
            'Lower 95% bound',
            'Upper 95% bound',
            'Estimated % of Global Population',
            'Estimated % of USA/Canada Population',
            'Median Estimate',
            'Lower 80% bound',
            'Upper 80% bound',
        ],
    ]
    .dropna(subset=['Population Estimate'])
    .rename(columns={'Sequence AOS 59': 'SPECIES_ID'})
)

In [7]:
# Index the Illinois population table by SPECIES_ID
bird_populations = pop_df.set_index('SPECIES_ID')

In [8]:
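# TODO: add a column for % of state population (the draft below is disabled).
# NOTE(review): the draft refers to `df`, which is not defined in the cells above —
# presumably `bird_populations` was intended; confirm before enabling.
#tot_pop = df['Population Estimate'].sum()
#print(tot_pop)

#df['Estimated % of State Population'] = df['Population Estimate'].sum(axis=1) / tot_pop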

In [9]:
# Preview the first five rows of the indexed population table
bird_populations.head(n=5)

Unnamed: 0_level_0,Province / State / Territory,Population Estimate,Lower 95% bound,Upper 95% bound,Estimated % of Global Population,Estimated % of USA/Canada Population,Median Estimate,Lower 80% bound,Upper 80% bound
SPECIES_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
119,IL,290,0,1200,0.0%,0.0%,160,0,820
123,IL,190000,110000,310000,0.3%,1.2%,180000,130000,270000
149,IL,290000,180000,450000,0.2%,1.8%,280000,200000,400000
165,IL,120000,70000,200000,0.1%,1.4%,120000,80000,180000
198,IL,3200000,2100000,4800000,2.1%,2.4%,3000000,2300000,4400000


# CLEAN UP BIRD STRIKE DATA

In [10]:
# Read the Illinois airplane bird-strike CSV into a pandas dataframe.
# The file is decoded as ISO-8859-1 (Latin-1). The codec name must be spelled with a
# plain ASCII hyphen: a typographic dash in the name only resolves by accident of
# Python's codec-name normalisation and should not be relied on.
csv_file = "Data/IL_airplane_strikes_data_2000-2016.csv"
strike_df = pd.read_csv(csv_file, encoding='ISO-8859-1')


In [11]:
# Restrict to the two Chicago airports (KORD, KMDW) ...
strike_df = strike_df[strike_df['AIRPORT_ID'].isin(['KORD', 'KMDW'])]
# ... and to incidents from 2004 through 2016 (both ends inclusive)
strike_df = strike_df[strike_df['INCIDENT_YEAR'].between(2004, 2016)]

# Keep only the necessary columns
bird_strikes = strike_df.loc[:, [
    'INCIDENT_DATE', 'INCIDENT_MONTH', 'INCIDENT_YEAR',
    'TIME_OF_DAY', 'TIME', 'AIRPORT', 'SPECIES', 'COST_REPAIRS',
    'EFFECT', 'SKY', 'PRECIP', 'BIRDS_SEEN', 'BIRDS_STRUCK', 'SIZE',
]]

bird_strikes.head()


Unnamed: 0,INCIDENT_DATE,INCIDENT_MONTH,INCIDENT_YEAR,TIME_OF_DAY,TIME,AIRPORT,SPECIES,COST_REPAIRS,EFFECT,SKY,PRECIP,BIRDS_SEEN,BIRDS_STRUCK,SIZE
0,12/29/2016,12,2016,,,CHICAGO O'HARE INTL ARPT,Snowy owl,,,,,,1,Large
1,12/26/2016,12,2016,Night,2000.0,CHICAGO O'HARE INTL ARPT,Unknown bird - large,,,No Cloud,,1,1,Large
2,12/22/2016,12,2016,,,CHICAGO MIDWAY INTL ARPT,Short-eared owl,,,,,,1,Small
3,12/18/2016,12,2016,Day,1701.0,CHICAGO O'HARE INTL ARPT,Mallard,,,,,,1,Medium
4,12/15/2016,12,2016,,1015.0,CHICAGO MIDWAY INTL ARPT,Canada goose,,Precautionary Landing,Some Cloud,,2 to 10,1,Medium


In [12]:
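# TODO: map strike SPECIES names onto the species key (draft below is disabled).
# Note that difflib.get_close_matches returns a list of candidates, so a single
# match would still have to be picked from each result.
#bird_strikes['SPECIES'] = bird_strikes['SPECIES'].apply(lambda x: dl.get_close_matches(x, species_key['English Name']))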



# CLEAN UP BUILDING (BLD) STRIKE DATA

In [13]:
# Load the Chicago bld-strike records
csv_file = "Data/Chicago_bld_strike_data_1978-2016.csv"
bld_strike_df = pd.read_csv(filepath_or_buffer=csv_file)
# Keep 2004-2016. 'Date' is compared against the bounds as text (no date parsing
# is done here), with both bounds inclusive.
bld_strike_df = bld_strike_df[bld_strike_df['Date'].between('2004-01-01', '2016-12-31')]

bld_strike_df.head()

Unnamed: 0,Genus,Species,Date,Locality
41,Ammodramus,nelsoni,2004-05-18,MP
42,Ammodramus,nelsoni,2004-10-02,MP
43,Ammodramus,nelsoni,2005-09-28,MP
44,Ammodramus,nelsoni,2006-09-20,MP
45,Ammodramus,nelsoni,2007-05-20,MP


# CLEAN UP FLIGHT DATA

In [14]:
# Load the per-airport flight tables: MDW first, then ORD
csv_file = "Data/MDW Flight Data 2004-2019.csv"
MDW_flights = pd.read_csv(filepath_or_buffer=csv_file)
csv_file = "Data/ORD Flight Data 2004-2019.csv"
ORD_flights = pd.read_csv(filepath_or_buffer=csv_file)

In [15]:
# Stack the MDW and ORD tables into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; concat with ignore_index=True gives the same result.
flights = pd.concat([MDW_flights, ORD_flights], ignore_index=True)

# Filter for years 2004-2016 (both ends inclusive)
flights = flights[(flights['year'] >= 2004) & (flights['year'] <= 2016)]

# Keep the necessary columns
chicago_flights = flights[['year', 'airport', 'airport_name', 'arr_flights']]

# Rename arr_flights to total_flights
chicago_flights = chicago_flights.rename(columns={'arr_flights': 'total_flights'})
chicago_flights.head()

Unnamed: 0,year,airport,airport_name,total_flights
0,2004,MDW,"Chicago, IL: Chicago Midway International",149.0
1,2004,MDW,"Chicago, IL: Chicago Midway International",120.0
2,2004,MDW,"Chicago, IL: Chicago Midway International",56.0
3,2004,MDW,"Chicago, IL: Chicago Midway International",155.0
4,2004,MDW,"Chicago, IL: Chicago Midway International",247.0
