In [2]:
import pandas as pd
import numpy as np

import requests
import json
from datetime import datetime

**Import df_cleaned.csv**

In [3]:
df_cleaned = pd.read_csv('df_cleaned.csv')

In [4]:
len(df_cleaned)

11675

In [5]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,Record ID,Incident Year,Incident Month,Incident Day,Operator ID,Operator,Aircraft,Aircraft Type,Aircraft Make,...,Flight Phase,Visibility,Precipitation,Height,Speed,Distance,Species ID,Species Name,Species Quantity,Flight Impact
0,106753,300045,2010,1,1,BUS,BUSINESS,C-310,A,226,...,TAKEOFF RUN,DAY,NONE,0.0,30.0,0.0,NE1,GULL,1,ABORTED TAKEOFF
1,106754,300390,2010,1,1,JBU,JETBLUE AIRWAYS,A-320,A,04A,...,TAKEOFF RUN,DAY,RAIN,0.0,120.0,0.0,ZT3,GRACKLE,1,NONE
2,106764,300701,2010,1,2,SKW,SKYWEST AIRLINES,CRJ100/200,A,188,...,LANDING ROLL,DAY,NONE,0.0,80.0,0.0,YH004,HORNED LARK,1,NONE
3,106765,300186,2010,1,2,NWA,NORTHWEST AIRLINES,DC-9,A,583,...,TAKEOFF RUN,DAY,NONE,0.0,140.0,0.0,K33,HAWK,1,NONE
4,106769,301152,2010,1,2,JBU,JETBLUE AIRWAYS,A-320,A,04A,...,APPROACH,DAY,NONE,20.0,120.0,0.0,YI010,TREE SWALLOW,1,NONE


**Create birdstrike count for airports and save as csv**

In [6]:
airport_counts = df_cleaned.groupby('Airport ID').size().sort_values(ascending=False)
df_airports = pd.DataFrame({'Airport ID':airport_counts.index, 'Birdstrikes_N':airport_counts.values})

In [7]:
df_airports.head()

Unnamed: 0,Airport ID,Birdstrikes_N
0,KDFW,402
1,KSMF,337
2,KDEN,293
3,KPHL,286
4,KBNA,258


**Import airport_codes_csv (downloaded from internet)**

In [8]:
airport_codes = pd.read_csv('airport-codes_csv.csv')

In [9]:
airport_codes.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [10]:
# airport_codes[airport_codes['ident'] == 'KJAX']
# name municipality gps_code coordinates

In [11]:
#df_merged = pd.merge(airports, airport_codes, how = 'left', left_on = 'Airport ID', right_on = 'ident')

In [12]:
#df_merged['long'], df_merged['lat'] = df_merged['coordinates'].str.split(',', 1).str

In [13]:
#len(df_merged)

In [14]:
#pd.set_option('display.max_rows', None)

In [15]:
len(airport_codes)

57421

**Merging df_airports and airport_codes to get long/lat values for airport IDs that only our data has**

In [16]:
df_merged_4 = pd.merge(df_airports, airport_codes, how = 'inner', left_on = 'Airport ID', right_on = 'ident')

In [17]:
df_merged_4['long'], df_merged_4['lat'] = df_merged_4['coordinates'].str.split(',', 1).str

  df_merged_4['long'], df_merged_4['lat'] = df_merged_4['coordinates'].str.split(',', 1).str


In [18]:
df_merged_4['lat'] = df_merged_4['lat'].astype(str)
df_merged_4['long'] = df_merged_4['long'].astype(str)

In [19]:
df_merged_4.head()

Unnamed: 0,Airport ID,Birdstrikes_N,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,long,lat
0,KDFW,402,KDFW,large_airport,Dallas Fort Worth International Airport,607.0,,US,US-TX,Dallas-Fort Worth,KDFW,DFW,DFW,"-97.038002, 32.896801",-97.038002,32.896801
1,KSMF,337,KSMF,large_airport,Sacramento International Airport,27.0,,US,US-CA,Sacramento,KSMF,SMF,SMF,"-121.59100341796875, 38.69540023803711",-121.59100341796876,38.69540023803711
2,KDEN,293,KDEN,large_airport,Denver International Airport,5431.0,,US,US-CO,Denver,KDEN,DEN,DEN,"-104.672996521, 39.861698150635",-104.672996521,39.861698150635
3,KPHL,286,KPHL,large_airport,Philadelphia International Airport,36.0,,US,US-PA,Philadelphia,KPHL,PHL,PHL,"-75.24109649658203, 39.87189865112305",-75.24109649658203,39.87189865112305
4,KBNA,258,KBNA,large_airport,Nashville International Airport,599.0,,US,US-TN,Nashville,KBNA,BNA,BNA,"-86.6781997680664, 36.1245002746582",-86.6781997680664,36.1245002746582


**Create empty zips with lat/long df**

In [20]:
column_names = ["lat_", "long_", "zipcode"]
df_w_zip = pd.DataFrame(columns = column_names)

**Import zipcodes via API**

In [21]:
Token = '546a860b4a7a4d0697f2fe7f66f1e7e5'

In [22]:
# initially, list was used to store zip codes, then switched to df
# zip_codes_list = []

In [23]:
for index, row in df_merged_4.iterrows():
    #make the api call
    response = requests.get('https://api.bigdatacloud.net/data/reverse-geocode?latitude='+row.lat+'&longitude='+row.long+'&localityLanguage=en&key='+Token)  
    #load the api response as a json
    res_json = json.loads(response.text)
    if res_json['postcode'] == None:
        continue
    else:
        new_row = {'lat_':row.lat, 'long_': row.long, 'zipcode': res_json['postcode']}
        # print(new_row)
        df_w_zip = df_w_zip.append(new_row, ignore_index=True)

In [24]:
len(df_w_zip)

802

In [25]:
df_w_zip.head()

Unnamed: 0,lat_,long_,zipcode
0,32.896801,-97.038002,76051
1,38.69540023803711,-121.59100341796876,95837
2,39.861698150635,-104.672996521,80249
3,39.87189865112305,-75.24109649658203,19113
4,36.1245002746582,-86.6781997680664,37214


In [26]:
# this csv was not helpful
# zip_codes = pd.read_csv('us-zip-code-latitude-and-longitude.csv', sep=';')

**Join all dates together into one column**

In [27]:
df_cleaned["incident_date"] = df_cleaned["Incident Year"].astype(str) + "-" + df_cleaned["Incident Month"].astype(str) + "-" + df_cleaned["Incident Day"].astype(str)
df_cleaned["incident_date"] = pd.to_datetime(df_cleaned['incident_date'], format= '%Y-%m-%d').astype(str)
# date_list = pd.DataFrame(df_cleaned['incident_date'])
# date_list['incident_date'] = date_list['incident_date'].astype(str)

In [28]:
len(df_cleaned)

11675

In [29]:
#len(df_cleaned[df_cleaned['incident_date'].str.contains("2010")])

In [30]:
df_cleaned_coord = pd.merge(df_cleaned, df_merged_4, how = 'left', left_on = 'Airport ID', right_on = 'Airport ID')

In [31]:
df_cleaned_coord.head()

Unnamed: 0.1,Unnamed: 0,Record ID,Incident Year,Incident Month,Incident Day,Operator ID,Operator,Aircraft,Aircraft Type,Aircraft Make,...,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,long,lat
0,106753,300045,2010,1,1,BUS,BUSINESS,C-310,A,226,...,,US,US-FL,Pompano Beach,KPMP,PPM,PMP,"-80.111099243164, 26.247100830078",-80.111099243164,26.247100830078
1,106754,300390,2010,1,1,JBU,JETBLUE AIRWAYS,A-320,A,04A,...,,US,US-FL,Orlando,KMCO,MCO,MCO,"-81.30899810791016, 28.429399490356445",-81.30899810791016,28.429399490356445
2,106764,300701,2010,1,2,SKW,SKYWEST AIRLINES,CRJ100/200,A,188,...,,US,US-UT,Salt Lake City,KSLC,SLC,SLC,"-111.97799682617188, 40.78839874267578",-111.97799682617188,40.78839874267578
3,106765,300186,2010,1,2,NWA,NORTHWEST AIRLINES,DC-9,A,583,...,,US,US-MD,Baltimore,KBWI,BWI,BWI,"-76.668297, 39.1754",-76.668297,39.1754
4,106769,301152,2010,1,2,JBU,JETBLUE AIRWAYS,A-320,A,04A,...,,US,US-FL,Orlando,KMCO,MCO,MCO,"-81.30899810791016, 28.429399490356445",-81.30899810791016,28.429399490356445


In [32]:
df_cleaned_coord = df_cleaned_coord.drop(columns=['continent','iso_country','iso_region','gps_code','iata_code','local_code'])

In [33]:
df_cleaned_coord.columns

Index(['Unnamed: 0', 'Record ID', 'Incident Year', 'Incident Month',
       'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type',
       'Aircraft Make', 'Aircraft Model', 'Airport ID', 'Airport', 'State',
       'FAA Region', 'Flight Phase', 'Visibility', 'Precipitation', 'Height',
       'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity',
       'Flight Impact', 'incident_date', 'Birdstrikes_N', 'ident', 'type',
       'name', 'elevation_ft', 'municipality', 'coordinates', 'long', 'lat'],
      dtype='object')

In [34]:
df_zip_date = pd.merge(df_cleaned_coord, df_w_zip, how = 'left', left_on = ['long', 'lat'], right_on = ['long_', 'lat_'])

In [35]:
df_zip_date.head()

Unnamed: 0.1,Unnamed: 0,Record ID,Incident Year,Incident Month,Incident Day,Operator ID,Operator,Aircraft,Aircraft Type,Aircraft Make,...,type,name,elevation_ft,municipality,coordinates,long,lat,lat_,long_,zipcode
0,106753,300045,2010,1,1,BUS,BUSINESS,C-310,A,226,...,small_airport,Pompano Beach Airpark,19.0,Pompano Beach,"-80.111099243164, 26.247100830078",-80.111099243164,26.247100830078,26.247100830078,-80.111099243164,33060
1,106754,300390,2010,1,1,JBU,JETBLUE AIRWAYS,A-320,A,04A,...,large_airport,Orlando International Airport,96.0,Orlando,"-81.30899810791016, 28.429399490356445",-81.30899810791016,28.429399490356445,28.429399490356445,-81.30899810791016,32827
2,106764,300701,2010,1,2,SKW,SKYWEST AIRLINES,CRJ100/200,A,188,...,large_airport,Salt Lake City International Airport,4227.0,Salt Lake City,"-111.97799682617188, 40.78839874267578",-111.97799682617188,40.78839874267578,40.78839874267578,-111.97799682617188,84116
3,106765,300186,2010,1,2,NWA,NORTHWEST AIRLINES,DC-9,A,583,...,large_airport,Baltimore/Washington International Thurgood Ma...,146.0,Baltimore,"-76.668297, 39.1754",-76.668297,39.1754,39.1754,-76.668297,21240
4,106769,301152,2010,1,2,JBU,JETBLUE AIRWAYS,A-320,A,04A,...,large_airport,Orlando International Airport,96.0,Orlando,"-81.30899810791016, 28.429399490356445",-81.30899810791016,28.429399490356445,28.429399490356445,-81.30899810791016,32827


In [36]:
df_zip_date['zipcode'].isna().sum()

73

In [37]:
df_zip_date = df_zip_date[df_zip_date["zipcode"] != ""]

In [38]:
len(df_zip_date)

11597

In [39]:
df_zip_date.columns

Index(['Unnamed: 0', 'Record ID', 'Incident Year', 'Incident Month',
       'Incident Day', 'Operator ID', 'Operator', 'Aircraft', 'Aircraft Type',
       'Aircraft Make', 'Aircraft Model', 'Airport ID', 'Airport', 'State',
       'FAA Region', 'Flight Phase', 'Visibility', 'Precipitation', 'Height',
       'Speed', 'Distance', 'Species ID', 'Species Name', 'Species Quantity',
       'Flight Impact', 'incident_date', 'Birdstrikes_N', 'ident', 'type',
       'name', 'elevation_ft', 'municipality', 'coordinates', 'long', 'lat',
       'lat_', 'long_', 'zipcode'],
      dtype='object')

In [40]:
df_zip_date_2010 = df_zip_date[df_zip_date['incident_date'].str.contains("2010")]

In [41]:
df_zip_date_2010['zipcode'] = df_zip_date_2010['zipcode'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_zip_date_2010['zipcode'] = df_zip_date_2010['zipcode'].astype(str)


In [42]:
column_names = ["date", "station", "zipcode", "value"]

df_w_weather = pd.DataFrame(columns = column_names)

In [43]:
#Token_w = 'EHZroSHTsmfKuOMeOWRkrDpMPWoZTXmb'

In [44]:
#for index,row in df_zip_date_2010.iterrows():
#     response = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&locationid=ZIP:'+row.zipcode+'&datatypeid=TOBS&startdate='+row.incident_date+'&enddate='+row.incident_date, 
#                             headers={'token':Token_w})
#     res_json = json.loads(response.text)
#    if res_json != {}:
#       new_row = {'date': row.incident_date, 'station': res_json['results'][0]['station'], 'zipcode': row.zipcode, 'value': res_json['results'][0]['value']}
#        df_w_weather = df_w_weather.append(new_row, ignore_index=True)
#        print(new_row)

From the documentation:

ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

TOBS = Temperature at the time of observation (tenths of degrees C). We can divide results by 10

In [45]:
df_w_weather.to_csv('df_w_weather.csv', index=False)

In [46]:
# 1) we should connect df_w_zip to df_merged;
df_w_coord = pd.merge(df_merged, df_w_zip, how = 'left', left_on = ["lat","long"], right_on = ["lat_", "long_"])
df_w_coord.drop(['local_code','lat_','long_'], axis = 1, inplace = True)

NameError: name 'df_merged' is not defined

In [None]:
df_w_coord.head()

In [None]:
start_date = pd.date_range(start="2015-01-01",end="2020-01-01", freq = 'AS')
end_date = pd.date_range(start="2015-12-31",end="2020-12-31", freq = 'A')

In [None]:
start_date = pd.Series(start_date.format(), name = "start_date")
end_date = pd.Series(end_date.format(), name = "end_date")

In [None]:
df_w_date = pd.concat([start_date, end_date], axis = 1)
df_w_date.head()

In [None]:
Token = 'pgiGvZTCXlwUolZMKeeMTjEnzJTFCPLW'
response = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&locationid=ZIP:28801&startdate=2015-01-01&enddate=2015-12-31', headers={'token':Token})
res_json = json.loads(response.text)
res_json = res_json['results']
res_json[1]['date']

In [None]:
res_json[0]

In [None]:
dates_temp = []
dates_prcp = []
temps = []
prcp = []

# 2) get weather data based on zip
#for index, row, date_index, date_row in zip(df_merged.iterrows(),df_w_date.iterrows()):
for index,row in df_w_zip.iterrows():
    for date_index,date_row in df_w_date.iterrows():
        #make the api call
        response = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&locationid=ZIP:'+row.zipcode+'&startdate='+date_row.start_date+'&enddate='+date_row.end_date, headers={'token':Token})
        res_json = json.loads(response.text)
        #get all items in the response which are average temperature readings
        avg_temps = [item for item in res_json['results'] if item['datatype']=='TAVG']
        #get the date field from all average temperature readings
        dates_temp += [item['date'] for item in avg_temps]
        #get the actual average temperature from all average temperature readings
        temps += [item['value'] for item in avg_temps]

In [None]:
#initialize lists to store data
dates_temp = []
dates_prcp = []
temps = []
prcp = []

#for each year from 2015-2019 ...
for year in range(2015, 2019):
    year = str(year)
    print('working on year '+year)
    
    #make the api call
    r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00023129&startdate='+year+'-01-01&enddate='+year+'-12-31', headers={'token':Token})
    #load the api response as a json
    d = json.loads(r.text)
    if d == None:
        continue
    else:
        #get all items in the response which are average temperature readings
        avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']
        #get the date field from all average temperature readings
        dates_temp += [item['date'] for item in avg_temps]
        #get the actual average temperature from all average temperature readings
        temps += [item['value'] for item in avg_temps]

In [None]:
df_temp = pd.DataFrame()

#populate date and average temperature fields (cast string date to datetime and convert temperature from tenths of Celsius to Fahrenheit)
df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temp['avgTemp'] = [float(v)/10.0*1.8 + 32 for v in temps]

In [None]:
df_temp