In [1]:
import pandas as pd
import requests as r
import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point

In [2]:
airports = pd.read_csv("./airports.csv")

In [3]:
airports[airports.LATITUDE.isna()]

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
96,ECP,Northwest Florida Beaches International Airport,Panama City,FL,USA,,
234,PBG,Plattsburgh International Airport,Plattsburgh,NY,USA,,
313,UST,Northeast Florida Regional Airport (St. August...,St. Augustine,FL,USA,,


In [4]:
# Fix ECP
airports.loc[96,["LATITUDE", "LONGITUDE"]] = (30.3548543,-85.8017021)
# Fix PBG
airports.loc[234,["LATITUDE", "LONGITUDE"]] = (44.6520597,-73.470109)
# Fix UST
airports.loc[313,["LATITUDE", "LONGITUDE"]] = (29.95439,-81.3450803)

In [5]:
try:
    weather = pd.read_csv("./weather_data_unprocessed.csv")  # pd.DataFrame()
except:
    df = pd.DataFrame(columns=[
        "station",
        "valid",
        "tmpc",
        "sknt",
        "p01m",
        "vsby",
        "gust",
        "skyc1",
        "skyc2",
        "skyc3",
        "wxcodes",
        "ice_accretion_6hr",
        "snowdepth"
    ])

    for code in airports.IATA_CODE:
        url = f"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station={code}&data=tmpc&data=sknt&data=p01m&data=vsby&data=gust&data=skyc1&data=skyc2&data=skyc3&data=wxcodes&data=ice_accretion_6hr&data=snowdepth&year1=2015&month1=1&day1=1&year2=2015&month2=8&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=empty&trace=T&direct=no&report_type=1&report_type=2"
        df = pd.read_csv(url)
        weather = weather.append(df)
    weather.to_csv("./weather_data_unprocessed.csv")

In [6]:
weather

Unnamed: 0.1,Unnamed: 0,station,valid,tmpc,sknt,p01m,vsby,gust,skyc1,skyc2,skyc3,wxcodes,ice_accretion_6hr,snowdepth
0,0,ABE,2015-01-01 00:51,-6.7,4.0,0.00,10.0,,CLR,,,,,
1,1,ABE,2015-01-01 01:51,-5.6,3.0,0.00,10.0,,CLR,,,,,
2,2,ABE,2015-01-01 02:51,-6.1,3.0,0.00,10.0,,CLR,,,,,
3,3,ABE,2015-01-01 03:51,-6.1,0.0,0.00,10.0,,CLR,,,,,
4,4,ABE,2015-01-01 04:51,-7.2,3.0,0.00,10.0,,CLR,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1948947,6488,XNA,2015-07-31 19:53,30.6,5.0,0.00,10.0,,CLR,,,,,
1948948,6489,XNA,2015-07-31 20:53,31.1,4.0,0.00,10.0,,CLR,,,,,
1948949,6490,XNA,2015-07-31 21:53,30.6,4.0,0.00,10.0,,CLR,,,,,
1948950,6491,XNA,2015-07-31 22:53,30.6,5.0,0.00,10.0,,CLR,,,,,


In [7]:
weather["valid"] = pd.to_datetime(weather["valid"])
weather

Unnamed: 0.1,Unnamed: 0,station,valid,tmpc,sknt,p01m,vsby,gust,skyc1,skyc2,skyc3,wxcodes,ice_accretion_6hr,snowdepth
0,0,ABE,2015-01-01 00:51:00,-6.7,4.0,0.00,10.0,,CLR,,,,,
1,1,ABE,2015-01-01 01:51:00,-5.6,3.0,0.00,10.0,,CLR,,,,,
2,2,ABE,2015-01-01 02:51:00,-6.1,3.0,0.00,10.0,,CLR,,,,,
3,3,ABE,2015-01-01 03:51:00,-6.1,0.0,0.00,10.0,,CLR,,,,,
4,4,ABE,2015-01-01 04:51:00,-7.2,3.0,0.00,10.0,,CLR,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1948947,6488,XNA,2015-07-31 19:53:00,30.6,5.0,0.00,10.0,,CLR,,,,,
1948948,6489,XNA,2015-07-31 20:53:00,31.1,4.0,0.00,10.0,,CLR,,,,,
1948949,6490,XNA,2015-07-31 21:53:00,30.6,4.0,0.00,10.0,,CLR,,,,,
1948950,6491,XNA,2015-07-31 22:53:00,30.6,5.0,0.00,10.0,,CLR,,,,,


In [8]:
weather.gust = weather.gust.fillna(0)
weather.ice_accretion_6hr = weather.ice_accretion_6hr.fillna(0)
weather.loc[weather.ice_accretion_6hr=='T',"ice_accretion_6hr"].ice_accretion_6hr = 0
weather = weather.join(weather.wxcodes.str.split(' ', expand=True))
weather = weather.drop(columns="snowdepth")

In [9]:
# Creating a geometry column for all airports
geometry = [Point(xy) for xy in zip(airports['LONGITUDE'], airports['LATITUDE'])]
# Creating a Geographic data frame for all airports
gpd1 = gpd.GeoDataFrame(airports, geometry=geometry).reset_index(drop=True)

# Get all the airports for which we couldn't retrieve any data
missing_airports = set(airports.IATA_CODE.unique()) - set(weather.station.unique())
# Create geo data frame for the missing airports
gpd2 = gpd1[gpd1["IATA_CODE"].isin(missing_airports)].reset_index(drop=True)
# Change the column name to be identifiable later
gpd2["MISSING_IATA_CODE"] = gpd2["IATA_CODE"]
gpd2 = gpd2.drop(columns="IATA_CODE")

In [10]:
def ckdnearest(gdA, gdB):
    """
    Function to compute pairwise distances between all points in gdA and gdB
    Found in: https://gis.stackexchange.com/a/301935
    """
    nA = np.array(list(gdA.geometry.apply(lambda x: (x.x, x.y))))
    nB = np.array(list(gdB.geometry.apply(lambda x: (x.x, x.y))))
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    gdB_nearest = gdB.iloc[idx].drop(columns="geometry").reset_index(drop=True)
    gdf = pd.concat(
        [
            gdA.reset_index(drop=True),
            gdB_nearest,
            pd.Series(dist, name='dist')
        ], 
        axis=1)

    return gdf

In [11]:
# Find the closest airport to the missing ones
distance_matrix = ckdnearest(gpd2, gpd1[~gpd1.IATA_CODE.isin(missing_airports)]).sort_values('dist')
airport_mapping = distance_matrix[["MISSING_IATA_CODE", "IATA_CODE"]]
airport_mapping

Unnamed: 0,MISSING_IATA_CODE,IATA_CODE
8,CLD,SAN
33,UST,JAX
20,MQT,ESC
28,SCE,MDT
11,FCA,MSO
36,YUM,PSP
18,KTN,BLI
34,WRG,BLI
26,PSG,BLI
5,BQN,MIA


In [12]:
df_missing_airports = pd.DataFrame()
for airports in airport_mapping.itertuples():
    missing_airport = airports[1]
    closest_airport = airports[2]
    closest_airport = weather[weather.station==closest_airport]
    closest_airport.station = missing_airport
    df_missing_airports = df_missing_airports.append(closest_airport)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [13]:
weather = weather.append(df_missing_airports)

In [14]:
weather.to_csv("weather_for_all_airports.csv")

In [15]:
weather.valid = weather.valid.dt.round('1h')

# Collocate with Flights

In [16]:
flights_train = pd.read_csv("./train_emiel_v5.csv")
flights_test = pd.read_csv("./test_emiel_v5.csv")

In [17]:
flights_train.SCHEDULED_DEPARTURE_DATETIME = pd.to_datetime(flights_train.SCHEDULED_DEPARTURE_DATETIME)
flights_test.SCHEDULED_DEPARTURE_DATETIME = pd.to_datetime(flights_test.SCHEDULED_DEPARTURE_DATETIME)

In [18]:
weather

Unnamed: 0.1,Unnamed: 0,station,valid,tmpc,sknt,p01m,vsby,gust,skyc1,skyc2,skyc3,wxcodes,ice_accretion_6hr,0,1,2,3
0,0,ABE,2015-01-01 01:00:00,-6.7,4.0,0.00,10.0,0.0,CLR,,,,0,,,,
1,1,ABE,2015-01-01 02:00:00,-5.6,3.0,0.00,10.0,0.0,CLR,,,,0,,,,
2,2,ABE,2015-01-01 03:00:00,-6.1,3.0,0.00,10.0,0.0,CLR,,,,0,,,,
3,3,ABE,2015-01-01 04:00:00,-6.1,0.0,0.00,10.0,0.0,CLR,,,,0,,,,
4,4,ABE,2015-01-01 05:00:00,-7.2,3.0,0.00,10.0,0.0,CLR,,,,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1718742,5563,PPG,2015-07-31 20:00:00,20.6,16.0,0.00,10.0,0.0,FEW,BKN,,,0,,,,
1718743,5564,PPG,2015-07-31 21:00:00,21.7,16.0,0.00,10.0,0.0,FEW,BKN,,,0,,,,
1718744,5565,PPG,2015-07-31 22:00:00,22.2,18.0,0.00,10.0,0.0,FEW,BKN,,,0,,,,
1718745,5566,PPG,2015-07-31 23:00:00,21.7,18.0,0.00,10.0,0.0,FEW,SCT,BKN,,0,,,,


In [19]:
flights_train.SCHEDULED_DEPARTURE_DATETIME = flights_train.SCHEDULED_DEPARTURE_DATETIME.dt.round('1h') 
flights_test.SCHEDULED_DEPARTURE_DATETIME = flights_test.SCHEDULED_DEPARTURE_DATETIME.dt.round('1h')

In [None]:
flights_train = flights_train.merge(weather, how="left", left_on=["SCHEDULED_DEPARTURE_DATETIME", "ORIGIN_AIRPORT"], right_on=["valid","station"], suffixes=("_departure","_departure"))
flights_test = flights_test.merge(weather, how="left", left_on=["SCHEDULED_DEPARTURE_DATETIME", "ORIGIN_AIRPORT"], right_on=["valid","station"], suffixes=("_departure","_departure"))

In [None]:
flights_train.to_csv('./flights_train_with_weather.csv')
flights_test.to_csv('./flights_test_with_weather.csv')