Use function *multi_merge(train,test,list_of_lag_days,list_of_feature_lists)* to create variations of our datasets.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import haversine as hv
from datetime import timedelta

In [5]:
STATIONS = {
    1 : (41.995,-87.933),
    2 : (41.786,-87.752)
}

In [6]:
def nearest_station(in_coords):
    
    dist = {k:hv.haversine(in_coords,v) for k,v in STATIONS.items()}
    
    return min(dist, key=dist.get)


In [19]:
train = pd.read_csv(r'.\data\train.csv')
test = pd.read_csv(r'.\data\test.csv')
train['nearest_station'] = train.apply(lambda x: nearest_station([x.Latitude, x.Longitude]), axis=1)
test['nearest_station'] = test.apply(lambda x: nearest_station([x.Latitude, x.Longitude]), axis=1)
train.Date = train.Date.astype('datetime64[ns]')
test.Date = test.Date.astype('datetime64[ns]')

In [11]:
weather = pd.read_csv(r'.\data\weather_cleaned_stack_back.csv')
weather.drop(columns='Unnamed: 0',inplace=True)
weather.Date = weather.Date.astype('datetime64[ns]')

In [45]:
temp_list=['Tmax','Tmin','Tavg','Depart','DewPoint','WetBulb','Cool']
rain_list=['PrecipTotal']
day_list=['Sunset','DaylightHrs']

In [61]:
train.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'nearest_station', 'date_lag'],
      dtype='object')

In [77]:
def to_merge(train,test,lag,feature_list):
    train['date_lag'] = train.Date.map(lambda x : x - timedelta(days=lag))
    test['date_lag'] = test.Date.map(lambda x : x - timedelta(days=lag))
    feature_list=feature_list+['Date','Station']
    train_weather = train.merge(weather[feature_list],left_on=['date_lag','nearest_station'],right_on=['Date','Station'])
    test_weather = test.merge(weather[feature_list],left_on=['date_lag','nearest_station'],right_on=['Date','Station'])
    train_weather.drop(['Date_y','Station'],axis=1,inplace=True)
    train_weather.rename({'Date_x':'Date'},axis=1,inplace=True)
    test_weather.drop(['Date_y','Station'],axis=1,inplace=True)
    test_weather.rename({'Date_x':'Date'},axis=1,inplace=True)
    return(train_weather,test_weather)

In [78]:
train_temp,test_temp=to_merge(train,test,3,temp_list)
train_temp.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'nearest_station', 'date_lag', 'Tmax',
       'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Cool'],
      dtype='object')

In [79]:
def multi_merge(train,test,lag_list,list_of_lists):
    if len(lag_list)!=len(list_of_lists):
        print('Mismatch in list lengths')
        return None
    else:
        for i in range(len(lag_list)):
            train,test=to_merge(train,test,lag_list[i],list_of_lists[i])
        return(test,train)

In [80]:
lag_list=[0,3,11]
feat_list=[day_list,temp_list,rain_list]
train,test=multi_merge(train,test,lag_list,feat_list)

In [81]:
train.columns

Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'nearest_station', 'date_lag', 'Sunset', 'DaylightHrs', 'Tmax', 'Tmin',
       'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Cool', 'PrecipTotal'],
      dtype='object')

In [None]:
train_weather['month'] = train_weather.Date.map(lambda x : x.month)
test_weather['month'] = test_weather.Date.map(lambda x : x.month)

In [111]:
cols = ['month','Species','NumMosquitos','Sunset', 'Street',
       'DaylightHrs', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb',
       'Cool', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed','WnvPresent']

In [113]:
train_weather = train_weather[cols]
test_weather = test_weather[['month','Species','Sunset','Street',
       'DaylightHrs', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb',
       'Cool', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed']]

## Export CSVs

In [114]:
train_weather.to_csv('train_weather.csv',index=False)
test_weather.to_csv('test_weather.csv',index=False)