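Use the function *multi_merge(train,test,list_of_lag_days,list_of_feature_lists)* to create variations of our datasets. Each feature list is merged using the lag (in days) found at the same position in the list of lag days, so both lists must have the same length.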

---

# Imports

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import haversine as hv
from datetime import timedelta

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

In [3]:
# set station coordinates
# Station id -> coordinate pair. The pairs are compared against the
# [Latitude, Longitude] lists built from the trap data, so they are kept in
# that same (latitude, longitude) order.
STATIONS = {
    1: (41.995, -87.933),
    2: (41.786, -87.752),
}

def nearest_station(in_coords):
    """Return the key of the STATIONS entry closest to ``in_coords``.

    Parameters
    ----------
    in_coords : sequence
        Coordinate pair handed to ``hv.haversine`` as the first point. It must
        use the same ordering as the values stored in ``STATIONS``.

    Returns
    -------
    The ``STATIONS`` key whose coordinates give the smallest haversine
    distance. On a tie the key that appears first in ``STATIONS`` wins.
    """
    def distance_to(station_id):
        # Distance from the queried point to one station's coordinates.
        return hv.haversine(in_coords, STATIONS[station_id])

    return min(STATIONS, key=distance_to)

In [72]:
# import train,test data and map nearest weather station
# Forward-slash relative paths resolve on Windows, macOS and Linux alike
# (the previous backslash form only worked on Windows).
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Apply the same preparation to both frames so they cannot drift apart.
for frame in (train, test):
    # Tag every row with the key of the closest entry in STATIONS. Zipping the
    # two coordinate columns avoids the much slower row-wise apply(axis=1).
    frame['nearest_station'] = [
        nearest_station([lat, lon])
        for lat, lon in zip(frame.Latitude, frame.Longitude)
    ]
    # Parse the dates so they can be shifted by a lag and joined on later.
    frame['Date'] = pd.to_datetime(frame.Date)

In [73]:
# import weather data and convert date type
# Forward-slash relative path so the notebook also runs outside Windows.
weather = (
    pd.read_csv('./data/weather_cleaned_stack_back.csv')
    # 'Unnamed: 0' is presumably the index written out when the cleaned file
    # was saved -- it is not used anywhere below.
    .drop(columns='Unnamed: 0')
    # Parse dates so they compare equal to the lagged trap dates when merging.
    .assign(Date=lambda frame: pd.to_datetime(frame.Date))
)

In [28]:
# Display the column index of the training frame.
train.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'nearest_station'],
      dtype='object')

---
# Munging weather data

In [57]:
# Weather column names split into three groups; each group is later merged
# onto the trap data with its own date lag.
temp_list = [
    'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Cool',
]
rain_list = ['PrecipTotal']
day_list = [
    'Sunset', 'DaylightHrs', 'StnPressure', 'SeaLevel',
    'ResultSpeed', 'ResultDir', 'AvgSpeed',
]

In [58]:
# function to merge weather features based on date lag
def to_merge(train,test,lag,feature_list,weather_data=None):
    """Join lagged weather readings onto the train and test frames.

    Every row is matched with the weather record whose ``Station`` equals the
    row's ``nearest_station`` and whose ``Date`` lies ``lag`` days before the
    row's own ``Date``. The join is an inner join, so rows without a matching
    weather record are dropped from the result.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a datetime ``Date`` column and a ``nearest_station``
        column. They are not modified; the lag column is added on a copy.
    lag : int
        Number of days to subtract from ``Date`` before matching.
    feature_list : list of str
        Weather columns to bring across. The list itself is left untouched.
    weather_data : pandas.DataFrame, optional
        Frame holding ``Date``, ``Station`` and the requested feature columns.
        Defaults to the notebook-level ``weather`` frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_weather, test_weather)``: the input columns, a ``date_lag``
        helper column holding the shifted date, and the requested features.
    """
    source = weather if weather_data is None else weather_data

    # Rename the weather keys to the names used on the trap side so the join
    # needs no suffix clean-up afterwards and 'Date' keeps its original values.
    lagged_weather = source[list(feature_list) + ['Date', 'Station']].rename(
        columns={'Date': 'date_lag', 'Station': 'nearest_station'}
    )
    offset = pd.Timedelta(days=lag)

    def _attach(frame):
        # assign() works on a copy, so the caller's frame never gains (or has
        # overwritten) a 'date_lag' column as a side effect.
        shifted = frame.assign(date_lag=frame.Date - offset)
        return shifted.merge(lagged_weather, on=['date_lag', 'nearest_station'])

    return (_attach(train), _attach(test))

In [59]:
# function to merge multiple weather features with different date lags
def multi_merge(train,test,lag_list,list_of_lists):
    """Merge several groups of weather features, each with its own date lag.

    ``lag_list[i]`` is the lag (in days) used when merging the feature names in
    ``list_of_lists[i]``; the merges are applied one after another through
    ``to_merge``. A ``month`` column taken from ``Date`` is added at the end.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames accepted by ``to_merge``. They are not modified.
    lag_list : sequence of int
        One lag per feature group.
    list_of_lists : sequence of list of str
        One list of weather column names per lag.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with the weather features and ``month`` added.

    Raises
    ------
    ValueError
        If ``lag_list`` and ``list_of_lists`` differ in length. Previously this
        printed a message and returned ``None``, which only surfaced later as
        an unrelated unpacking error at the call site.
    """
    if len(lag_list) != len(list_of_lists):
        raise ValueError(
            'Mismatch in list lengths: %d lags for %d feature lists'
            % (len(lag_list), len(list_of_lists))
        )

    for lag, features in zip(lag_list, list_of_lists):
        train, test = to_merge(train, test, lag, features)

    # assign() returns new frames, so the caller's data stays untouched even
    # when no merge ran (empty lag list) and the names still refer to the inputs.
    train = train.assign(month=train.Date.dt.month)
    test = test.assign(month=test.Date.dt.month)
    return (train, test)

In [60]:
# One lag (in days) per feature group, paired by position:
# day_list -> 0, temp_list -> 3, rain_list -> 11.
lag_list = [0, 3, 11]
feat_list = [day_list, temp_list, rain_list]
train_1, test_1 = multi_merge(train, test, lag_list, feat_list)

In [61]:
# Columns kept for modelling: trap-level fields, the merged weather features
# and, last, the WnvPresent target.
cols = [
    'month', 'Species', 'NumMosquitos', 'Sunset', 'Street',
    'DaylightHrs',
    'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Cool',
    'PrecipTotal',
    'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
    'WnvPresent',
]

In [63]:
# Keep only the modelling columns listed in `cols`, in that order.
train_1 = train_1[cols]

---
# Preparation of data

In [68]:
scaler = StandardScaler()
sm = SMOTE(sampling_strategy=1,random_state=666)

# One-hot encode the two categorical columns, dropping the first level of each.
train_dummies = pd.get_dummies(train_1, drop_first=True, columns=['Species', 'Street'])
y = train_dummies['WnvPresent']
# Cast the features to float up front (as score_iter does) so StandardScaler
# receives a uniform numeric dtype instead of converting columns itself.
X = train_dummies.drop(columns='WnvPresent').astype('float64')

train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=666, stratify=y
)
# The scaler is fitted on the training split only and then reused on the
# hold-out split.
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
# fit_resample replaces the deprecated fit_sample, which current
# imbalanced-learn releases no longer provide. Only the training split is
# oversampled.
sampledX, sampledy = sm.fit_resample(train_x, train_y)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  # Remove the CWD from sys.path while we load stuff.


In [70]:
# test fit and predict, benchmark
# Fit on the oversampled training data; fit() returns the estimator itself.
lr = LogisticRegression(solver='liblinear').fit(sampledX, sampledy)
pred = lr.predict(test_x)
# Recall on the untouched hold-out split is the benchmark number shown below.
recall_score(test_y, pred)

0.6484848484848484

---
# Testing different lag ranges

In [103]:
def score_iter(train,test,lag_list,feature_list):
    """Return the hold-out recall of a logistic regression for one lag setup.

    The weather features are merged with ``multi_merge`` using the given lags,
    the frame is reduced to the notebook-level ``cols``, ``Species`` and
    ``Street`` are one-hot encoded, and a stratified 70/30 split is made. The
    training part is standardised and oversampled with SMOTE before fitting.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames passed straight through to ``multi_merge``.
    lag_list : sequence of int
        One lag (in days) per entry of ``feature_list``.
    feature_list : sequence of list of str
        Weather feature groups to merge, one per lag.

    Returns
    -------
    float
        ``recall_score`` of the fitted model on the 30% hold-out split.
    """
    # Use the feature groups that were passed in; the previous version read
    # the notebook-level `feat_list` and silently ignored this argument.
    merged_train, _ = multi_merge(train, test, lag_list, feature_list)
    merged_train = merged_train[cols]

    scaler = StandardScaler()
    sm = SMOTE(sampling_strategy=1, random_state=666)

    train_dummies = pd.get_dummies(merged_train, drop_first=True, columns=['Species', 'Street'])
    train_dummies = train_dummies.astype('float64')
    y = train_dummies['WnvPresent']
    X = train_dummies.drop(columns='WnvPresent')

    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=666, stratify=y
    )
    # The scaler is fitted on the training split only and reused on the hold-out.
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)
    # fit_resample replaces the deprecated fit_sample, which current
    # imbalanced-learn releases no longer provide.
    sampledX, sampledy = sm.fit_resample(train_x, train_y)

    lr = LogisticRegression(solver='liblinear')
    lr.fit(sampledX, sampledy)
    pred = lr.predict(test_x)
    return (recall_score(test_y, pred))

In [106]:
# Sweep the lag of the second feature group (temp_list in feat_list) from 1 to
# 7 days while the other two lags stay at 0 and 11.
for i in range(1, 8):
    lag_list = [0, i, 11]
    recall = score_iter(train, test, lag_list, feat_list)
    print('temp days ' + str(i) + ': ' + str(recall))

temp days 1: 0.6606060606060606
temp days 2: 0.6606060606060606
temp days 3: 0.6484848484848484
temp days 4: 0.6666666666666666
temp days 5: 0.6424242424242425
temp days 6: 0.6606060606060606
temp days 7: 0.6424242424242425


In [105]:
# Sweep the lag of the third feature group (rain_list in feat_list) from 1 to
# 14 days while the other two lags stay at 0 and 4.
for i in range(1, 15):
    lag_list = [0, 4, i]
    recall = score_iter(train, test, lag_list, feat_list)
    print('rain day ' + str(i) + ': ' + str(recall))

rain day 1: 0.6666666666666666
rain day 2: 0.6666666666666666
rain day 3: 0.6606060606060606
rain day 4: 0.6666666666666666
rain day 5: 0.6484848484848484
rain day 6: 0.6545454545454545
rain day 7: 0.6606060606060606
rain day 8: 0.6666666666666666
rain day 9: 0.6606060606060606
rain day 10: 0.6666666666666666
rain day 11: 0.6666666666666666
rain day 12: 0.6666666666666666
rain day 13: 0.6606060606060606
rain day 14: 0.6666666666666666
