# Import Libraries

In [6]:
!pip install haversine

Collecting haversine
  Using cached https://files.pythonhosted.org/packages/e3/72/1a7b859168b618384133f53f23fa54965c5f79d749b50ec1b66fd6a62759/haversine-2.1.1-py2.py3-none-any.whl
Installing collected packages: haversine
Successfully installed haversine-2.1.1


In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import haversine as hv
from datetime import timedelta

from sklearn.model_selection import train_test_split

In [20]:
full_data = pd.read_csv('data/train.csv')
weather = pd.read_csv('data/weather_cleaned_stack_back_v2.csv')
spray = pd.read_csv('data/spray.csv')

In [9]:
full_data.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent'],
      dtype='object')

In [81]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 15 columns):
Date                      10506 non-null datetime64[ns]
Address                   10506 non-null object
Species                   10506 non-null object
Block                     10506 non-null int64
Street                    10506 non-null object
Trap                      10506 non-null object
AddressNumberAndStreet    10506 non-null object
Latitude                  10506 non-null float64
Longitude                 10506 non-null float64
AddressAccuracy           10506 non-null int64
NumMosquitos              10506 non-null int64
WnvPresent                10506 non-null int64
spray_dist                1569 non-null float64
spray_status              10506 non-null int64
Year                      10506 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(6), object(5)
memory usage: 1.2+ MB


In [64]:
full_data.columns.to_list()

['Date',
 'Address',
 'Species',
 'Block',
 'Street',
 'Trap',
 'AddressNumberAndStreet',
 'Latitude',
 'Longitude',
 'AddressAccuracy',
 'NumMosquitos',
 'WnvPresent',
 'spray_dist',
 'spray_status',
 'Year']

In [12]:
weather.drop(['Unnamed: 0'],axis=1,inplace=True)

In [13]:
weather.dtypes

Date                 object
Sunset              float64
DaylightHrs         float64
Tmax                float64
Tmin                float64
Tavg                float64
Depart              float64
DewPoint            float64
WetBulb             float64
Cool                float64
WetWeather            int64
PrecipTotal         float64
StnPressure         float64
SeaLevel            float64
ResultSpeed         float64
ResultDir           float64
AvgSpeed            float64
Station               int64
RH                  float64
CoolYearlyCum       float64
Sunset7dAvg         float64
DaylightHrs7dAvg    float64
Tmax7dAvg           float64
Tmin7dAvg           float64
Tavg7dAvg           float64
Depart7dAvg         float64
DewPoint7dAvg       float64
WetBulb7dAvg        float64
Cool7dAvg           float64
PrecipTotal7dAvg    float64
StnPressure7dAvg    float64
SeaLevel7dAvg       float64
ResultSpeed7dAvg    float64
ResultDir7dAvg      float64
AvgSpeed7dAvg       float64
dtype: object

In [21]:
spray = spray.drop_duplicates(subset=['Latitude', 'Longitude'], keep=False)

In [38]:
spray.Date.value_counts()

2013-08-15    2668
2013-08-29    2302
2013-07-17    2202
2013-07-25    1607
2013-08-22    1587
2011-09-07    1571
2013-08-08    1195
2013-09-05     924
2013-08-16     141
2011-08-29      95
Name: Date, dtype: int64

## Feature Engineering

Distance to nearest spray point within 3 days of each day.

In [22]:
spray.Date = spray.Date.astype('datetime64[ns]')
full_data.Date = full_data.Date.astype('datetime64[ns]')
weather.Date = weather.Date.astype('datetime64[ns]')

In [16]:
weather.Date.head()

0   2007-05-01
1   2007-05-02
2   2007-05-03
3   2007-05-04
4   2007-05-05
Name: Date, dtype: datetime64[ns]

In [18]:
weather.Date.tail()

2939   2014-10-27
2940   2014-10-28
2941   2014-10-29
2942   2014-10-30
2943   2014-10-31
Name: Date, dtype: datetime64[ns]

In [17]:
full_data.Date.head()

0   2007-05-29
1   2007-05-29
2   2007-05-29
3   2007-05-29
4   2007-05-29
Name: Date, dtype: datetime64[ns]

We don't have trap / mosquito / wnv data for the years of even number. Let it be...it will be a left join for full_data with weather

In [19]:
full_data.Date.tail()

10501   2013-09-26
10502   2013-09-26
10503   2013-09-26
10504   2013-09-26
10505   2013-09-26
Name: Date, dtype: datetime64[ns]

In [23]:
spray.Date.tail()

14830   2013-09-05
14831   2013-09-05
14832   2013-09-05
14833   2013-09-05
14834   2013-09-05
Name: Date, dtype: datetime64[ns]

In [30]:
full_data.Date[10505]

Timestamp('2013-09-26 00:00:00')

In [42]:
start_date = full_data.Date[7576]
end_date = start_date + timedelta(days=7)

In [44]:
mask = (spray.Date >= start_date) & (spray.Date <= end_date)
spray_subset = spray[mask]
display(spray_subset)

Unnamed: 0,Date,Time,Latitude,Longitude,coords
95,2011-09-07,7:52:52 PM,41.981433,-87.787777,"[41.9814333333333, -87.7877766666667]"
96,2011-09-07,7:53:02 PM,41.980998,-87.787778,"[41.9809983333333, -87.7877783333333]"
97,2011-09-07,7:53:12 PM,41.980560,-87.787762,"[41.98056, -87.7877616666667]"
98,2011-09-07,7:53:22 PM,41.980198,-87.787758,"[41.9801983333333, -87.7877583333333]"
99,2011-09-07,7:53:32 PM,41.979752,-87.787765,"[41.9797516666667, -87.78776500000001]"
100,2011-09-07,7:53:42 PM,41.979352,-87.787753,"[41.9793516666667, -87.7877533333333]"
101,2011-09-07,7:53:52 PM,41.978850,-87.787763,"[41.97885, -87.7877633333333]"
102,2011-09-07,7:54:02 PM,41.978398,-87.787773,"[41.978398333333296, -87.7877733333333]"
103,2011-09-07,7:54:12 PM,41.977990,-87.787760,"[41.977990000000005, -87.78775999999999]"
104,2011-09-07,7:54:22 PM,41.977495,-87.787778,"[41.977495000000005, -87.7877783333333]"


In [33]:
spray['coords'] = spray.apply(lambda x: ([x.Latitude, x.Longitude]), axis=1)

In [45]:
def nearest_spray(date, in_coords, n_diff=7):   
    start_date = date
    end_date = date + timedelta(days=n_diff)
    mask = (spray.Date >= start_date) & (spray.Date <= end_date)
    spray_subset = spray[mask]
    
    if not spray_subset.empty:
        dist = [hv.haversine(in_coords,coord) for coord in (spray_subset.coords)]     
        return round(min(dist),2)
    else:
        print('empty subset')
        return np.nan

In [47]:
%time

full_data['spray_dist'] = full_data.apply(lambda x: nearest_spray(x.Date,[x.Latitude, x.Longitude]), axis=1)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 3.81 µs
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset
empty subset

In [48]:
## spray_status : 1 if less than 7 days and 3 km, else 0
full_data['spray_status'] = full_data.apply(lambda x: 1 if (x.spray_dist <= 3) else 0,axis=1)

In [49]:
full_data.spray_status.value_counts()

0    10208
1      298
Name: spray_status, dtype: int64

We can see that only 298 sites are covered under the spray conditions

In [51]:
full_data['Year'] = full_data.Date.map(lambda x : x.year)
spray['Year'] = spray.Date.map(lambda x: x.year)

In [55]:
spray.Year.value_counts()

2013    12626
2011     1666
Name: Year, dtype: int64

In [53]:
full_data.Year.value_counts()

2007    3811
2013    2392
2009    2249
2011    2054
Name: Year, dtype: int64

In [63]:
full_data.groupby(['Year'])['Trap'].nunique()

Year
2007    116
2009     73
2011     80
2013     74
Name: Trap, dtype: int64

In [72]:
df = pd.DataFrame(full_data.groupby(['Date','Species','Trap'])['NumMosquitos'].sum())

In [77]:
df.reset_index(inplace=True)

In [92]:
df.reset_index(drop=True)

Unnamed: 0,Date,Species,Trap,NumMosquitos
0,2007-05-29,CULEX PIPIENS,T096,1
1,2007-05-29,CULEX PIPIENS/RESTUANS,T002,1
2,2007-05-29,CULEX PIPIENS/RESTUANS,T015,1
3,2007-05-29,CULEX PIPIENS/RESTUANS,T048,1
4,2007-05-29,CULEX PIPIENS/RESTUANS,T050,1
5,2007-05-29,CULEX PIPIENS/RESTUANS,T054,2
6,2007-05-29,CULEX PIPIENS/RESTUANS,T086,1
7,2007-05-29,CULEX PIPIENS/RESTUANS,T129,1
8,2007-05-29,CULEX PIPIENS/RESTUANS,T143,1
9,2007-05-29,CULEX PIPIENS/RESTUANS,T148,1


In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8475 entries, 0 to 8474
Data columns (total 4 columns):
Date            8475 non-null datetime64[ns]
Species         8475 non-null object
Trap            8475 non-null object
NumMosquitos    8475 non-null int64
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 264.9+ KB


In [101]:
full_data.drop(['NumMosquitos'],axis=1,inplace=True)

KeyError: "['NumMosquitos'] not found in axis"

In [102]:
full_data = full_data.drop_duplicates()

In [103]:
full_condensed = pd.merge(df, full_data, how='left',on=['Date','Species','Trap'])

In [104]:
full_condensed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8610 entries, 0 to 8609
Data columns (total 15 columns):
Date                      8610 non-null datetime64[ns]
Species                   8610 non-null object
Trap                      8610 non-null object
NumMosquitos              8610 non-null int64
Address                   8610 non-null object
Block                     8610 non-null int64
Street                    8610 non-null object
AddressNumberAndStreet    8610 non-null object
Latitude                  8610 non-null float64
Longitude                 8610 non-null float64
AddressAccuracy           8610 non-null int64
WnvPresent                8610 non-null int64
spray_dist                1334 non-null float64
spray_status              8610 non-null int64
Year                      8610 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(6), object(5)
memory usage: 1.1+ MB


In [54]:
spray.count()

Date         14292
Time         13708
Latitude     14292
Longitude    14292
coords       14292
Year         14292
dtype: int64

In [105]:
weather.Date = weather.Date.astype('datetime64[ns]')

In [106]:
weather['Year'] = weather.Date.map(lambda x : x.year)

In [107]:
weather.Year.value_counts()

2013    368
2011    368
2009    368
2007    368
2014    368
2012    368
2010    368
2008    368
Name: Year, dtype: int64

## Weather data Mapping

In [108]:
STATIONS = {
    1 : (41.995,-87.933),
    2 : (41.786,-87.752)
}

In [109]:
def nearest_station(in_coords):
    
    dist = {k:hv.haversine(in_coords,v) for k,v in STATIONS.items()}
    
    return min(dist, key=dist.get)


In [111]:
weather = pd.read_csv('data/weather_cleaned_stack_back_v2.csv')
weather.drop(columns='Unnamed: 0',inplace=True)

In [112]:
weather.Date = weather.Date.astype('datetime64[ns]')
weather.head()

Unnamed: 0,Date,Sunset,DaylightHrs,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Cool,...,Depart7dAvg,DewPoint7dAvg,WetBulb7dAvg,Cool7dAvg,PrecipTotal7dAvg,StnPressure7dAvg,SeaLevel7dAvg,ResultSpeed7dAvg,ResultDir7dAvg,AvgSpeed7dAvg
0,2007-05-01,18.82,14.02,83.0,50.0,67.0,14.0,51.0,56.0,2.0,...,14.0,51.0,56.0,2.0,0.0,29.1,29.82,1.7,27.0,9.2
1,2007-05-02,18.83,14.05,59.0,42.0,51.0,-3.0,42.0,47.0,0.0,...,5.5,46.5,51.5,1.0,0.0,29.24,29.955,7.35,15.5,11.3
2,2007-05-03,18.85,14.08,66.0,46.0,56.0,2.0,40.0,48.0,0.0,...,4.333333,44.333333,50.333333,0.666667,0.0,29.29,30.01,8.8,12.666667,11.5
3,2007-05-04,18.87,14.14,66.0,49.0,58.0,4.0,41.0,50.0,0.0,...,4.25,43.5,50.25,0.5,0.00025,29.295,30.02,9.2,11.5,11.325
4,2007-05-05,18.88,14.16,66.0,53.0,60.0,5.0,38.0,49.0,0.0,...,4.4,42.4,50.0,0.4,0.0004,29.316,30.036,9.7,10.6,11.46


In [113]:
full_condensed['nearest_station'] = full_condensed.apply(lambda x: nearest_station([x.Latitude, x.Longitude]), axis=1)

In [114]:
full_condensed.nearest_station.value_counts()

2    6090
1    2520
Name: nearest_station, dtype: int64

In [117]:
full_condensed['date_lag'] = full_condensed.Date.map(lambda x : x - timedelta(days=7))

In [118]:
full_weather = full_condensed.merge(weather,left_on=['date_lag','nearest_station'],right_on=['Date','Station'])

In [119]:
full_condensed.Date.head()

0   2007-05-29
1   2007-05-29
2   2007-05-29
3   2007-05-29
4   2007-05-29
Name: Date, dtype: datetime64[ns]

In [120]:
weather.Date.head()

0   2007-05-01
1   2007-05-02
2   2007-05-03
3   2007-05-04
4   2007-05-05
Name: Date, dtype: datetime64[ns]

In [121]:
full_weather.columns

Index(['Date_x', 'Species', 'Trap', 'NumMosquitos', 'Address', 'Block',
       'Street', 'AddressNumberAndStreet', 'Latitude', 'Longitude',
       'AddressAccuracy', 'WnvPresent', 'spray_dist', 'spray_status', 'Year',
       'nearest_station', 'date_lag', 'Date_y', 'Sunset', 'DaylightHrs',
       'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Cool',
       'WetWeather', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed', 'Station', 'RH', 'CoolYearlyCum',
       'Sunset7dAvg', 'DaylightHrs7dAvg', 'Tmax7dAvg', 'Tmin7dAvg',
       'Tavg7dAvg', 'Depart7dAvg', 'DewPoint7dAvg', 'WetBulb7dAvg',
       'Cool7dAvg', 'PrecipTotal7dAvg', 'StnPressure7dAvg', 'SeaLevel7dAvg',
       'ResultSpeed7dAvg', 'ResultDir7dAvg', 'AvgSpeed7dAvg'],
      dtype='object')

In [127]:
full_weather['Month'] = full_weather.Date_x.map(lambda x : x.month)

In [128]:
cols = ['Month','Species','Street','NumMosquitos','spray_dist', 
        'spray_status', 'Year','Sunset', 'DaylightHrs',
        'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Cool',
        'WetWeather', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
        'ResultDir', 'AvgSpeed', 'Station', 'RH', 'CoolYearlyCum',
        'Sunset7dAvg', 'DaylightHrs7dAvg', 'Tmax7dAvg', 'Tmin7dAvg',
        'Tavg7dAvg', 'Depart7dAvg', 'DewPoint7dAvg', 'WetBulb7dAvg',
        'Cool7dAvg', 'PrecipTotal7dAvg', 'StnPressure7dAvg', 'SeaLevel7dAvg',
        'ResultSpeed7dAvg', 'ResultDir7dAvg', 'AvgSpeed7dAvg']

In [133]:
full_weather = full_weather[cols]

## Train/Test split

In [134]:
y = full_weather['NumMosquitos']
X = full_weather[[col for col in full_weather.columns if col != 'NumMosquitos']]

In [137]:
train_x, test_x, train_y, test_y = train_test_split(X,y,test_size = 0.3, random_state = 666)

## Export CSVs

In [139]:
train_x.to_csv('data/train_x_regression.csv',index=False)
test_x.to_csv('data/test_x_regression.csv',index=False)
train_y.to_csv('data/train_y_regression.csv',index=False)
test_y.to_csv('data/test_y_regression.csv',index=False)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
