## Exploratory Data Analysis ##
Although sensitivity seems an appropriate metric to us considering that false positives would not be detrimental to the city, our metric is accuracy based on the Kaggle competition. 

In [12]:
# Installed geopy, which is a tool to help determine distances between objects.  We used this below to determine
# the distance between if West Nile Virus is presenet to those cases it wasn't, by year
!pip install geopy



In [13]:
# Import libraries
import pandas as pd
import seaborn as sns
pd.core.common.is_list_like = pd.api.types.is_list_like
import pandas_datareader.data as web
from datetime import datetime
import matplotlib.pyplot as plt
from geopy.distance import geodesic
import time

In [14]:
# Import csv data files and assign to variable
spray = pd.read_csv('./data/spray.csv')
weather = pd.read_csv('./data/weather.csv')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [15]:
# Use the imported column names and lowercase all letters and add an underscore where spaces exist between words
train.columns = [col.lower().replace(' ', '_') for col in train.columns]
test.columns = [col.lower().replace(' ', '_') for col in test.columns]
spray.columns = [col.lower().replace(' ', '_') for col in spray.columns]
weather.columns = [col.lower().replace(' ', '_') for col in weather.columns]

In [16]:
#create baseline file to upload to Kaggle
baseline=test.loc[:,("id")]
baseline['wnvpresent'] = 0
baseline.to_csv('./Data/baselinesubmission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


#### Baseline Score

We created a baseline file, where there were no West Nile Virus predictions made (all zero's), to upload to Kaggle to get an idea what the score would be, according to their measurements.  While it turned out that our score was .50, that wasn't much of a surprise since a mosquito either has West Nile or does not.

In [17]:
#baseline on train
1-train['wnvpresent'].mean()

0.9475537787930707

In [18]:
spray.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14835 entries, 0 to 14834
Data columns (total 4 columns):
date         14835 non-null object
time         14251 non-null object
latitude     14835 non-null float64
longitude    14835 non-null float64
dtypes: float64(2), object(2)
memory usage: 463.7+ KB


In [19]:
#looked for null values in spray dataset
spray[spray.isnull().any(axis=1)]

Unnamed: 0,date,time,latitude,longitude
1030,2011-09-07,,41.987092,-87.794286
1031,2011-09-07,,41.987620,-87.794382
1032,2011-09-07,,41.988004,-87.794574
1033,2011-09-07,,41.988292,-87.795486
1034,2011-09-07,,41.988100,-87.796014
1035,2011-09-07,,41.986372,-87.794862
1036,2011-09-07,,41.986228,-87.795582
1037,2011-09-07,,41.984836,-87.793998
1038,2011-09-07,,41.984836,-87.794670
1039,2011-09-07,,41.984884,-87.795198


In [20]:
# Dropped time column from spray because it isn't contained in any other datasets, and has a high number of missing
# values
spray.drop(columns='time', inplace=True)

In [21]:
spray.head()

Unnamed: 0,date,latitude,longitude
0,2011-08-29,42.391623,-88.089163
1,2011-08-29,42.391348,-88.089163
2,2011-08-29,42.391022,-88.089157
3,2011-08-29,42.390637,-88.089158
4,2011-08-29,42.39041,-88.088858


In [22]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
station        2944 non-null int64
date           2944 non-null object
tmax           2944 non-null int64
tmin           2944 non-null int64
tavg           2944 non-null object
depart         2944 non-null object
dewpoint       2944 non-null int64
wetbulb        2944 non-null object
heat           2944 non-null object
cool           2944 non-null object
sunrise        2944 non-null object
sunset         2944 non-null object
codesum        2944 non-null object
depth          2944 non-null object
water1         2944 non-null object
snowfall       2944 non-null object
preciptotal    2944 non-null object
stnpressure    2944 non-null object
sealevel       2944 non-null object
resultspeed    2944 non-null float64
resultdir      2944 non-null int64
avgspeed       2944 non-null object
dtypes: float64(1), int64(5), object(16)
memory usage: 506.1+ KB


In [23]:
weather['tavg'].value_counts()

73    138
77    117
70    117
75    110
71    109
74    107
72    104
69    103
78    102
76    100
68     99
79     98
66     93
67     89
61     88
64     86
80     84
65     84
63     81
57     67
62     66
60     61
50     57
81     55
58     49
53     49
54     48
82     48
55     48
52     46
56     46
59     45
51     36
83     34
49     29
45     28
47     24
46     24
84     21
44     19
48     17
86     16
85     16
42     15
43     12
M      11
87      9
41      7
40      5
39      4
89      4
88      4
91      4
37      2
90      2
38      2
36      2
94      1
93      1
92      1
Name: tavg, dtype: int64

In [24]:
# Dropped all t-averages with missing value 'M' because there were additional missing values in each of these rows.
# Only 11 were missing.
weather = (weather[weather['tavg'] != 'M'])

In [25]:
# Look at Missing values is the depart column of weather
weather[weather['depart'] == 'M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,codesum,depth,water1,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.00,29.18,29.82,2.7,25,9.6
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.00,29.44,30.08,13.3,2,13.4
5,2,2007-05-03,67,48,58,M,40,50,7,0,...,HZ,M,M,M,0.00,29.46,30.12,12.9,6,13.2
9,2,2007-05-05,66,54,60,M,39,50,5,0,...,,M,M,M,T,29.46,30.09,11.2,7,11.5
11,2,2007-05-06,68,52,60,M,30,46,5,0,...,,M,M,M,0.00,29.62,30.28,13.8,10,14.5
13,2,2007-05-07,84,50,67,M,39,53,0,2,...,,M,M,M,0.00,29.44,30.12,8.5,17,9.9
15,2,2007-05-08,80,60,70,M,57,63,0,5,...,HZ,M,M,M,T,29.36,30.02,2.5,8,5.4
17,2,2007-05-09,76,63,70,M,60,63,0,5,...,BR HZ,M,M,M,0.02,29.28,29.93,3.9,7,5.9
19,2,2007-05-10,83,59,71,M,52,61,0,6,...,BR HZ,M,M,M,0.00,29.26,29.91,2.0,9,3.9
21,2,2007-05-11,73,49,61,M,44,51,4,0,...,,M,M,M,0.00,29.39,30.03,11.7,36,12.8


In [26]:
# There are 0.498 missing values from weather depart column
weather['depart'].value_counts(normalize=True)

M      0.498125
 2     0.031708
-1     0.028640
-2     0.027276
 5     0.026253
 7     0.025912
 1     0.025912
 3     0.025571
 0     0.025230
-3     0.024548
 4     0.024207
 6     0.022844
 8     0.020116
-5     0.019434
-4     0.019093
-6     0.017047
 9     0.016025
10     0.015684
-8     0.014661
-7     0.010228
11     0.009547
12     0.009547
-9     0.008524
13     0.007842
-10    0.007501
14     0.007501
15     0.005114
16     0.004091
-11    0.003409
-12    0.002728
17     0.002387
18     0.002046
-14    0.002046
-13    0.001705
19     0.001364
20     0.001364
22     0.001023
-16    0.001023
-15    0.001023
-17    0.000682
21     0.000682
23     0.000341
Name: depart, dtype: float64

In [27]:
# Check for null values in Weather
weather[weather.isnull().any(axis=1)]

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,codesum,depth,water1,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed


In [28]:
weather.shape

(2933, 22)

In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
date                      10506 non-null object
address                   10506 non-null object
species                   10506 non-null object
block                     10506 non-null int64
street                    10506 non-null object
trap                      10506 non-null object
addressnumberandstreet    10506 non-null object
latitude                  10506 non-null float64
longitude                 10506 non-null float64
addressaccuracy           10506 non-null int64
nummosquitos              10506 non-null int64
wnvpresent                10506 non-null int64
dtypes: float64(2), int64(4), object(6)
memory usage: 985.0+ KB


In [26]:
# Check for null values in train data
train[train.isnull().any(axis=1)]

Unnamed: 0,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent


In [27]:
train.columns

Index(['date', 'address', 'species', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent'],
      dtype='object')

In [29]:
# Convert dates to datetime
spray['date']   = pd.to_datetime(spray['date'])
weather['date'] = pd.to_datetime(weather['date'])
train['date']   = pd.to_datetime(train['date'])
test['date']    = pd.to_datetime(test['date'])

In [30]:
test['date'].head()

0   2008-06-11
1   2008-06-11
2   2008-06-11
3   2008-06-11
4   2008-06-11
Name: date, dtype: datetime64[ns]

In [31]:
spray.head()

Unnamed: 0,date,latitude,longitude
0,2011-08-29,42.391623,-88.089163
1,2011-08-29,42.391348,-88.089163
2,2011-08-29,42.391022,-88.089157
3,2011-08-29,42.390637,-88.089158
4,2011-08-29,42.39041,-88.088858


In [32]:
# Split dates into individual categories test file
test['year'] = [str(date)[0:4] for date in test.date]
test['month'] = [str(date)[5:7] for date in test.date]
test['day'] = [str(date)[8:10] for date in test.date]

In [33]:
# Split dates into individual categories train file
train['year'] = [str(date)[0:4] for date in train.date]
train['month'] = [str(date)[5:7] for date in train.date]
train['day'] = [str(date)[8:10] for date in train.date]

In [34]:
# Combine weather stations 1 and 2 based on date
weather1 = weather[weather['station'] == 1]
weather2 = weather[weather['station'] == 2]
weather1.drop(columns='station', inplace=True)
weather2.drop(columns='station', inplace=True)
weather = weather1.merge(weather2, on='date')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [36]:
# Merge train and test datasets with weather dataset based on the date field
train=train.merge(weather, on='date')
test=test.merge(weather, on='date')

In [38]:
# Get rid of alpha characters in traps to create number field
train['trap']=[trap.replace("T","") for trap in train['trap']]
test['trap']=[trap.replace("T","") for trap in test['trap']]
train['trap']=[trap.replace("A","") for trap in train['trap']]
test['trap']=[trap.replace("A","") for trap in test['trap']]
train['trap']=[trap.replace("B","") for trap in train['trap']]
test['trap']=[trap.replace("B","") for trap in test['trap']]
train['trap']=[trap.replace("C","") for trap in train['trap']]
test['trap']=[trap.replace("C","") for trap in test['trap']]

In [39]:
# Change field type for trap from object to int in test and train
train['trap']=pd.to_numeric(train['trap'])
test['trap']=pd.to_numeric(test['trap'])

In [40]:
# Make dummy columns for species type in test and train
train=pd.get_dummies(train, columns=['species'], drop_first=True)
test=pd.get_dummies(test, columns=['species'], drop_first=True)

In [41]:
train.dtypes

date                              datetime64[ns]
address                                   object
block                                      int64
street                                    object
trap                                       int64
addressnumberandstreet                    object
latitude                                 float64
longitude                                float64
addressaccuracy                            int64
nummosquitos                               int64
wnvpresent                                 int64
year                                      object
month                                     object
day                                       object
tmax_x                                     int64
tmin_x                                     int64
tavg_x                                     int64
depart_x                                  object
dewpoint_x                                 int64
wetbulb_x                                 object
heat_x              

In [42]:
# Create lowercase characters and underscore species dummy columns
train.columns = [col.lower().replace(' ', '_') for col in train.columns]
test.columns = [col.lower().replace(' ', '_') for col in test.columns]

In [44]:
train['trap'].head()

0     2
1     2
2     7
3    15
4    15
Name: trap, dtype: int64

In [45]:
train['addressnumberandstreet'].head()

0    4100  N OAK PARK AVE, Chicago, IL
1    4100  N OAK PARK AVE, Chicago, IL
2     6200  N MANDELL AVE, Chicago, IL
3      7900  W FOSTER AVE, Chicago, IL
4      7900  W FOSTER AVE, Chicago, IL
Name: addressnumberandstreet, dtype: object

In [46]:
train['address'].head()

0    4100 North Oak Park Avenue, Chicago, IL 60634,...
1    4100 North Oak Park Avenue, Chicago, IL 60634,...
2    6200 North Mandell Avenue, Chicago, IL 60646, USA
3      7900 West Foster Avenue, Chicago, IL 60656, USA
4      7900 West Foster Avenue, Chicago, IL 60656, USA
Name: address, dtype: object

In [47]:
train['wnvpresent'].head()

0    0
1    0
2    0
3    0
4    0
Name: wnvpresent, dtype: int64

In [48]:
train['trap'].value_counts()

900    750
115    542
138    314
2      185
135    183
54     172
128    160
151    156
212    152
90     151
114    148
158    147
31     146
48     146
903    140
8      139
11     133
209    130
200    129
73     128
30     126
94     126
3      122
28     117
66     116
74     115
47     114
102    112
221    112
9      111
      ... 
150     22
107     22
141     22
162     21
156     21
206     21
219     21
154     21
75      20
70      20
71      19
142     18
97      17
34      17
50      17
1       15
60      15
44      15
51      15
157     14
149     14
5       14
72      13
4       12
229      8
78       8
238      7
76       5
237      3
40       2
Name: trap, Length: 134, dtype: int64

In [49]:
test['trap'].value_counts()

90     3051
218    3051
2      2377
200    2291
54     1552
128    1534
65     1532
9      1528
94     1521
35     1520
900    1468
8       822
11      814
27      803
151     797
903     784
28      783
231     781
12      781
3       780
220     776
115     776
73      776
16      775
135     774
223     774
63      774
158     773
212     773
102     772
       ... 
25      760
107     760
88      760
215     760
44      760
150     760
86      760
149     760
85      760
43      760
91      760
84      760
155     760
219     760
19      760
146     760
234     760
18      760
92      760
156     760
81      760
17      760
157     760
232     760
40      760
143     760
79      760
15      760
206     760
154     760
Name: trap, Length: 135, dtype: int64

In [50]:
# Change types of date fields in test and train
train['year'] = train['year'].astype(float)
test['year'] = test['year'].astype(float)
train['month'] = train['month'].astype(float)
test['month'] = test['month'].astype(float)
train['day'] = train['day'].astype(float)
test['day'] = test['day'].astype(float)

### Geopy

Using Geopy, below we are using the latitude and longitude, based on the corresponding year, and creating a function that calculates the distance between positive West Nile Virus samples found in mosquitoes vs. the mosquitoes that did not test positive for the West Nile.  We repeated the process for each year in the train and test data, that will be used as features later on in our model.  The comments are the same for each function below

In [51]:
%%time

# Using 2007 Train data, find the distance between mosquitos that tested positive for WNV vs. those that did not

# Filter by year
distance_2007 = train[train['year'] == 2007.0]
features = ['latitude', 'longitude']

distance_2007 = distance_2007[features]
distance_2007 = distance_2007.apply(tuple, axis = 1)
train['wnvpresent'] = train['wnvpresent'].astype(float)

# Find if WNV present for year
westnile_2007 = (train[train['wnvpresent'] == 1]) 
westnile_2007 = westnile_2007[westnile_2007['year'] == 2007.0]
westnile_2007 = westnile_2007[features]
westnile_2007 = westnile_2007.apply(tuple, axis = 1)

# Iterate through each distance, and if it has WNV present, append it to list
h=0
results_2007 = []
for i in distance_2007:
   dists_2007 = []

   for j in westnile_2007:
        
       # Use Geopy to find the difference in distances, in miles 
       dist_2007 = (geodesic(i, j).miles)
       if dist_2007 > 0:
            dists_2007.append(dist_2007)
   
   results_2007.append(min(dists_2007))

# Create dataframe    
train_2007=pd.DataFrame(results_2007,columns = ['distance'])
train_07 = train[train['year'] == 2007]
train_7 = pd.concat([train_07, train_2007], axis = 1)

CPU times: user 2min 29s, sys: 172 ms, total: 2min 29s
Wall time: 2min 29s


In [52]:
%%time

# Using 2009 Train data, find the distance between mosquitos that tested positive for WNV vs. those that did not

distance_2009 = train[train['year'] == 2009.0]
features = ['latitude', 'longitude']

distance_2009 = distance_2009[features]
distance_2009 = distance_2009.apply(tuple, axis = 1)
train['wnvpresent'] = train['wnvpresent'].astype(float)

westnile_2009 = (train[train['wnvpresent'] == 1]) 
westnile_2009 = westnile_2009[westnile_2009['year'] == 2009.0]
westnile_2009 = westnile_2009[features]
westnile_2009 = westnile_2009.apply(tuple, axis = 1)
westnile_2009.head()

h=0
results_2009 = []
for i in distance_2009:
   dists_2009 = []

   for j in westnile_2009:

       dist_2009 = (geodesic(i, j).miles)
       if dist_2009 > 0:
            dists_2009.append(dist_2009)
    
   results_2009.append(min(dists_2009))

train_2009=pd.DataFrame(results_2009,columns = ['distance'])
train_09 = train[train['year'] == 2009]
train_09.reset_index(inplace = True)
train_9 = pd.concat([train_09, train_2009], axis = 1)

CPU times: user 6.94 s, sys: 6.39 ms, total: 6.95 s
Wall time: 6.94 s


In [53]:
%%time

# Using 2011 Train data, find the distance between mosquitos that tested positive for WNV vs. those that did not

distance_2011 = train[train['year'] == 2011.0]
features = ['latitude', 'longitude']

distance_2011 = distance_2011[features]
distance_2011 = distance_2011.apply(tuple, axis = 1)
train['wnvpresent'] = train['wnvpresent'].astype(float)

westnile_2011 = (train[train['wnvpresent'] == 1]) 
westnile_2011 = westnile_2011[westnile_2011['year'] == 2011.0]
westnile_2011 = westnile_2011[features]
westnile_2011 = westnile_2011.apply(tuple, axis = 1)
westnile_2011.head()


h=0
results_2011 = []
for i in distance_2011:
   dists_2011 = []

   for j in westnile_2011:

       dist_2011 = (geodesic(i, j).miles)
       if dist_2011 > 0:
            dists_2011.append(dist_2011)
   
   results_2011.append(min(dists_2011))

train_2011=pd.DataFrame(results_2011,columns = ['distance'])    
train_11 = train[train['year'] == 2011]
train_11.reset_index(inplace = True)
train_1 = pd.concat([train_11, train_2011], axis = 1)

CPU times: user 18.7 s, sys: 10.8 ms, total: 18.7 s
Wall time: 18.7 s


In [54]:
%%time

# Using 2013 Train data, find the distance between mosquitos that tested positive for WNV vs. those that did not

distance_2013 = train[train['year'] == 2013.0]
features = ['latitude', 'longitude']

distance_2013 = distance_2013[features]
distance_2013 = distance_2013.apply(tuple, axis = 1)
train['wnvpresent'] = train['wnvpresent'].astype(float)

westnile_2013 = (train[train['wnvpresent'] == 1]) 
westnile_2013 = westnile_2013[westnile_2013['year'] == 2013.0]
westnile_2013 = westnile_2013[features]
westnile_2013 = westnile_2013.apply(tuple, axis = 1)
westnile_2013.head()


h=0
results_2013 = []
for i in distance_2013:
   dists_2013 = []

   for j in westnile_2013:

       dist_2013 = (geodesic(i, j).miles)
       if dist_2013 > 0:
            dists_2013.append(dist_2013)
   
   results_2013.append(min(dists_2013))

train_2013=pd.DataFrame(results_2013,columns = ['distance'])
train_13 = train[train['year'] == 2013]
train_13.reset_index(inplace = True)
train_3 = pd.concat([train_13, train_2013], axis = 1)

CPU times: user 1min 34s, sys: 111 ms, total: 1min 34s
Wall time: 1min 34s


In [55]:
# Take each year of train distance data just calculated and add to new df
list = [train_7, train_9, train_1, train_3]
train_new = pd.concat(list, axis = 0)
train_new


Unnamed: 0,address,addressaccuracy,addressnumberandstreet,avgspeed_x,avgspeed_y,block,codesum_x,codesum_y,cool_x,cool_y,...,tmax_y,tmin_x,tmin_y,trap,water1_x,water1_y,wetbulb_x,wetbulb_y,wnvpresent,year
0,"4100 North Oak Park Avenue, Chicago, IL 60634,...",9,"4100 N OAK PARK AVE, Chicago, IL",6.5,7.4,41,BR HZ,BR HZ,9,12,...,88,60,65,2,M,M,65,66,0.0,2007.0
1,"4100 North Oak Park Avenue, Chicago, IL 60634,...",9,"4100 N OAK PARK AVE, Chicago, IL",6.5,7.4,41,BR HZ,BR HZ,9,12,...,88,60,65,2,M,M,65,66,0.0,2007.0
2,"6200 North Mandell Avenue, Chicago, IL 60646, USA",9,"6200 N MANDELL AVE, Chicago, IL",6.5,7.4,62,BR HZ,BR HZ,9,12,...,88,60,65,7,M,M,65,66,0.0,2007.0
3,"7900 West Foster Avenue, Chicago, IL 60656, USA",8,"7900 W FOSTER AVE, Chicago, IL",6.5,7.4,79,BR HZ,BR HZ,9,12,...,88,60,65,15,M,M,65,66,0.0,2007.0
4,"7900 West Foster Avenue, Chicago, IL 60656, USA",8,"7900 W FOSTER AVE, Chicago, IL",6.5,7.4,79,BR HZ,BR HZ,9,12,...,88,60,65,15,M,M,65,66,0.0,2007.0
5,"1500 West Webster Avenue, Chicago, IL 60614, USA",8,"1500 W WEBSTER AVE, Chicago, IL",6.5,7.4,15,BR HZ,BR HZ,9,12,...,88,60,65,45,M,M,65,66,0.0,2007.0
6,"2500 West Grand Avenue, Chicago, IL 60654, USA",8,"2500 W GRAND AVE, Chicago, IL",6.5,7.4,25,BR HZ,BR HZ,9,12,...,88,60,65,46,M,M,65,66,0.0,2007.0
7,"1100 Roosevelt Road, Chicago, IL 60608, USA",8,"1100 W ROOSEVELT, Chicago, IL",6.5,7.4,11,BR HZ,BR HZ,9,12,...,88,60,65,48,M,M,65,66,0.0,2007.0
8,"1100 Roosevelt Road, Chicago, IL 60608, USA",8,"1100 W ROOSEVELT, Chicago, IL",6.5,7.4,11,BR HZ,BR HZ,9,12,...,88,60,65,48,M,M,65,66,0.0,2007.0
9,"1100 West Chicago Avenue, Chicago, IL 60642, USA",8,"1100 W CHICAGO, Chicago, IL",6.5,7.4,11,BR HZ,BR HZ,9,12,...,88,60,65,49,M,M,65,66,0.0,2007.0


In [56]:
train.shape

(10506, 60)

### Repeating the Geopy process with the Test Data

In [58]:
# test['distance'] = 0

In [59]:
%%time

# Using 2008 Test data, find the distance between mosquitos that tested positive for WNV vs. those that did not

distance_2008 = test[test['year'] == 2008.0]
features = ['latitude', 'longitude']

distance_2008 = distance_2008[features]
distance_2008 = distance_2008.apply(tuple, axis = 1)

westnile_2007 = (train[train['wnvpresent'] == 1]) 
westnile_2007 = westnile_2007[westnile_2007['year'] == 2007.0]
westnile_2007 = westnile_2007[features]
westnile_2007 = westnile_2007.apply(tuple, axis = 1)


h=0
results_2008 = []
for i in distance_2008:
   dists_2008 = []

   for j in westnile_2007:

       dist_2008 = (geodesic(i, j).miles)
       if dist_2008 > 0:
            dists_2008.append(dist_2008)
   results_2008.append(min(dists_2008))

test_2008=pd.DataFrame(results_2008,columns = ['distance'])
test_08 = test[test['year'] == 2008]
test_08.reset_index(inplace = True)
test_8 = pd.concat([test_08, test_2008], axis = 1)

CPU times: user 20min 41s, sys: 797 ms, total: 20min 42s
Wall time: 37min 52s


In [60]:
%%time

# Using 2010 Test data, find the distance between mosquitos that tested positive for WNV vs. those that did not


distance_2010 = test[test['year'] == 2010.0]
features = ['latitude', 'longitude']

distance_2010 = distance_2010[features]
distance_2010 = distance_2010.apply(tuple, axis = 1)

westnile_2009 = (train[train['wnvpresent'] == 1]) 
westnile_2009 = westnile_2009[westnile_2009['year'] == 2007.0]
westnile_2009 = westnile_2009[features]
westnile_2009 = westnile_2009.apply(tuple, axis = 1)


h=0
results_2010 = []
for i in distance_2010:
   dists_2010= []

   for j in westnile_2009:

       dist_2010 = (geodesic(i, j).miles)
       if dist_2010 > 0:
            dists_2010.append(dist_2010)
   
   results_2010.append(min(dists_2010))

test_2010=pd.DataFrame(results_2010,columns = ['distance'])
test_10 = test[test['year'] == 2010]
test_10.reset_index(inplace = True)
test_0 = pd.concat([test_10, test_2010], axis = 1)

CPU times: user 32min 17s, sys: 4.23 s, total: 32min 21s
Wall time: 2h 10min 36s


In [61]:
%%time

# Using 2012 Test data, find the distance between mosquitos that tested positive for WNV vs. those that did not

distance_2012 = test[test['year'] == 2012.0]
features = ['latitude', 'longitude']

distance_2012 = distance_2012[features]
distance_2012 = distance_2012.apply(tuple, axis = 1)


westnile_2011 = (train[train['wnvpresent'] == 1]) 
westnile_2011 = westnile_2011[westnile_2011['year'] == 2011.0]
westnile_2011 = westnile_2011[features]
westnile_2011 = westnile_2011.apply(tuple, axis = 1)

h=0
results_2012 = []
for i in distance_2012:
   dists_2012 = []

   for j in westnile_2011:

       dist_2012 = (geodesic(i, j).miles)
       if dist_2012 > 0:
            dists_2012.append(dist_2012)
   
   results_2012.append(min(dists_2012))

test_2012=pd.DataFrame(results_2012,columns = ['distance'])
test_12 = test[test['year'] == 2012]
test_12.reset_index(inplace = True)
test_2 = pd.concat([test_12, test_2012], axis = 1)

CPU times: user 5min 46s, sys: 771 ms, total: 5min 47s
Wall time: 23min 49s


In [62]:
%%time

# Using 2014 Test data, find the distance between mosquitos that tested positive for WNV vs. those that did not

distance_2014 = test[test['year'] == 2014.0]
features = ['latitude', 'longitude']

distance_2014 = distance_2014[features]
distance_2014 = distance_2014.apply(tuple, axis = 1)

westnile_2013 = (train[train['wnvpresent'] == 1]) 
westnile_2013 = westnile_2013[westnile_2013['year'] == 2013.0]
westnile_2013 = westnile_2013[features]
westnile_2013 = westnile_2013.apply(tuple, axis = 1)

h=0
results_2014 = []
for i in distance_2014:
   dists_2014 = []

   for j in westnile_2013:

       dist_2014 = (geodesic(i, j).miles)
       if dist_2014 > 0:
            dists_2014.append(dist_2014)
   
   results_2014.append(min(dists_2014))

test_2014=pd.DataFrame(results_2014,columns = ['distance'])
test_14 = test[test['year'] == 2014]
test_14.reset_index(inplace = True)
test_4 = pd.concat([test_14, test_2014], axis = 1)

CPU times: user 19min 30s, sys: 2.75 s, total: 19min 33s
Wall time: 1h 26min 17s


In [63]:
# Take each year of test distance data just calculated and add to new df
list = [test_8, test_0, test_2, test_4]
test_new = pd.concat(list, axis = 0)
test_new.head()

Unnamed: 0,index,id,date,address,block,street,trap,addressnumberandstreet,latitude,longitude,...,avgspeed_y,species_culex_pipiens,species_culex_pipiens/restuans,species_culex_restuans,species_culex_salinarius,species_culex_tarsalis,species_culex_territans,species_unspecified_culex,distance,distance.1
0,0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",41,N OAK PARK AVE,2,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,10.4,0,1,0,0,0,0,0,0,1.050839
1,1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",41,N OAK PARK AVE,2,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,10.4,0,0,1,0,0,0,0,0,1.050839
2,2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",41,N OAK PARK AVE,2,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,10.4,1,0,0,0,0,0,0,0,1.050839
3,3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",41,N OAK PARK AVE,2,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,10.4,0,0,0,1,0,0,0,0,1.050839
4,4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",41,N OAK PARK AVE,2,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,10.4,0,0,0,0,0,1,0,0,1.050839


In [66]:
test_new.shape

(116293, 63)

In [67]:
train_new.shape

(10506, 62)

In [65]:
# Save cleaned files
spray.to_csv('../project_work/data/sprayclean.csv', index=False)
weather.to_csv('../project_work/data/weatherclean.csv', index=False)
train_new.to_csv('../project_work/data/trainclean.csv', index=False)
test_new.to_csv('../project_work/data/testclean.csv', index=False)