In [1]:
pip install geopy


The following command must be run outside of the IPython shell:

    $ pip install geopy

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more informations on how to install packages:

    https://docs.python.org/3/installing/


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import vincenty
%matplotlib inline

In [3]:
%%javascript
$('.nbp-app-bar').toggle()

<IPython.core.display.Javascript object>

## Read in the data

In [4]:
path = '../kaggle_data/input/'

In [5]:
train = pd.read_csv(path+'train.csv')
weather = pd.read_csv(path+'weather.csv')
spray = pd.read_csv(path+'spray.csv')
test = pd.read_csv(path+'test.csv')
sample_sub = pd.read_csv(path+'sampleSubmission.csv')

In [6]:
print(len(test))
test.head()
# This should be 116293.

116293


Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [7]:
test.shape

(116293, 11)

In [8]:
weather=pd.read_csv('weather2.csv')
print(weather.shape)
weather.head()

(1472, 5)


Unnamed: 0,Date,avg_rain1,avg_rain2,avg_dry1,avg_dry2
0,2007-05-01,,,,
1,2007-05-02,0.0,,1.0,
2,2007-05-03,0.0,,0.5,
3,2007-05-04,0.0,,0.666667,
4,2007-05-05,0.25,,0.75,


In [9]:
chicago=pd.merge(test, weather, on='Date');
print(chicago.shape)
chicago.head(3)

(116293, 15)


Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,avg_rain1,avg_rain2,avg_dry1,avg_dry2
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1.0,0.571429,0.142857,0.714286
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1.0,0.571429,0.142857,0.714286
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1.0,0.571429,0.142857,0.714286


## Feature Engineering

In [10]:
chicago.columns

Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'avg_rain1', 'avg_rain2', 'avg_dry1', 'avg_dry2'],
      dtype='object')

In [11]:
# Reduce the species categorical variable to just 4 categories
chicago['CULEX']='ALTER'
chicago.loc[(chicago['Species']=='CULEX PIPIENS/RESTUANS') , 'CULEX']='PIPIENS-RESTUANS'
chicago.loc[(chicago['Species']=='CULEX RESTUANS') , 'CULEX']='RESTUANS'
chicago.loc[(chicago['Species']=='CULEX PIPIENS') , 'CULEX']='PIPIENS'
chicago['CULEX'].value_counts()
# Dummify the Species
dummies=pd.get_dummies(chicago['CULEX'])
chicago=pd.concat([chicago, dummies], axis=1)

In [12]:
# Let's convert date/time into something usable. 
chicago['Month']=chicago['Date'].str.split('-').str[1].astype(str)
chicago['Year']=chicago['Date'].str.split('-').str[0].astype(int)
chicago[['Date', 'Year', 'Month']].head()

Unnamed: 0,Date,Year,Month
0,2008-06-11,2008,6
1,2008-06-11,2008,6
2,2008-06-11,2008,6
3,2008-06-11,2008,6
4,2008-06-11,2008,6


In [13]:
# Day of the year
chicago['DayRunner']=pd.to_datetime(chicago['Date']).dt.dayofyear

In [14]:
# How many of all the lat-long combos have ever tested positive for the virus?
chicago['lat_long'] = list(zip(chicago['Latitude'], chicago['Longitude']))
latlongs=chicago.groupby('lat_long').mean()

In [15]:
# Certain lat-longs have the virus more frequently. That's useful!
import pickle
with open('top_hots.pickle', 'rb') as f:
    top_hots = pickle.load(f)

In [16]:
# For each trap, let's calculate its distance from the top virus hotspots
for x in range(30):
    digit=str(x)
    chicago['to_hotspot'+digit]=chicago['lat_long'].apply(lambda row: vincenty(row, top_hots[x]).miles)
    
    # Create an indicator if a row is less than 2 miles from one of the virus hotspots
    chicago['flag_hotspot'+digit]=chicago['to_hotspot'+digit].apply(lambda row: row<2).astype(int)
    
# Summarize those indicators.
hot_list=['flag_hotspot0',
          'flag_hotspot1',
          'flag_hotspot2',
          'flag_hotspot3',
          'flag_hotspot4',
          'flag_hotspot6',
          'flag_hotspot6',
          'flag_hotspot7',
          'flag_hotspot8',
          'flag_hotspot9',
          'flag_hotspot10',
          'flag_hotspot11',
          'flag_hotspot12',
          'flag_hotspot13',
          'flag_hotspot14',
          'flag_hotspot16',
          'flag_hotspot16',
          'flag_hotspot17',
          'flag_hotspot18',
          'flag_hotspot19',
          'flag_hotspot20',
          'flag_hotspot21',
          'flag_hotspot22',
          'flag_hotspot23',
          'flag_hotspot24',
          'flag_hotspot26',
          'flag_hotspot26',
          'flag_hotspot27',
          'flag_hotspot28',
          'flag_hotspot29',
         ]
chicago['near_hotspot'] = chicago[hot_list].sum(axis=1)
chicago['near_hotspot'].describe()

count    116293.000000
mean          1.661158
std           1.851068
min           0.000000
25%           0.000000
50%           1.000000
75%           3.000000
max           8.000000
Name: near_hotspot, dtype: float64

In [17]:
# Are there some lat-longs with higher mosquito counts? Yes.
with open('top_chomps.pickle', 'rb') as f:
    top_chomps = pickle.load(f)

In [18]:
# Do the same thing for its distance from the 20 spots with highest average mosquito count.
for x in range(30):
    digit=str(x)
    chicago['to_chompspot'+digit]=chicago['lat_long'].apply(lambda row: vincenty(row, top_chomps[x]).miles)   

    # Create an indicator if a row is less than 2 miles from one of the high-mosquito-count chompspots
    chicago['flag_chompspot'+digit]=chicago['to_chompspot'+digit].apply(lambda row: row<2).astype(int)
    
# Summarize those 10 indicators.
chomp_list=['flag_chompspot0',
          'flag_chompspot1',
          'flag_chompspot2',
          'flag_chompspot3',
          'flag_chompspot4',
          'flag_chompspot6',
          'flag_chompspot6',
          'flag_chompspot7',
          'flag_chompspot8',
          'flag_chompspot9',
          'flag_chompspot10',
          'flag_chompspot11',
          'flag_chompspot12',
          'flag_chompspot13',
          'flag_chompspot14',
          'flag_chompspot16',
          'flag_chompspot16',
          'flag_chompspot17',
          'flag_chompspot18',
          'flag_chompspot19',
          'flag_chompspot20',
          'flag_chompspot21',
          'flag_chompspot22',
          'flag_chompspot23',
          'flag_chompspot24',
          'flag_chompspot26',
          'flag_chompspot26',
          'flag_chompspot27',
          'flag_chompspot28',
          'flag_chompspot29',
         ]
chicago['near_chompspot'] = chicago[chomp_list].sum(axis=1)
chicago['near_chompspot'].describe()

count    116293.000000
mean          1.192221
std           1.321861
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max           6.000000
Name: near_chompspot, dtype: float64

In [19]:
# Create a list of (nearly) all the variables we just created 
drop_list=[]
for x in range(30):
    digit=str(x)
    drop_list.append('to_hotspot'+digit)
    drop_list.append('flag_hotspot'+digit)
    drop_list.append('to_chompspot'+digit)
    drop_list.append('flag_chompspot'+digit)
# Drop them becuase they are no longer necessary.
chicago=chicago.drop(hot_list + chomp_list + drop_list, axis=1)

# Make sure we have the same variables as `pickle`

In [20]:
chicago=chicago.drop(['Street', 'Block', 'Trap', 'Address', 'AddressAccuracy',
                      'AddressNumberAndStreet', 'Date', 'Species', 'CULEX',
                      'Latitude', 'Longitude', 'Month', 'Year', 'lat_long'], axis=1)
# We'll drop Id a little further down

In [21]:
testing=list(chicago.columns)
testing.sort()
print(len(testing))
testing

12


['ALTER',
 'DayRunner',
 'Id',
 'PIPIENS',
 'PIPIENS-RESTUANS',
 'RESTUANS',
 'avg_dry1',
 'avg_dry2',
 'avg_rain1',
 'avg_rain2',
 'near_chompspot',
 'near_hotspot']

In [22]:
features=['DayRunner', 'ALTER', 'PIPIENS', 'PIPIENS-RESTUANS', 'RESTUANS',
       'near_hotspot', 'near_chompspot', 'avg_rain1', 'avg_rain2', 'avg_dry1',
       'avg_dry2']

In [23]:
compare_lists1 = np.setdiff1d(testing, features)
print(compare_lists1)
compare_lists2 = np.setdiff1d(features, testing)
print(compare_lists2)

['Id']
[]


In [33]:
chicago.head()

Unnamed: 0,Id,avg_rain1,avg_rain2,avg_dry1,avg_dry2,ALTER,PIPIENS,PIPIENS-RESTUANS,RESTUANS,DayRunner,near_hotspot,near_chompspot
0,1,1.0,0.571429,0.142857,0.714286,0,0,1,0,163,5,3
1,2,1.0,0.571429,0.142857,0.714286,0,0,0,1,163,5,3
2,3,1.0,0.571429,0.142857,0.714286,0,1,0,0,163,5,3
3,4,1.0,0.571429,0.142857,0.714286,1,0,0,0,163,5,3
4,5,1.0,0.571429,0.142857,0.714286,1,0,0,0,163,5,3


## Export to CSV

## Import the pickle and `predict`!

In [24]:
import pickle
with open('net_model.pickle', 'rb') as f:
    net_model = pickle.load(f)

In [25]:
# Do we have the right number of rows?
print(len(chicago))
print(116293-len(chicago))

116293
0


In [26]:
net_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [27]:
chicago.columns

Index(['Id', 'avg_rain1', 'avg_rain2', 'avg_dry1', 'avg_dry2', 'ALTER',
       'PIPIENS', 'PIPIENS-RESTUANS', 'RESTUANS', 'DayRunner', 'near_hotspot',
       'near_chompspot'],
      dtype='object')

In [28]:
# Predict the y values on the testing data.
y_hat = net_model.predict(chicago.drop(['Id'], axis=1))
print(len(y_hat))

116293


## Save the output for Kaggle submisison

In [29]:
print(sample_sub.columns)
sample_sub.head(3)

Index(['Id', 'WnvPresent'], dtype='object')


Unnamed: 0,Id,WnvPresent
0,1,0
1,2,0
2,3,0


In [34]:
submission=pd.DataFrame(list(zip(chicago['Id'],y_hat)), columns=['Id','WnvPresent'])
print(len(submission))
submission.head(3)

116293


Unnamed: 0,Id,WnvPresent
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0


In [31]:
dirname = '../Austin/'
submission.to_csv(dirname + 'submission.csv', index=False)

In [32]:
# https://www.kaggle.com/c/predict-west-nile-virus/submit