In [1]:
# Competition slug; it is also the name of the data sub-folder joined onto INPUT_DIR to build `path`.
COMP_NAME = 'predict-west-nile-virus'

In [2]:
from fastai.tabular.all import *

In [3]:
# Directory layout, anchored at the directory the notebook is run from.
# The paths are absolute but not normalised: the '..' segment is kept as-is.
WORKING_DIR = Path.cwd()
INPUT_DIR = WORKING_DIR / ".." / "input"
ARCHIVE_DIR = WORKING_DIR / ".." / "archive"
MODELS_DIR = WORKING_DIR / ".." / "models"

In [4]:
# Root folder for generated artefacts; the submission CSV is written beneath it.
DATA_DIR = Path.cwd() / ".." / "data"
submission_path = DATA_DIR

In [5]:
# Folder holding this competition's files. It is also registered as
# Path.BASE_PATH; the listing below then prints names relative to it.
path = INPUT_DIR / COMP_NAME
Path.BASE_PATH = path
path.ls()

(#18) [Path('mapdata_copyright_openstreetmap_contributors.rds'),Path('spray.csv'),Path('west_nile.zip'),Path('.DS_Store'),Path('beat_the_benchmark.csv'),Path('sampleSubmission.csv'),Path('test.csv'),Path('west_nile'),Path('mapdata_copyright_openstreetmap_contributors.txt.zip'),Path('sampleSubmission.csv.zip')...]

In [6]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

In [7]:
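# Load dataset (earlier hard-coded '../input' paths, kept for reference; the next cell reads the same four files from `path`)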
# train = pd.read_csv('../input/train.csv')
# test = pd.read_csv('../input/test.csv')
# sample = pd.read_csv('../input/sampleSubmission.csv')
# weather = pd.read_csv('../input/weather.csv')

In [8]:
# Read the four competition tables from the competition folder, in a fixed order.
train, test, sample, weather = (
    pd.read_csv(path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sampleSubmission.csv', 'weather.csv')
)

In [9]:
# Target vector: pull WnvPresent out of train now, because the column is
# dropped from the feature frame further down.
labels = train['WnvPresent'].values

In [12]:
# Spot-check ten consecutive target values (positions 100-109).
labels[100:110]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [10]:
# The CodeSum column is left out of this benchmark's feature set.
weather = weather.drop(columns=['CodeSum'])

In [13]:
# Columns of the long-format weather table (one row per Station and Date).
weather.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'Depth', 'Water1',
       'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed'],
      dtype='object')

In [14]:
# Preview the raw weather rows; note the 'M' and '-' placeholder entries, which are replaced with -1 further down.
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,Sunset,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,1849,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,-,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,1850,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,-,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,1851,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [15]:
# Turn the long weather table (one row per station per date) into a wide one
# (station 1 and station 2 readings side by side, joined on Date).
is_station_1 = weather['Station'] == 1
is_station_2 = weather['Station'] == 2
weather_stn1 = weather.loc[is_station_1].drop(columns='Station')
weather_stn2 = weather.loc[is_station_2].drop(columns='Station')
# Overlapping column names receive pandas' default suffixes:
# _x for the left frame (station 1) and _y for the right frame (station 2).
weather = weather_stn1.merge(weather_stn2, on='Date')

In [16]:
# After the station merge: _x columns come from station 1, _y columns from station 2.
weather.columns

Index(['Date', 'Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x',
       'WetBulb_x', 'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x', 'Depth_x',
       'Water1_x', 'SnowFall_x', 'PrecipTotal_x', 'StnPressure_x',
       'SeaLevel_x', 'ResultSpeed_x', 'ResultDir_x', 'AvgSpeed_x', 'Tmax_y',
       'Tmin_y', 'Tavg_y', 'Depart_y', 'DewPoint_y', 'WetBulb_y', 'Heat_y',
       'Cool_y', 'Sunrise_y', 'Sunset_y', 'Depth_y', 'Water1_y', 'SnowFall_y',
       'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y', 'ResultSpeed_y',
       'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [17]:
# Preview the widened weather table (station 1 and station 2 readings side by side).
weather.head()

Unnamed: 0,Date,Tmax_x,Tmin_x,Tavg_x,Depart_x,DewPoint_x,WetBulb_x,Heat_x,Cool_x,Sunrise_x,...,Sunset_y,Depth_y,Water1_y,SnowFall_y,PrecipTotal_y,StnPressure_y,SeaLevel_y,ResultSpeed_y,ResultDir_y,AvgSpeed_y
0,2007-05-01,83,50,67,14,51,56,0,2,448,...,-,M,M,M,0.00,29.18,29.82,2.7,25,9.6
1,2007-05-02,59,42,51,-3,42,47,14,0,447,...,-,M,M,M,0.00,29.44,30.08,13.3,2,13.4
2,2007-05-03,66,46,56,2,40,48,9,0,446,...,-,M,M,M,0.00,29.46,30.12,12.9,6,13.2
3,2007-05-04,66,49,58,4,41,50,7,0,444,...,-,M,M,M,0.00,29.36,30.04,10.1,7,10.4
4,2007-05-05,66,53,60,5,38,49,5,0,443,...,-,M,M,M,T,29.46,30.09,11.2,7,11.5


In [18]:
# Map the weather file's textual placeholders to the numeric sentinel -1:
# 'M' and '-' (missing) plus the three spellings of 'T' seen in the data.
# The replacements run one after another, in this order.
for placeholder in ('M', '-', 'T', ' T', '  T'):
    weather = weather.replace(placeholder, -1)

In [19]:
# Functions to extract month and day from dataset
# You can also use parse_dates of Pandas.
def create_month(x):
    """Return the month field of a 'YYYY-MM-DD' style date string.

    The result is the second '-'-separated piece, returned as a string
    (leading zero preserved, e.g. '05').
    """
    pieces = x.split('-')
    month = pieces[1]
    return month

def create_day(x):
    """Return the day field of a 'YYYY-MM-DD' style date string.

    The result is the third '-'-separated piece, returned as a string
    (leading zero preserved, e.g. '01').
    """
    pieces = x.split('-')
    day = pieces[2]
    return day

In [20]:
# Derive month and day columns from the Date string, identically for both frames.
for frame in (train, test):
    frame['month'] = frame.Date.apply(create_month)
    frame['day'] = frame.Date.apply(create_day)

In [21]:
# Add coarse location features: latitude/longitude truncated to whole
# degrees by int() (truncation is toward zero, so -87.8 becomes -87).
for frame in (train, test):
    frame['Lat_int'] = frame.Latitude.apply(int)
    frame['Long_int'] = frame.Longitude.apply(int)

In [22]:
# Check that month, day, Lat_int and Long_int were appended to train.
train.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'month', 'day', 'Lat_int', 'Long_int'],
      dtype='object')

In [23]:
# Check that month, day, Lat_int and Long_int were appended to test.
test.columns

Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'month', 'day', 'Lat_int', 'Long_int'],
      dtype='object')

In [24]:
# Preview train with the derived date and integer-coordinate columns.
train.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,month,day,Lat_int,Long_int
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634, USA",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,5,29,41,-87
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634, USA",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,5,29,41,-87
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,5,29,41,-87
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,5,29,41,-87
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,5,29,41,-87


In [25]:
# Drop the free-text address columns from both frames. train additionally
# loses WnvPresent and NumMosquitos, and test loses its Id column.
train_drop_cols = ['Address', 'AddressNumberAndStreet', 'WnvPresent', 'NumMosquitos']
test_drop_cols = ['Id', 'Address', 'AddressNumberAndStreet']
train = train.drop(columns=train_drop_cols)
test = test.drop(columns=test_drop_cols)

In [26]:
# Confirm the address, WnvPresent and NumMosquitos columns are gone from train.
train.columns

Index(['Date', 'Species', 'Block', 'Street', 'Trap', 'Latitude', 'Longitude',
       'AddressAccuracy', 'month', 'day', 'Lat_int', 'Long_int'],
      dtype='object')

In [27]:
# Confirm the Id and address columns are gone from test.
test.columns

Index(['Date', 'Species', 'Block', 'Street', 'Trap', 'Latitude', 'Longitude',
       'AddressAccuracy', 'month', 'day', 'Lat_int', 'Long_int'],
      dtype='object')

In [28]:
# Preview the slimmed-down train frame.
train.head()

Unnamed: 0,Date,Species,Block,Street,Trap,Latitude,Longitude,AddressAccuracy,month,day,Lat_int,Long_int
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,9,5,29,41,-87
1,2007-05-29,CULEX RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,9,5,29,41,-87
2,2007-05-29,CULEX RESTUANS,62,N MANDELL AVE,T007,41.994991,-87.769279,9,5,29,41,-87
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,8,5,29,41,-87
4,2007-05-29,CULEX RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,8,5,29,41,-87


In [29]:
# Merge with weather data.
# `labels` was extracted from train before this merge, and predictions are later
# written row-for-row into the sample submission, so the merge must neither
# drop, duplicate nor reorder rows. A left join keeps every trap record in its
# original order, and validate='many_to_one' raises if `weather` ever holds more
# than one row per Date. A Date with no weather match now surfaces as NaN
# weather values instead of a silently missing row.
train = train.merge(weather, on='Date', how='left', validate='many_to_one')
test = test.merge(weather, on='Date', how='left', validate='many_to_one')
# Date has served its purpose as the join key; month/day were derived earlier.
train = train.drop(['Date'], axis = 1)
test = test.drop(['Date'], axis = 1)

In [30]:
# train after the weather merge: Date is gone and the _x/_y weather columns are attached.
train.columns

Index(['Species', 'Block', 'Street', 'Trap', 'Latitude', 'Longitude',
       'AddressAccuracy', 'month', 'day', 'Lat_int', 'Long_int', 'Tmax_x',
       'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x', 'Heat_x',
       'Cool_x', 'Sunrise_x', 'Sunset_x', 'Depth_x', 'Water1_x', 'SnowFall_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'Depth_y', 'Water1_y', 'SnowFall_y', 'PrecipTotal_y', 'StnPressure_y',
       'SeaLevel_y', 'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [31]:
# test after the weather merge: Date is gone and the _x/_y weather columns are attached.
test.columns

Index(['Species', 'Block', 'Street', 'Trap', 'Latitude', 'Longitude',
       'AddressAccuracy', 'month', 'day', 'Lat_int', 'Long_int', 'Tmax_x',
       'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x', 'Heat_x',
       'Cool_x', 'Sunrise_x', 'Sunset_x', 'Depth_x', 'Water1_x', 'SnowFall_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'Depth_y', 'Water1_y', 'SnowFall_y', 'PrecipTotal_y', 'StnPressure_y',
       'SeaLevel_y', 'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [32]:
# Convert categorical data to numbers.
# The encoder is fitted on the train and test values together, so every
# species seen in either frame has an integer code.
lbl = preprocessing.LabelEncoder()
species_vocab = train['Species'].tolist() + test['Species'].tolist()
lbl.fit(species_vocab)
for frame in (train, test):
    frame['Species'] = lbl.transform(frame['Species'].values)

In [33]:
# Column check on train after encoding Species.
train.columns

Index(['Species', 'Block', 'Street', 'Trap', 'Latitude', 'Longitude',
       'AddressAccuracy', 'month', 'day', 'Lat_int', 'Long_int', 'Tmax_x',
       'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x', 'Heat_x',
       'Cool_x', 'Sunrise_x', 'Sunset_x', 'Depth_x', 'Water1_x', 'SnowFall_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'Depth_y', 'Water1_y', 'SnowFall_y', 'PrecipTotal_y', 'StnPressure_y',
       'SeaLevel_y', 'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [34]:
# Column check on test after encoding Species.
test.columns

Index(['Species', 'Block', 'Street', 'Trap', 'Latitude', 'Longitude',
       'AddressAccuracy', 'month', 'day', 'Lat_int', 'Long_int', 'Tmax_x',
       'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x', 'Heat_x',
       'Cool_x', 'Sunrise_x', 'Sunset_x', 'Depth_x', 'Water1_x', 'SnowFall_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'Depth_y', 'Water1_y', 'SnowFall_y', 'PrecipTotal_y', 'StnPressure_y',
       'SeaLevel_y', 'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [35]:
# Preview train: Species now holds integer codes, while Street and Trap are still strings.
train.head()

Unnamed: 0,Species,Block,Street,Trap,Latitude,Longitude,AddressAccuracy,month,day,Lat_int,...,Sunset_y,Depth_y,Water1_y,SnowFall_y,PrecipTotal_y,StnPressure_y,SeaLevel_y,ResultSpeed_y,ResultDir_y,AvgSpeed_y
0,2,41,N OAK PARK AVE,T002,41.95469,-87.800991,9,5,29,41,...,-1,-1,-1,-1,0.0,29.44,30.09,5.8,16,7.4
1,3,41,N OAK PARK AVE,T002,41.95469,-87.800991,9,5,29,41,...,-1,-1,-1,-1,0.0,29.44,30.09,5.8,16,7.4
2,3,62,N MANDELL AVE,T007,41.994991,-87.769279,9,5,29,41,...,-1,-1,-1,-1,0.0,29.44,30.09,5.8,16,7.4
3,2,79,W FOSTER AVE,T015,41.974089,-87.824812,8,5,29,41,...,-1,-1,-1,-1,0.0,29.44,30.09,5.8,16,7.4
4,3,79,W FOSTER AVE,T015,41.974089,-87.824812,8,5,29,41,...,-1,-1,-1,-1,0.0,29.44,30.09,5.8,16,7.4


In [38]:
# Species vocabulary held by the encoder; a value's position in this array is its integer code.
lbl.classes_

array(['CULEX ERRATICUS', 'CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS',
       'CULEX RESTUANS', 'CULEX SALINARIUS', 'CULEX TARSALIS',
       'CULEX TERRITANS', 'UNSPECIFIED CULEX'], dtype='<U22')

In [39]:
# Encode Street the same way. Refitting `lbl` replaces the vocabulary it
# held before with the combined train + test street names.
street_vocab = train['Street'].tolist() + test['Street'].tolist()
lbl.fit(street_vocab)
for frame in (train, test):
    frame['Street'] = lbl.transform(frame['Street'].values)

In [40]:
# Street vocabulary now held by the encoder (note that some names carry leading spaces).
lbl.classes_

array(['  W ARMITAGE AVENUE', ' E 105TH ST', ' E 111TH ST', ' E 115TH ST',
       ' E 118TH ST', ' E 130TH ST', ' E 136TH ST', ' E 138TH ST',
       ' E 67TH ST', ' E 91ST PL', ' E 91ST ST', ' E RANDOLPH ST',
       ' N ASHLAND AVE', ' N ASHLAND AVE OVERPASS', ' N AUSTIN AVE',
       ' N AVONDALE AVE', ' N CALIFORNIA AVE', ' N CAMPBELL AVE',
       ' N CANNON DR', ' N CENTRAL PARK DR', ' N FRANCISCO AVE',
       ' N HARLEM AVE', ' N HUMBOLDT DR', ' N KEDVALE AVE',
       ' N KENNETH AVE', ' N KILBOURN AVE', ' N LARAMIE AVE',
       ' N LAWLER AVE', ' N LEMONT AVE', ' N LONG AVE', ' N MANDELL AVE',
       ' N MCCLELLAN AVE', ' N MELVINA AVE', ' N MILWAUKEE AVE',
       ' N MONT CLARE AVE', ' N MOSELL AVE', ' N OAK PARK AVE',
       ' N OAKLEY AVE', ' N OKETO AVE', ' N PITTSBURGH AVE',
       ' N PULASKI RD', ' N RICHMOND ST', ' N RIDGE AVE',
       ' N RUTHERFORD AVE', ' N SPRINGFIELD AVE', ' N STAVE ST',
       ' N STREETER DR', ' N TRIPP AVE', ' N WESTERN AVE',
       ' S ARTESIAN AVE

In [41]:
# Encode Trap the same way. Refitting `lbl` replaces the vocabulary it
# held before with the combined train + test trap identifiers.
trap_vocab = train['Trap'].tolist() + test['Trap'].tolist()
lbl.fit(trap_vocab)
for frame in (train, test):
    frame['Trap'] = lbl.transform(frame['Trap'].values)

In [42]:
# Trap vocabulary now held by the encoder.
lbl.classes_

array(['T001', 'T002', 'T002A', 'T002B', 'T003', 'T004', 'T005', 'T006',
       'T007', 'T008', 'T009', 'T011', 'T012', 'T013', 'T014', 'T015',
       'T016', 'T017', 'T018', 'T019', 'T025', 'T027', 'T028', 'T030',
       'T031', 'T033', 'T034', 'T035', 'T036', 'T037', 'T039', 'T040',
       'T043', 'T044', 'T045', 'T046', 'T047', 'T048', 'T049', 'T050',
       'T051', 'T054', 'T054C', 'T060', 'T061', 'T062', 'T063', 'T065',
       'T065A', 'T066', 'T067', 'T069', 'T070', 'T071', 'T072', 'T073',
       'T074', 'T075', 'T076', 'T077', 'T078', 'T079', 'T080', 'T081',
       'T082', 'T083', 'T084', 'T085', 'T086', 'T088', 'T089', 'T090',
       'T090A', 'T090B', 'T090C', 'T091', 'T092', 'T094', 'T094B', 'T095',
       'T096', 'T097', 'T099', 'T100', 'T102', 'T103', 'T107', 'T114',
       'T115', 'T128', 'T128A', 'T129', 'T135', 'T138', 'T141', 'T142',
       'T143', 'T144', 'T145', 'T146', 'T147', 'T148', 'T149', 'T150',
       'T151', 'T152', 'T153', 'T154', 'T155', 'T156', 'T157', 'T158

In [46]:
# List the train columns that contain at least one -1 sentinel.
# NOTE: .count() reports the number of non-null rows per column, not how many
# entries equal -1, so every listed column shows the full row count.
train.loc[:,(train == -1).any(axis=0)].count()

WetBulb_x        10506
Water1_x         10506
SnowFall_x       10506
PrecipTotal_x    10506
StnPressure_x    10506
Depart_y         10506
Sunrise_y        10506
Sunset_y         10506
Depth_y          10506
Water1_y         10506
SnowFall_y       10506
PrecipTotal_y    10506
dtype: int64

In [44]:
# Non-null row count per column of train.
train.count()

Species            10506
Block              10506
Street             10506
Trap               10506
Latitude           10506
Longitude          10506
AddressAccuracy    10506
month              10506
day                10506
Lat_int            10506
Long_int           10506
Tmax_x             10506
Tmin_x             10506
Tavg_x             10506
Depart_x           10506
DewPoint_x         10506
WetBulb_x          10506
Heat_x             10506
Cool_x             10506
Sunrise_x          10506
Sunset_x           10506
Depth_x            10506
Water1_x           10506
SnowFall_x         10506
PrecipTotal_x      10506
StnPressure_x      10506
SeaLevel_x         10506
ResultSpeed_x      10506
ResultDir_x        10506
AvgSpeed_x         10506
Tmax_y             10506
Tmin_y             10506
Tavg_y             10506
Depart_y           10506
DewPoint_y         10506
WetBulb_y          10506
Heat_y             10506
Cool_y             10506
Sunrise_y          10506
Sunset_y           10506


In [None]:
# Drop uninformative columns: a column is kept only if it has at least one
# value other than the -1 sentinel (columns that merely contain some -1s stay).
# The mask is computed once, on train, and applied to both frames, so the
# model is fitted and scored on exactly the same features in the same order.
# Computing it separately per frame could leave train and test with
# different column sets.
informative_cols = train.columns[(train != -1).any(axis=0).values]
train = train.loc[:, informative_cols]
test = test.loc[:, informative_cols]

In [None]:
# Random Forest Classifier
# random_state is pinned so that a Restart & Run All rebuilds the same forest
# and therefore the same submission; without it every run differs.
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_split=2, random_state=42)
clf.fit(train, labels)

In [None]:
# create predictions and submission file
# Column 1 of predict_proba is the probability of the second entry of
# clf.classes_ (presumably the WnvPresent == 1 class; verify clf.classes_).
predictions = clf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions
# Create the output folder with pathlib rather than shelling out to `mkdir -p`:
# this works on any OS and with paths that contain spaces.
submission_dir = submission_path/'abishek-beat-the-benchmark'
submission_dir.mkdir(parents=True, exist_ok=True)
sample.to_csv(submission_dir/'submission1.csv', index=False)

In [None]:
# Preview the submission frame, including the WnvPresent column.
sample.head()