In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, ensemble
from IPython.display import display

### Load Data

In [2]:
# create date parser
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')

# create data type converter
dtype_map = dict(Latitude = 'int', Longitude = 'int')

# read data into PANDAS DataFrames with date parsing
test = pd.read_csv('input/test.csv', parse_dates=['Date'], date_parser=dateparse)
train = pd.read_csv('input/train.csv', parse_dates=['Date'], date_parser=dateparse)
weather = pd.read_csv('input/weather.csv', parse_dates=['Date'], date_parser=dateparse)
sample_sub = pd.read_csv('input/sampleSubmission.csv')

In [3]:
print('Train')
display(train.info())

print('Weather')
display(weather.info())

Train
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
Date                      10506 non-null datetime64[ns]
Address                   10506 non-null object
Species                   10506 non-null object
Block                     10506 non-null int64
Street                    10506 non-null object
Trap                      10506 non-null object
AddressNumberAndStreet    10506 non-null object
Latitude                  10506 non-null float64
Longitude                 10506 non-null float64
AddressAccuracy           10506 non-null int64
NumMosquitos              10506 non-null int64
WnvPresent                10506 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(4), object(5)
memory usage: 779.8+ KB


None

Weather
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
Station        2944 non-null int64
Date           2944 non-null datetime64[ns]
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null object
Depart         2944 non-null object
DewPoint       2944 non-null int64
WetBulb        2944 non-null object
Heat           2944 non-null object
Cool           2944 non-null object
Sunrise        2944 non-null object
Sunset         2944 non-null object
CodeSum        2944 non-null object
Depth          2944 non-null object
Water1         2944 non-null object
SnowFall       2944 non-null object
PrecipTotal    2944 non-null object
StnPressure    2944 non-null object
SeaLevel       2944 non-null object
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null object
dtypes: datetime64[ns](1), float64(1), int64(5), object(15)
memory usage: 333.5+ KB


None

### Select Columns

In [4]:
# weather
weather_exclude = ['Dewpoint', 'WetBulb', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'StnPressure',
                 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed','DewPoint']
weather_cols = [col for col in weather.columns if col not in weather_exclude]
weather = weather[weather_cols]


# train
train_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy']
train_cols = [col for col in train.columns if col not in train_exclude]
train = train[train_cols]

# test
test_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'ID']
test_cols = [col for col in test.columns if col not in test_exclude]
test = test[test_cols]

In [5]:
print('Weather')
display(weather.head())

print('Train')
display(train.head())

Weather


Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
0,1,2007-05-01,83,50,67,14,0,2,0448,1849,0.0
1,2,2007-05-01,84,52,68,M,0,3,-,-,0.0
2,1,2007-05-02,59,42,51,-3,14,0,0447,1850,0.0
3,2,2007-05-02,60,43,52,M,13,0,-,-,0.0
4,1,2007-05-03,66,46,56,2,9,0,0446,1851,0.0


Train


Unnamed: 0,Date,Species,Block,Street,Trap,Latitude,Longitude,NumMosquitos,WnvPresent
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,1,0
1,2007-05-29,CULEX RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,1,0
2,2007-05-29,CULEX RESTUANS,62,N MANDELL AVE,T007,41.994991,-87.769279,1,0
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,1,0
4,2007-05-29,CULEX RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,4,0


In [6]:
# what species have been detected (note that according to the CDC each
# of these species can carry WNV)
set(train.Species)

{'CULEX ERRATICUS',
 'CULEX PIPIENS',
 'CULEX PIPIENS/RESTUANS',
 'CULEX RESTUANS',
 'CULEX SALINARIUS',
 'CULEX TARSALIS',
 'CULEX TERRITANS'}

### Replace missing Data

In [7]:
weather = weather.replace('M', np.NaN)
weather = weather.replace('-', np.NaN)
weather = weather.replace('T', np.NaN)
weather = weather.replace(' T', np.NaN)
weather = weather.replace('  T', np.NaN)

In [8]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 11 columns):
Station        2944 non-null int64
Date           2944 non-null datetime64[ns]
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2933 non-null object
Depart         1472 non-null object
Heat           2933 non-null object
Cool           2933 non-null object
Sunrise        1472 non-null object
Sunset         1472 non-null object
PrecipTotal    2624 non-null object
dtypes: datetime64[ns](1), int64(3), object(7)
memory usage: 172.5+ KB


In [9]:
weather.describe()

Unnamed: 0,Station,Tmax,Tmin
count,2944.0,2944.0,2944.0
mean,1.5,76.166101,57.810462
std,0.500085,11.46197,10.381939
min,1.0,41.0,29.0
25%,1.0,69.0,50.0
50%,1.5,78.0,59.0
75%,2.0,85.0,66.0
max,2.0,104.0,83.0


### Create Month and Day columns

In [10]:
month_func = lambda x: x.month
day_func= lambda x: x.day

train['month'] = train.Date.apply(month_func)
train['day'] = train.Date.apply(day_func)
test['month'] = test.Date.apply(month_func)
test['day'] = test.Date.apply(day_func)

### Create integer latitude and longitude columns

### Merge Data

In [11]:
train = train.merge(weather, on='Date')
test = test.merge(weather, on='Date')
train = train.drop(['Date'], axis = 1)
test = test.drop(['Date'], axis = 1)

### Handle Weather Stations 1

In [12]:
# split the data into two dataframes by station

train_station_1= train[train.Station == 1]
train_station_2= train[train.Station == 2]

In [13]:
# export to JSON for external use
train_station_1.to_json('train_station_1.json')
train_station_2.to_json('train_station_2.json')
train.to_json('train.json')

# epxort to csv for external use
train_station_1.to_csv('train_station_1.csv')
train_station_2.to_csv('train_station_2.csv')
train.to_csv('train.csv')


In [15]:
ls

 Volume in drive C is Acer
 Volume Serial Number is 3829-CAE6

 Directory of C:\Users\vhim98198\Downloads\wnv_data\west_nile

12/29/2016  11:58 PM    <DIR>          .
12/29/2016  11:58 PM    <DIR>          ..
12/18/2016  01:17 PM            15,364 .DS_Store
12/26/2016  10:44 PM    <DIR>          .ipynb_checkpoints
12/27/2016  11:42 PM             1,513 .Rhistory
12/18/2016  01:17 PM    <DIR>          input
12/29/2016  11:43 PM             4,667 plotting_tools.R
12/18/2016  01:17 PM             2,569 README.md
12/18/2016  01:17 PM               296 render_rmarkdown.R
12/22/2016  11:15 AM             3,017 script.py
12/18/2016  01:17 PM    <DIR>          src
12/29/2016  11:58 PM         2,225,988 train.csv
12/29/2016  11:58 PM         5,726,659 train.json
12/29/2016  11:58 PM         1,165,202 train_station_1.csv
12/29/2016  11:58 PM         2,884,378 train_station_1.json
12/29/2016  11:58 PM         1,060,934 train_station_2.csv
12/29/2016  11:58 PM         2,842,508 train_station_2.jso