In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, ensemble
from IPython.display import display

### Load Data

In [2]:
# create date parser
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')

# create data type converters
dtype_map_weather = dict(Station = 'str')
dtype_map_test_train = dict(Block = 'str')

# read data into PANDAS DataFrames with date parsing
test = pd.read_csv('input/test.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train)
train = pd.read_csv('input/train.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train)
weather = pd.read_csv('input/weather.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_weather)
sample_sub = pd.read_csv('input/sampleSubmission.csv')

In [3]:
print('Train')
display(train.info())

print('Test')
display(test.info())

Train
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
Date                      10506 non-null datetime64[ns]
Address                   10506 non-null object
Species                   10506 non-null object
Block                     10506 non-null object
Street                    10506 non-null object
Trap                      10506 non-null object
AddressNumberAndStreet    10506 non-null object
Latitude                  10506 non-null float64
Longitude                 10506 non-null float64
AddressAccuracy           10506 non-null int64
NumMosquitos              10506 non-null int64
WnvPresent                10506 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(3), object(6)
memory usage: 738.7+ KB


None

Test
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116293 entries, 0 to 116292
Data columns (total 11 columns):
Id                        116293 non-null int64
Date                      116293 non-null datetime64[ns]
Address                   116293 non-null object
Species                   116293 non-null object
Block                     116293 non-null object
Street                    116293 non-null object
Trap                      116293 non-null object
AddressNumberAndStreet    116293 non-null object
Latitude                  116293 non-null float64
Longitude                 116293 non-null float64
AddressAccuracy           116293 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(2), object(6)
memory usage: 7.1+ MB


None

In [4]:
print('Weather')
display(weather.info())

Weather
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
Station        2944 non-null object
Date           2944 non-null datetime64[ns]
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null object
Depart         2944 non-null object
DewPoint       2944 non-null int64
WetBulb        2944 non-null object
Heat           2944 non-null object
Cool           2944 non-null object
Sunrise        2944 non-null object
Sunset         2944 non-null object
CodeSum        2944 non-null object
Depth          2944 non-null object
Water1         2944 non-null object
SnowFall       2944 non-null object
PrecipTotal    2944 non-null object
StnPressure    2944 non-null object
SeaLevel       2944 non-null object
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null object
dtypes: datetime64[ns](1), float64(1), int64(4), object(16)
memory usage: 322.0+ KB


None

### Select Columns

In [5]:
# weather
weather_exclude = ['Dewpoint', 'WetBulb', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'StnPressure',
                 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed','DewPoint']
weather_cols = [col for col in weather.columns if col not in weather_exclude]
weather = weather[weather_cols]


# train
train_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'NumMosquitos']
train_cols = [col for col in train.columns if col not in train_exclude]
train = train[train_cols]

# test
test_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'Id']
test_cols = [col for col in test.columns if col not in test_exclude]
test = test[test_cols]

In [6]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 11 columns):
Station        2944 non-null object
Date           2944 non-null datetime64[ns]
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null object
Depart         2944 non-null object
Heat           2944 non-null object
Cool           2944 non-null object
Sunrise        2944 non-null object
Sunset         2944 non-null object
PrecipTotal    2944 non-null object
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 161.0+ KB


In [7]:
print('Weather')
display(weather.head())

print('Train')
display(train.head())

Weather


Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
0,1,2007-05-01,83,50,67,14,0,2,0448,1849,0.0
1,2,2007-05-01,84,52,68,M,0,3,-,-,0.0
2,1,2007-05-02,59,42,51,-3,14,0,0447,1850,0.0
3,2,2007-05-02,60,43,52,M,13,0,-,-,0.0
4,1,2007-05-03,66,46,56,2,9,0,0446,1851,0.0


Train


Unnamed: 0,Date,Species,Block,Street,Trap,Latitude,Longitude,WnvPresent
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,0
1,2007-05-29,CULEX RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,0
2,2007-05-29,CULEX RESTUANS,62,N MANDELL AVE,T007,41.994991,-87.769279,0
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,0
4,2007-05-29,CULEX RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,0


In [8]:
# what species have been detected (note that according to the CDC each
# of these species can carry WNV)
set(train.Species)

{'CULEX ERRATICUS',
 'CULEX PIPIENS',
 'CULEX PIPIENS/RESTUANS',
 'CULEX RESTUANS',
 'CULEX SALINARIUS',
 'CULEX TARSALIS',
 'CULEX TERRITANS'}

In [9]:
# does this correspond to the test set
set(test.Species)
# it looks like there is another category

{'CULEX ERRATICUS',
 'CULEX PIPIENS',
 'CULEX PIPIENS/RESTUANS',
 'CULEX RESTUANS',
 'CULEX SALINARIUS',
 'CULEX TARSALIS',
 'CULEX TERRITANS',
 'UNSPECIFIED CULEX'}

In [10]:
train.groupby('Species').sum().WnvPresent

Species
CULEX ERRATICUS             0
CULEX PIPIENS             240
CULEX PIPIENS/RESTUANS    262
CULEX RESTUANS             49
CULEX SALINARIUS            0
CULEX TARSALIS              0
CULEX TERRITANS             0
Name: WnvPresent, dtype: int64

### Examine and Handle missing Data

**What is 'T' and 'M'?**
- From http://www.nws.noaa.gov/om/csd/info/NOWdata/FAQ.php

> "M" stands for "Missing". Data for an element will be missing if the primary sensor for that weather element is inoperable (e.g., has an outage) or malfunctioning (e.g., producing errant data) AND any collocated backup sensor is also inoperable or malfunctioning. "T" stand for "Trace". This is a small amount of precipitation that will wet a raingage but is less than the 0.01 inch measuring limit.

The Precipitation Total column is the only column in the retained data that can contain this value. The value has leading whitespace so we should strip prior to counting.

In [11]:
# strip whitespace
weather.PrecipTotal = weather.PrecipTotal.str.strip()

In [12]:
miss_weather = ['M', '-']
trace_weather = ['T']

In [13]:
cols_not_date = [col for col in weather.columns if col != 'Date']

In [14]:
weather[cols_not_date].apply(pd.value_counts, axis=1)[miss_weather + trace_weather].fillna(0).sum()

M    1507.0
-    2944.0
T     318.0
dtype: float64

In [15]:
# Both stations
check = weather[cols_not_date].apply(pd.value_counts, axis=0).fillna(0)
check.loc[['M', '-', 'T']]

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,11.0,1472.0,11.0,11.0,0.0,0.0,2.0
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1472.0,1472.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,318.0


In [16]:
# Station 1
check_stat1 = weather[cols_not_date][weather.Station == '1'].apply(pd.value_counts, axis=0).fillna(0)
check_stat1.loc[['M', '-', 'T']]

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,,,,,,,,,,
-,,,,,,,,,,
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,163.0


In [17]:
# Station 2
check_stat2 = weather[cols_not_date][weather.Station == '2'].apply(pd.value_counts, axis=0).fillna(0)
check_stat2.loc[['M', '-', 'T']]

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,11.0,1472.0,11.0,11.0,0.0,0.0,2.0
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1472.0,1472.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,155.0


In [18]:
# Both stations
check.loc[['M', '-', 'T']]/(len(weather)) * 100

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,0.373641,50.0,0.373641,0.373641,0.0,0.0,0.067935
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.80163


In [19]:
# Station 1
check_stat1.loc[['M', '-', 'T']]/(len(weather)) * 100

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,,,,,,,,,,
-,,,,,,,,,,
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.536685


In [20]:
# Station 2()
check_stat2.loc[['M', '-', 'T']]/(len(weather)) * 100

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,0.373641,50.0,0.373641,0.373641,0.0,0.0,0.067935
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.264946


In [21]:
weather = weather.replace('M', np.NaN)
weather = weather.replace('-', np.NaN)
weather = weather.replace('T', 0.005) # very small amounts of rain can impact mosquito hatches
weather.Tmax = weather.Tmax.fillna(method = 'ffill')
weather.Tmin = weather.Tmin.fillna(method = 'ffill')
weather.Depart = weather.Depart.fillna(method = 'ffill')
weather.Heat = weather.Heat.fillna(method = 'ffill')
weather.Cool = weather.Cool.fillna(method = 'ffill')
weather.PrecipTotal = weather.PrecipTotal.fillna(method = 'ffill')

In [22]:
# convert datatpypes

to_numeric = ['Tmax','Tmin','Tavg', 'Depart', 'Heat', 'Cool', 'PrecipTotal']

for col in to_numeric:
    weather[col]= pd.to_numeric(weather[col])

In [23]:
weather.Sunrise = weather.Sunrise.fillna(method = 'ffill')
weather.Sunset = weather.Sunset.fillna(method = 'ffill')

In [24]:
# sunset has entries where instead of incrementing to the next hour after xx59 it incremented to xx60
# This causes an exception, let's take a look
counter = 0
tracker = []
for index, val in enumerate(weather.Sunset):
    try:
        pd.to_datetime(val, format = '%H%M').time()
    except:
        counter += 1
        tracker.append((index, val, val[2:], counter))

print(tracker[-1])

# there are 48 exceptions

(2925, '1660', '60', 48)


In [25]:
# let's deal with this by decrmenting by 1 for each invalid instance
weather.Sunset = weather.Sunset.replace('\+?60', '59', regex = True)

In [26]:
# time conversion lambda function
time_func = lambda x: pd.Timestamp(pd.to_datetime(x, format = '%H%M'))

In [27]:
weather.Sunrise = weather.Sunrise.apply(time_func)

In [28]:
weather.Sunset = weather.Sunset.apply(time_func)

In [29]:
# what is the range of values for sunrise and sunset
(weather.Sunset - weather.Sunrise).astype('timedelta64[h]').unique()

array([ 14.,  15.,  13.,  12.,  11.,  10.])

In [30]:
#create a DayLength column
weather['DayLength'] = (weather.Sunset - weather.Sunrise).astype('timedelta64[h]')

In [31]:
mean_func = lambda x: x.mean()

blend_cols = ['Tmax', 'Tmin', 'Depart' ,'Heat', 'Cool', 'PrecipTotal']

In [32]:
blended_cols= ['blended_' + col for col in blend_cols]

In [33]:
station_1 = weather[blend_cols][weather.Station == '1']
station_2 = weather[blend_cols][weather.Station == '2']

In [34]:
station_blend = pd.DataFrame((station_1.values + station_2.values)/2, columns= blended_cols)

In [35]:
extract_2 = weather[weather.Station == '2'].reset_index(drop = True)
extract_2.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal,DayLength
0,2,2007-05-01,84,52,68.0,14,0,3,1900-01-01 04:48:00,1900-01-01 18:49:00,0.0,14.0
1,2,2007-05-02,60,43,52.0,-3,13,0,1900-01-01 04:47:00,1900-01-01 18:50:00,0.0,14.0
2,2,2007-05-03,67,48,58.0,2,7,0,1900-01-01 04:46:00,1900-01-01 18:51:00,0.0,14.0
3,2,2007-05-04,78,51,,4,7,0,1900-01-01 04:44:00,1900-01-01 18:52:00,0.0,14.0
4,2,2007-05-05,66,54,60.0,5,5,0,1900-01-01 04:43:00,1900-01-01 18:53:00,0.005,14.0


In [36]:
extract_1 = weather[weather.Station == '1'].reset_index(drop = True)
extract_1.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal,DayLength
0,1,2007-05-01,83,50,67.0,14,0,2,1900-01-01 04:48:00,1900-01-01 18:49:00,0.0,14.0
1,1,2007-05-02,59,42,51.0,-3,14,0,1900-01-01 04:47:00,1900-01-01 18:50:00,0.0,14.0
2,1,2007-05-03,66,46,56.0,2,9,0,1900-01-01 04:46:00,1900-01-01 18:51:00,0.0,14.0
3,1,2007-05-04,66,49,58.0,4,7,0,1900-01-01 04:44:00,1900-01-01 18:52:00,0.005,14.0
4,1,2007-05-05,66,53,60.0,5,5,0,1900-01-01 04:43:00,1900-01-01 18:53:00,0.005,14.0


In [37]:
joined_1 = extract_1.join(station_blend)
joined_2 = extract_2.join(station_blend)

In [38]:
weather_blend = pd.concat([joined_1, joined_2])

In [39]:
weather_blend.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2944 entries, 0 to 1471
Data columns (total 18 columns):
Station                2944 non-null object
Date                   2944 non-null datetime64[ns]
Tmax                   2944 non-null int64
Tmin                   2944 non-null int64
Tavg                   2933 non-null float64
Depart                 2944 non-null int64
Heat                   2944 non-null int64
Cool                   2944 non-null int64
Sunrise                2944 non-null datetime64[ns]
Sunset                 2944 non-null datetime64[ns]
PrecipTotal            2944 non-null float64
DayLength              2944 non-null float64
blended_Tmax           2944 non-null float64
blended_Tmin           2944 non-null float64
blended_Depart         2944 non-null float64
blended_Heat           2944 non-null float64
blended_Cool           2944 non-null float64
blended_PrecipTotal    2944 non-null float64
dtypes: datetime64[ns](3), float64(9), int64(5), object(1)
memory usage: 

In [40]:
# Create dummies from the categorical species data and block data
train = pd.get_dummies(train, columns= ['Species'])
test =  pd.get_dummies(test, columns= ['Species'])

train = pd.get_dummies(train, columns= ['Block'])
test =  pd.get_dummies(test, columns= ['Block'])

### Create Month and Day columns

In [41]:
month_func = lambda x: x.month
day_func= lambda x: x.day
day_of_year_func = lambda x: x.dayofyear
week_of_year_func = lambda x: x.week

# train
train['month'] = train.Date.apply(month_func)
train['day'] = train.Date.apply(day_func)
train['day_of_year'] = train.Date.apply(day_of_year_func)
train['week'] = train.Date.apply(week_of_year_func)

# test
test['month'] = test.Date.apply(month_func)
test['day'] = test.Date.apply(day_func)
test['day_of_year'] = test.Date.apply(day_of_year_func)
test['week'] = test.Date.apply(week_of_year_func)

### Create integer latitude and longitude columns

In [42]:
train.describe()

Unnamed: 0,Latitude,Longitude,WnvPresent,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,...,Block_89,Block_90,Block_91,Block_93,Block_96,Block_98,month,day,day_of_year,week
count,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,...,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0
mean,41.841139,-87.699908,0.052446,9.5e-05,0.256901,0.452313,0.260803,0.008186,0.000571,0.021131,...,0.014944,0.007329,0.010565,0.001999,0.002951,0.002189,7.692557,15.341424,217.941938,31.687131
std,0.112742,0.096514,0.222936,0.009756,0.436945,0.497744,0.439094,0.090109,0.023892,0.143827,...,0.121334,0.0853,0.102249,0.044666,0.054243,0.04674,1.067675,8.901205,31.735797,4.526874
min,41.644612,-87.930995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,148.0,22.0
25%,41.732984,-87.76007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,196.0,28.0
50%,41.846283,-87.694991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,15.0,217.0,31.0
75%,41.95469,-87.627796,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,24.0,241.0,35.0
max,42.01743,-87.531635,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,10.0,31.0,282.0,41.0


In [43]:
test.describe()

Unnamed: 0,Latitude,Longitude,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,Species_UNSPECIFIED CULEX,...,Block_89,Block_90,Block_91,Block_93,Block_96,Block_98,month,day,day_of_year,week
count,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,...,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0
mean,41.849389,-87.693658,0.123352,0.124866,0.132072,0.126147,0.123438,0.123369,0.123404,0.123352,...,0.013156,0.006535,0.013139,0.006535,0.01307,0.006535,7.620244,16.187681,217.057304,31.417833
std,0.106593,0.080699,0.328842,0.330568,0.33857,0.332016,0.328941,0.328862,0.328902,0.328842,...,0.113945,0.080576,0.113871,0.080576,0.113577,0.080576,1.065842,8.800007,32.536523,4.660725
min,41.644612,-87.930995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,153.0,22.0
25%,41.753411,-87.750938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,10.0,193.0,28.0
50%,41.862292,-87.694991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,17.0,217.0,31.0
75%,41.951866,-87.64886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,24.0,244.0,35.0
max,42.01743,-87.531635,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,10.0,31.0,275.0,40.0


### Merge Data

In [44]:
train = train.merge(weather_blend, on='Date')
test = test.merge(weather_blend, on='Date')
train = train.drop(['Date'], axis = 1)
test = test.drop(['Date'], axis = 1)

### Inspect DFs

In [45]:
weather_blend.ix[:,:12].describe()

Unnamed: 0,Tmax,Tmin,Tavg,Depart,Heat,Cool,PrecipTotal,DayLength
count,2944.0,2944.0,2933.0,2944.0,2944.0,2944.0,2944.0,2944.0
mean,76.166101,57.810462,67.240027,1.954484,3.417799,5.641304,0.13178,13.217391
std,11.46197,10.381939,10.545442,6.839947,5.955153,6.104849,0.393031,1.573315
min,41.0,29.0,36.0,-17.0,0.0,0.0,0.0,10.0
25%,69.0,50.0,60.0,-3.0,0.0,0.0,0.0,12.0
50%,78.0,59.0,69.0,2.0,0.0,4.0,0.0,14.0
75%,85.0,66.0,75.0,7.0,5.0,10.0,0.06,14.0
max,104.0,83.0,94.0,23.0,29.0,29.0,6.86,15.0


In [46]:
weather_blend.ix[:,12:].describe()

Unnamed: 0,blended_Tmax,blended_Tmin,blended_Depart,blended_Heat,blended_Cool,blended_PrecipTotal
count,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0
mean,76.166101,57.810462,1.954484,3.417799,5.641304,0.13178
std,11.422764,10.26837,6.839947,5.932802,6.061981,0.35829
min,41.5,29.0,-17.0,0.0,0.0,0.0
25%,68.5,50.5,-3.0,0.0,0.0,0.0
50%,78.5,59.5,2.0,0.0,4.0,0.0025
75%,84.5,65.5,7.0,4.5,10.5,0.08
max,103.5,82.5,23.0,29.0,28.5,4.855


In [47]:
train.describe()

Unnamed: 0,Latitude,Longitude,WnvPresent,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,...,Heat,Cool,PrecipTotal,DayLength,blended_Tmax,blended_Tmin,blended_Depart,blended_Heat,blended_Cool,blended_PrecipTotal
count,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,...,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0
mean,41.841139,-87.699908,0.052446,9.5e-05,0.256901,0.452313,0.260803,0.008186,0.000571,0.021131,...,0.886208,8.908671,0.146082,13.561489,81.917095,63.647202,2.740244,0.886208,8.908671,0.146082
std,0.112739,0.096512,0.22293,0.009756,0.436935,0.497733,0.439083,0.090106,0.023891,0.143824,...,2.71223,5.755434,0.378659,1.133125,8.272521,7.486933,6.486313,2.697339,5.712594,0.323131
min,41.644612,-87.930995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,11.0,57.5,42.5,-12.0,0.0,0.0,0.0
25%,41.732984,-87.76007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,13.0,78.0,59.5,-2.0,0.0,5.0,0.0
50%,41.846283,-87.694991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,14.0,83.0,65.5,4.0,0.0,9.5,0.0
75%,41.95469,-87.627796,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,14.0,0.11,14.0,88.0,69.5,8.0,0.0,13.5,0.13
max,42.01743,-87.531635,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,15.0,22.0,3.97,15.0,96.5,77.5,18.0,14.0,20.5,2.42


In [48]:
test.columns

Index([u'Street', u'Trap', u'Latitude', u'Longitude',
       u'Species_CULEX ERRATICUS', u'Species_CULEX PIPIENS',
       u'Species_CULEX PIPIENS/RESTUANS', u'Species_CULEX RESTUANS',
       u'Species_CULEX SALINARIUS', u'Species_CULEX TARSALIS',
       u'Species_CULEX TERRITANS', u'Species_UNSPECIFIED CULEX', u'Block_10',
       u'Block_11', u'Block_12', u'Block_13', u'Block_14', u'Block_15',
       u'Block_17', u'Block_18', u'Block_20', u'Block_21', u'Block_22',
       u'Block_24', u'Block_25', u'Block_26', u'Block_27', u'Block_28',
       u'Block_29', u'Block_30', u'Block_33', u'Block_34', u'Block_35',
       u'Block_36', u'Block_37', u'Block_38', u'Block_39', u'Block_40',
       u'Block_41', u'Block_42', u'Block_43', u'Block_45', u'Block_46',
       u'Block_47', u'Block_48', u'Block_49', u'Block_50', u'Block_51',
       u'Block_52', u'Block_53', u'Block_55', u'Block_58', u'Block_60',
       u'Block_61', u'Block_62', u'Block_63', u'Block_64', u'Block_65',
       u'Block_66', u'Block

### Handle Weather Stations 1

In [None]:
# split the data into two dataframes by station

train_station_1= train[train.Station == '1']
train_station_2= train[train.Station == '2']

In [None]:
# export to JSON for external use
train_station_1.to_json('train_station_1.json')
train_station_2.to_json('train_station_2.json')
train.to_json('train.json')

# epxort to csv for external use
train_station_1.to_csv('train_station_1.csv')
train_station_2.to_csv('train_station_2.csv')
train.to_csv('train.csv')


In [None]:
ls