In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, ensemble
from IPython.display import display

### Load Data

In [2]:
# create date parser
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')

# create data type converters
dtype_map_weather = dict(Station = 'str')
dtype_map_test_train = dict(Block = 'str', Street = 'str')

# read data into PANDAS DataFrames with date parsing
test = pd.read_csv('input/test.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train)
train = pd.read_csv('input/train.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train)
weather = pd.read_csv('input/weather.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_weather)
sample_sub = pd.read_csv('input/sampleSubmission.csv')

In [3]:
print('Train')
display(train.info())

print('Test')
display(test.info())

Train
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
Date                      10506 non-null datetime64[ns]
Address                   10506 non-null object
Species                   10506 non-null object
Block                     10506 non-null object
Street                    10506 non-null object
Trap                      10506 non-null object
AddressNumberAndStreet    10506 non-null object
Latitude                  10506 non-null float64
Longitude                 10506 non-null float64
AddressAccuracy           10506 non-null int64
NumMosquitos              10506 non-null int64
WnvPresent                10506 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(3), object(6)
memory usage: 738.7+ KB


None

Test
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116293 entries, 0 to 116292
Data columns (total 11 columns):
Id                        116293 non-null int64
Date                      116293 non-null datetime64[ns]
Address                   116293 non-null object
Species                   116293 non-null object
Block                     116293 non-null object
Street                    116293 non-null object
Trap                      116293 non-null object
AddressNumberAndStreet    116293 non-null object
Latitude                  116293 non-null float64
Longitude                 116293 non-null float64
AddressAccuracy           116293 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(2), object(6)
memory usage: 7.1+ MB


None

In [4]:
print('Weather')
display(weather.info())

Weather
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
Station        2944 non-null object
Date           2944 non-null datetime64[ns]
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null object
Depart         2944 non-null object
DewPoint       2944 non-null int64
WetBulb        2944 non-null object
Heat           2944 non-null object
Cool           2944 non-null object
Sunrise        2944 non-null object
Sunset         2944 non-null object
CodeSum        2944 non-null object
Depth          2944 non-null object
Water1         2944 non-null object
SnowFall       2944 non-null object
PrecipTotal    2944 non-null object
StnPressure    2944 non-null object
SeaLevel       2944 non-null object
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null object
dtypes: datetime64[ns](1), float64(1), int64(4), object(16)
memory usage: 322.0+ KB


None

### Select Columns

In [5]:
# weather
weather_exclude = ['Dewpoint', 'WetBulb', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'StnPressure',
                 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed','DewPoint']
weather_cols = [col for col in weather.columns if col not in weather_exclude]
weather = weather[weather_cols]


# train
train_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'NumMosquitos']
train_cols = [col for col in train.columns if col not in train_exclude]
train = train[train_cols]

# test
test_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'Id']
test_cols = [col for col in test.columns if col not in test_exclude]
test = test[test_cols]

In [6]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 11 columns):
Station        2944 non-null object
Date           2944 non-null datetime64[ns]
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null object
Depart         2944 non-null object
Heat           2944 non-null object
Cool           2944 non-null object
Sunrise        2944 non-null object
Sunset         2944 non-null object
PrecipTotal    2944 non-null object
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 161.0+ KB


In [7]:
print('Weather')
display(weather.head())

print('Train')
display(train.head())

Weather


Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
0,1,2007-05-01,83,50,67,14,0,2,0448,1849,0.0
1,2,2007-05-01,84,52,68,M,0,3,-,-,0.0
2,1,2007-05-02,59,42,51,-3,14,0,0447,1850,0.0
3,2,2007-05-02,60,43,52,M,13,0,-,-,0.0
4,1,2007-05-03,66,46,56,2,9,0,0446,1851,0.0


Train


Unnamed: 0,Date,Species,Block,Street,Trap,Latitude,Longitude,WnvPresent
0,2007-05-29,CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,0
1,2007-05-29,CULEX RESTUANS,41,N OAK PARK AVE,T002,41.95469,-87.800991,0
2,2007-05-29,CULEX RESTUANS,62,N MANDELL AVE,T007,41.994991,-87.769279,0
3,2007-05-29,CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,0
4,2007-05-29,CULEX RESTUANS,79,W FOSTER AVE,T015,41.974089,-87.824812,0


In [8]:
# what species have been detected (note that according to the CDC each
# of these species can carry WNV)
set(train.Species)

{'CULEX ERRATICUS',
 'CULEX PIPIENS',
 'CULEX PIPIENS/RESTUANS',
 'CULEX RESTUANS',
 'CULEX SALINARIUS',
 'CULEX TARSALIS',
 'CULEX TERRITANS'}

In [9]:
# does this correspond to the test set
set(test.Species)
# it looks like there is another category

{'CULEX ERRATICUS',
 'CULEX PIPIENS',
 'CULEX PIPIENS/RESTUANS',
 'CULEX RESTUANS',
 'CULEX SALINARIUS',
 'CULEX TARSALIS',
 'CULEX TERRITANS',
 'UNSPECIFIED CULEX'}

In [10]:
train.groupby('Species').sum().WnvPresent

Species
CULEX ERRATICUS             0
CULEX PIPIENS             240
CULEX PIPIENS/RESTUANS    262
CULEX RESTUANS             49
CULEX SALINARIUS            0
CULEX TARSALIS              0
CULEX TERRITANS             0
Name: WnvPresent, dtype: int64

### Examine and Handle missing Data

**What is 'T' and 'M'?**
- From http://www.nws.noaa.gov/om/csd/info/NOWdata/FAQ.php

> "M" stands for "Missing". Data for an element will be missing if the primary sensor for that weather element is inoperable (e.g., has an outage) or malfunctioning (e.g., producing errant data) AND any collocated backup sensor is also inoperable or malfunctioning. "T" stand for "Trace". This is a small amount of precipitation that will wet a raingage but is less than the 0.01 inch measuring limit.

The Precipitation Total column is the only column in the retained data that can contain this value. The value has leading whitespace so we should strip prior to counting.

In [11]:
# strip whitespace
weather.PrecipTotal = weather.PrecipTotal.str.strip()

In [12]:
miss_weather = ['M', '-']
trace_weather = ['T']

In [13]:
cols_not_date = [col for col in weather.columns if col != 'Date']

In [14]:
weather[cols_not_date].apply(pd.value_counts, axis=1)[miss_weather + trace_weather].fillna(0).sum()

M    1507.0
-    2944.0
T     318.0
dtype: float64

In [15]:
# Both stations
check = weather[cols_not_date].apply(pd.value_counts, axis=0).fillna(0)
check.loc[['M', '-', 'T']]

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,11.0,1472.0,11.0,11.0,0.0,0.0,2.0
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1472.0,1472.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,318.0


In [16]:
# Station 1
check_stat1 = weather[cols_not_date][weather.Station == '1'].apply(pd.value_counts, axis=0).fillna(0)
check_stat1.loc[['M', '-', 'T']]

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,,,,,,,,,,
-,,,,,,,,,,
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,163.0


In [17]:
# Station 2
check_stat2 = weather[cols_not_date][weather.Station == '2'].apply(pd.value_counts, axis=0).fillna(0)
check_stat2.loc[['M', '-', 'T']]

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,11.0,1472.0,11.0,11.0,0.0,0.0,2.0
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1472.0,1472.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,155.0


In [18]:
# Both stations
check.loc[['M', '-', 'T']]/(len(weather)) * 100

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,0.373641,50.0,0.373641,0.373641,0.0,0.0,0.067935
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.80163


In [19]:
# Station 1
check_stat1.loc[['M', '-', 'T']]/(len(weather)) * 100

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,,,,,,,,,,
-,,,,,,,,,,
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.536685


In [20]:
# Station 2()
check_stat2.loc[['M', '-', 'T']]/(len(weather)) * 100

Unnamed: 0,Station,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal
M,0.0,0.0,0.0,0.373641,50.0,0.373641,0.373641,0.0,0.0,0.067935
-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,0.0
T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.264946


In [21]:
weather = weather.replace('M', np.NaN)
weather = weather.replace('-', np.NaN)
weather = weather.replace('T', 0.005) # very small amounts of rain can impact mosquito hatches
weather.Tmax = weather.Tmax.fillna(method = 'ffill')
weather.Tmin = weather.Tmin.fillna(method = 'ffill')
weather.Depart = weather.Depart.fillna(method = 'ffill')
weather.Heat = weather.Heat.fillna(method = 'ffill')
weather.Cool = weather.Cool.fillna(method = 'ffill')
weather.PrecipTotal = weather.PrecipTotal.fillna(method = 'ffill')

In [22]:
# convert datatpypes

to_numeric = ['Tmax','Tmin','Tavg', 'Depart', 'Heat', 'Cool', 'PrecipTotal']

for col in to_numeric:
    weather[col]= pd.to_numeric(weather[col])

In [23]:
weather.Sunrise = weather.Sunrise.fillna(method = 'ffill')
weather.Sunset = weather.Sunset.fillna(method = 'ffill')

In [24]:
# sunset has entries where instead of incrementing to the next hour after xx59 it incremented to xx60
# This causes an exception, let's take a look
counter = 0
tracker = []
for index, val in enumerate(weather.Sunset):
    try:
        pd.to_datetime(val, format = '%H%M').time()
    except:
        counter += 1
        tracker.append((index, val, val[2:], counter))

print(tracker[-1])

# there are 48 exceptions

(2925, '1660', '60', 48)


In [25]:
# let's deal with this by decrmenting by 1 for each invalid instance
weather.Sunset = weather.Sunset.replace('\+?60', '59', regex = True)

In [26]:
# time conversion lambda function
time_func = lambda x: pd.Timestamp(pd.to_datetime(x, format = '%H%M'))

In [27]:
weather.Sunrise = weather.Sunrise.apply(time_func)

In [28]:
weather.Sunset = weather.Sunset.apply(time_func)

In [29]:
# what is the range of values for sunrise and sunset (in hours)
minutes= (weather.Sunset - weather.Sunrise).astype('timedelta64[m]')

In [30]:
hours = minutes/60

In [31]:
set(np.round(hours.values))

{10.0, 11.0, 12.0, 13.0, 14.0, 15.0}

In [32]:
#create a DayLength column with minute level precsion
weather['DayLength_MPrec'] = (weather.Sunset - weather.Sunrise).astype('timedelta64[m]')/60

In [33]:
#create a DayLength column with rounded to the nearest hour
weather['DayLength_NearH'] = np.round(((weather.Sunset - weather.Sunrise).astype('timedelta64[m]')/60).values)

In [34]:
# length of night with minute level precision
weather['NightLength_MPrec']= 24.0 - weather.DayLength_MPrec

In [35]:
# lenght of night rounded to nearest hour
weather['NightLength_NearH']= 24.0 - weather.DayLength_NearH

In [36]:
# function to calculate sunset and sunrise times in hours
hours_RiseSet_func = lambda x: x.minute/60.0 + float(x.hour)

In [37]:
# sunrise in hours
weather['Sunrise_hours'] = weather.Sunrise.apply(hours_RiseSet_func)

In [38]:
# sunset in hours
weather['Sunset_hours'] = weather.Sunset.apply(hours_RiseSet_func)

In [39]:
mean_func = lambda x: x.mean()

blend_cols = ['Tmax', 'Tmin', 'Depart' ,'Heat', 'Cool', 'PrecipTotal']

In [40]:
blended_cols= ['blended_' + col for col in blend_cols]

In [41]:
station_1 = weather[blend_cols][weather.Station == '1']
station_2 = weather[blend_cols][weather.Station == '2']

In [42]:
station_blend = pd.DataFrame((station_1.values + station_2.values)/2, columns= blended_cols)

In [43]:
extract_2 = weather[weather.Station == '2'].reset_index(drop = True)
extract_2.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal,DayLength_MPrec,DayLength_NearH,NightLength_MPrec,NightLength_NearH,Sunrise_hours,Sunset_hours
0,2,2007-05-01,84,52,68.0,14,0,3,1900-01-01 04:48:00,1900-01-01 18:49:00,0.0,14.016667,14.0,9.983333,10.0,4.8,18.816667
1,2,2007-05-02,60,43,52.0,-3,13,0,1900-01-01 04:47:00,1900-01-01 18:50:00,0.0,14.05,14.0,9.95,10.0,4.783333,18.833333
2,2,2007-05-03,67,48,58.0,2,7,0,1900-01-01 04:46:00,1900-01-01 18:51:00,0.0,14.083333,14.0,9.916667,10.0,4.766667,18.85
3,2,2007-05-04,78,51,,4,7,0,1900-01-01 04:44:00,1900-01-01 18:52:00,0.0,14.133333,14.0,9.866667,10.0,4.733333,18.866667
4,2,2007-05-05,66,54,60.0,5,5,0,1900-01-01 04:43:00,1900-01-01 18:53:00,0.005,14.166667,14.0,9.833333,10.0,4.716667,18.883333


In [44]:
extract_1 = weather[weather.Station == '1'].reset_index(drop = True)
extract_1.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,Heat,Cool,Sunrise,Sunset,PrecipTotal,DayLength_MPrec,DayLength_NearH,NightLength_MPrec,NightLength_NearH,Sunrise_hours,Sunset_hours
0,1,2007-05-01,83,50,67.0,14,0,2,1900-01-01 04:48:00,1900-01-01 18:49:00,0.0,14.016667,14.0,9.983333,10.0,4.8,18.816667
1,1,2007-05-02,59,42,51.0,-3,14,0,1900-01-01 04:47:00,1900-01-01 18:50:00,0.0,14.05,14.0,9.95,10.0,4.783333,18.833333
2,1,2007-05-03,66,46,56.0,2,9,0,1900-01-01 04:46:00,1900-01-01 18:51:00,0.0,14.083333,14.0,9.916667,10.0,4.766667,18.85
3,1,2007-05-04,66,49,58.0,4,7,0,1900-01-01 04:44:00,1900-01-01 18:52:00,0.005,14.133333,14.0,9.866667,10.0,4.733333,18.866667
4,1,2007-05-05,66,53,60.0,5,5,0,1900-01-01 04:43:00,1900-01-01 18:53:00,0.005,14.166667,14.0,9.833333,10.0,4.716667,18.883333


In [45]:
joined_1 = extract_1.join(station_blend)
joined_2 = extract_2.join(station_blend)

In [46]:
weather_blend = pd.concat([joined_1, joined_2])

In [47]:
weather_blend.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2944 entries, 0 to 1471
Data columns (total 23 columns):
Station                2944 non-null object
Date                   2944 non-null datetime64[ns]
Tmax                   2944 non-null int64
Tmin                   2944 non-null int64
Tavg                   2933 non-null float64
Depart                 2944 non-null int64
Heat                   2944 non-null int64
Cool                   2944 non-null int64
Sunrise                2944 non-null datetime64[ns]
Sunset                 2944 non-null datetime64[ns]
PrecipTotal            2944 non-null float64
DayLength_MPrec        2944 non-null float64
DayLength_NearH        2944 non-null float64
NightLength_MPrec      2944 non-null float64
NightLength_NearH      2944 non-null float64
Sunrise_hours          2944 non-null float64
Sunset_hours           2944 non-null float64
blended_Tmax           2944 non-null float64
blended_Tmin           2944 non-null float64
blended_Depart         2944 

### Create Month and Day columns

In [48]:
month_func = lambda x: x.month
day_func= lambda x: x.day
day_of_year_func = lambda x: x.dayofyear
week_of_year_func = lambda x: x.week

# train
train['month'] = train.Date.apply(month_func)
train['day'] = train.Date.apply(day_func)
train['day_of_year'] = train.Date.apply(day_of_year_func)
train['week'] = train.Date.apply(week_of_year_func)

# test
test['month'] = test.Date.apply(month_func)
test['day'] = test.Date.apply(day_func)
test['day_of_year'] = test.Date.apply(day_of_year_func)
test['week'] = test.Date.apply(week_of_year_func)

In [49]:
train.describe()

Unnamed: 0,Latitude,Longitude,WnvPresent,month,day,day_of_year,week
count,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0
mean,41.841139,-87.699908,0.052446,7.692557,15.341424,217.941938,31.687131
std,0.112742,0.096514,0.222936,1.067675,8.901205,31.735797,4.526874
min,41.644612,-87.930995,0.0,5.0,1.0,148.0,22.0
25%,41.732984,-87.76007,0.0,7.0,7.0,196.0,28.0
50%,41.846283,-87.694991,0.0,8.0,15.0,217.0,31.0
75%,41.95469,-87.627796,0.0,8.0,24.0,241.0,35.0
max,42.01743,-87.531635,1.0,10.0,31.0,282.0,41.0


In [50]:
test.describe()

Unnamed: 0,Latitude,Longitude,month,day,day_of_year,week
count,116293.0,116293.0,116293.0,116293.0,116293.0,116293.0
mean,41.849389,-87.693658,7.620244,16.187681,217.057304,31.417833
std,0.106593,0.080699,1.065842,8.800007,32.536523,4.660725
min,41.644612,-87.930995,6.0,1.0,153.0,22.0
25%,41.753411,-87.750938,7.0,10.0,193.0,28.0
50%,41.862292,-87.694991,8.0,17.0,217.0,31.0
75%,41.951866,-87.64886,8.0,24.0,244.0,35.0
max,42.01743,-87.531635,10.0,31.0,275.0,40.0


In [51]:
# remove sunrise and sunset since we have extracted critical information into other fields
sunrise= weather_blend.pop('Sunrise')
sunset= weather_blend.pop('Sunset')

### Merge Data

In [52]:
train = train.merge(weather_blend, on='Date')
test = test.merge(weather_blend, on='Date')

### Inspect DFs

In [53]:
weather_blend.ix[:,:12].describe()

Unnamed: 0,Tmax,Tmin,Tavg,Depart,Heat,Cool,PrecipTotal,DayLength_MPrec,DayLength_NearH,NightLength_MPrec
count,2944.0,2944.0,2933.0,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0
mean,76.166101,57.810462,67.240027,1.954484,3.417799,5.641304,0.13178,13.669384,13.646739,10.330616
std,11.46197,10.381939,10.545442,6.839947,5.955153,6.104849,0.393031,1.489822,1.485758,1.489822
min,41.0,29.0,36.0,-17.0,0.0,0.0,0.0,10.4,10.0,8.766667
25%,69.0,50.0,60.0,-3.0,0.0,0.0,0.0,12.5125,12.75,9.045833
50%,78.0,59.0,69.0,2.0,0.0,4.0,0.0,14.233333,14.0,9.766667
75%,85.0,66.0,75.0,7.0,5.0,10.0,0.06,14.954167,15.0,11.4875
max,104.0,83.0,94.0,23.0,29.0,29.0,6.86,15.233333,15.0,13.6


In [54]:
weather_blend.ix[:,12:].describe()

Unnamed: 0,NightLength_NearH,Sunrise_hours,Sunset_hours,blended_Tmax,blended_Tmin,blended_Depart,blended_Heat,blended_Cool,blended_PrecipTotal
count,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0
mean,10.353261,4.992663,18.662047,76.166101,57.810462,1.954484,3.417799,5.641304,0.13178
std,1.485758,0.651771,0.843787,11.422764,10.26837,6.839947,5.932802,6.061981,0.35829
min,9.0,4.266667,16.783333,41.5,29.0,-17.0,0.0,0.0,0.0
25%,9.0,4.4125,18.033333,68.5,50.5,-3.0,0.0,0.0,0.0
50%,10.0,4.775,18.983333,78.5,59.5,2.0,0.0,4.0,0.0025
75%,11.25,5.520833,19.370833,84.5,65.5,7.0,4.5,10.5,0.08
max,14.0,6.383333,19.516667,103.5,82.5,23.0,29.0,28.5,4.855


In [55]:
train.describe()

Unnamed: 0,Latitude,Longitude,WnvPresent,month,day,day_of_year,week,Tmax,Tmin,Tavg,...,NightLength_MPrec,NightLength_NearH,Sunrise_hours,Sunset_hours,blended_Tmax,blended_Tmin,blended_Depart,blended_Heat,blended_Cool,blended_PrecipTotal
count,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,...,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0,21012.0
mean,41.841139,-87.699908,0.052446,7.692557,15.341424,217.941938,31.687131,81.917095,63.647202,73.022463,...,9.993359,10.031696,4.890621,18.897262,81.917095,63.647202,2.740244,0.886208,8.908671,0.146082
std,0.112739,0.096512,0.22293,1.067649,8.900993,31.735042,4.526766,8.290251,7.593949,7.501455,...,1.048358,1.047465,0.463813,0.589987,8.272521,7.486933,6.486313,2.697339,5.712594,0.323131
min,41.644612,-87.930995,0.0,5.0,1.0,148.0,22.0,57.0,41.0,50.0,...,8.766667,9.0,4.266667,17.333333,57.5,42.5,-12.0,0.0,0.0,0.0
25%,41.732984,-87.76007,0.0,7.0,7.0,196.0,28.0,78.0,59.0,70.0,...,9.05,9.0,4.483333,18.533333,78.0,59.5,-2.0,0.0,5.0,0.0
50%,41.846283,-87.694991,0.0,8.0,15.0,217.0,31.0,83.0,65.0,75.0,...,9.7,10.0,4.816667,19.116667,83.0,65.5,4.0,0.0,9.5,0.0
75%,41.95469,-87.627796,0.0,8.0,24.0,241.0,35.0,88.0,69.0,79.0,...,10.7,11.0,5.233333,19.4,88.0,69.5,8.0,0.0,13.5,0.13
max,42.01743,-87.531635,1.0,10.0,31.0,282.0,41.0,97.0,79.0,87.0,...,12.616667,13.0,5.95,19.516667,96.5,77.5,18.0,14.0,20.5,2.42


### Handle Weather Stations 1

In [56]:
# columns to write
cols_to_write = [col for col in train.columns if col != 'Date'] # exclude 'Date'

In [57]:
# split the data into two dataframes by station

train_station_1= train[train.Station == '1']
train_station_2= train[train.Station == '2']

In [58]:
# export to JSON for external use
train_station_1.to_json('train_station_1.json')
train_station_2.to_json('train_station_2.json')
train.to_json('train.json')

# epxort to csv for external use
train_station_1.to_csv('train_station_1.csv')
train_station_2.to_csv('train_station_2.csv')
train.to_csv('train.csv')


# Prepare Data Set for Model Building

In [59]:
# set up a merge for stations 1 and 2
# keep unique cols from station 2
keep_cols = ['Date', u'Tmax', u'Tmin', u'Tavg',u'PrecipTotal']
train_station_2 = train_station_2[keep_cols]

# rename cols with prefix
prefix_s2 = 'stat_2_'
rename_cols_s2 = [prefix_s2 + col for col in train_station_2.columns]
train_station_2.columns = rename_cols_s2

In [60]:
# drop cols from station 1 that won't be used in model
drop_cols = ['Latitude', 'Longitude', 'Heat', 'Cool', 'Depart', 'NightLength_MPrec', 'NightLength_NearH',
            'blended_Depart', 'blended_Heat', 'blended_Cool']

for col in drop_cols:
    train_station_1.pop(col)
   

In [61]:
# raname uniqe station 1 columns
prefix_s1 = 'stat_1_'
rename_cols_s1 = [prefix_s1 + col for col in keep_cols]
cols_to_rename= [col for col in train_station_1.columns if col in keep_cols]

# setup name mapping
s1_name_map = dict(zip(cols_to_rename, rename_cols_s1))

train_station_1 = train_station_1.rename(columns= s1_name_map)

In [62]:
# concat (outer join)
train_station_1 =  train_station_1.reset_index(drop= True)
train_station_2 = train_station_2.reset_index(drop = True)
train_merge = pd.concat([train_station_1, train_station_2], axis= 1)

### Create Dummmies from Categorical Variables

In [66]:
train_merge.columns

Index([        u'stat_1_Date',             u'Species',               u'Block',
                    u'Street',                u'Trap',               u'month',
                       u'day',         u'day_of_year',                u'week',
                   u'Station',         u'stat_1_Tmax',         u'stat_1_Tmin',
               u'stat_1_Tavg',  u'stat_1_PrecipTotal',     u'DayLength_MPrec',
           u'DayLength_NearH',       u'Sunrise_hours',        u'Sunset_hours',
              u'blended_Tmax',        u'blended_Tmin', u'blended_PrecipTotal',
               u'stat_2_Date',         u'stat_2_Tmax',         u'stat_2_Tmin',
               u'stat_2_Tavg',  u'stat_2_PrecipTotal'],
      dtype='object')

In [64]:
# get label
labels = train_merge.pop('WnvPresent')

In [68]:
# remove dates
train_merge = train_merge.drop(['stat_1_Date'], axis = 1)
train_merge = train_merge.drop(['stat_2_Date'], axis = 1)

In [69]:
# Create dummies from the categorical species, block, and streetname
train_merge = pd.get_dummies(train_merge, columns= ['Species'])

train_merge = pd.get_dummies(train_merge, columns= ['Block'])

train_merge = pd.get_dummies(train_merge, columns= ['Street'])

In [71]:
len(train_merge.columns)

220

In [70]:
[col for col in train_merge.columns]

['Trap',
 'month',
 'day',
 'day_of_year',
 'week',
 'Station',
 u'stat_1_Tmax',
 u'stat_1_Tmin',
 u'stat_1_Tavg',
 u'stat_1_PrecipTotal',
 'DayLength_MPrec',
 'DayLength_NearH',
 'Sunrise_hours',
 'Sunset_hours',
 'blended_Tmax',
 'blended_Tmin',
 'blended_PrecipTotal',
 'stat_2_Tmax',
 'stat_2_Tmin',
 'stat_2_Tavg',
 'stat_2_PrecipTotal',
 'Species_CULEX ERRATICUS',
 'Species_CULEX PIPIENS',
 'Species_CULEX PIPIENS/RESTUANS',
 'Species_CULEX RESTUANS',
 'Species_CULEX SALINARIUS',
 'Species_CULEX TARSALIS',
 'Species_CULEX TERRITANS',
 'Block_10',
 'Block_11',
 'Block_12',
 'Block_13',
 'Block_14',
 'Block_15',
 'Block_17',
 'Block_18',
 'Block_20',
 'Block_21',
 'Block_22',
 'Block_24',
 'Block_25',
 'Block_27',
 'Block_28',
 'Block_29',
 'Block_30',
 'Block_33',
 'Block_34',
 'Block_35',
 'Block_36',
 'Block_37',
 'Block_38',
 'Block_39',
 'Block_40',
 'Block_41',
 'Block_42',
 'Block_43',
 'Block_45',
 'Block_46',
 'Block_47',
 'Block_48',
 'Block_49',
 'Block_50',
 'Block_51',
 '