In [42]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

In [2]:
train_data   = pd.read_csv('./assets/train.csv')
test_data    = pd.read_csv('./assets/test.csv')
weather_data = pd.read_csv('./assets/weather.csv')
spray_data   = pd.read_csv('./assets/spray.csv')

In [3]:
train_data.shape, test_data.shape, weather_data.shape, spray_data.shape

((10506, 12), (116293, 11), (2944, 22), (14835, 4))

In [4]:
train_data.tail()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
10501,2013-09-26,"5100 West 72nd Street, Chicago, IL 60638, USA",CULEX PIPIENS/RESTUANS,51,W 72ND ST,T035,"5100 W 72ND ST, Chicago, IL",41.763733,-87.742302,8,6,1
10502,2013-09-26,"5800 North Ridge Avenue, Chicago, IL 60660, USA",CULEX PIPIENS/RESTUANS,58,N RIDGE AVE,T231,"5800 N RIDGE AVE, Chicago, IL",41.98728,-87.666066,8,5,0
10503,2013-09-26,"1700 North Ashland Avenue, Chicago, IL 60622, USA",CULEX PIPIENS/RESTUANS,17,N ASHLAND AVE,T232,"1700 N ASHLAND AVE, Chicago, IL",41.912563,-87.668055,9,1,0
10504,2013-09-26,"7100 North Harlem Avenue, Chicago, IL 60631, USA",CULEX PIPIENS/RESTUANS,71,N HARLEM AVE,T233,"7100 N HARLEM AVE, Chicago, IL",42.009876,-87.807277,9,5,0
10505,2013-09-26,"4200 West 65th Street, Chicago, IL 60621, USA",CULEX PIPIENS/RESTUANS,42,W 65TH ST,T235,"4200 W 65TH ST, Chicago, IL",41.776428,-87.627096,8,1,0


In [5]:
test_data.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [6]:
# seems to have two different stations.  What is the location of each station that they have different information
# what is the optimal weather for mosquitos for west Nile?

# In the northern United States, West Nile is spread to humans by a mosquito known as Culex pipiens. 
# Its population depends on the number of spring and summer days above 81 degrees Fahrenheit. 
weather_data.head(10)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.00,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.00,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.00,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.00,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.00,29.39,30.12,11.7,7,11.9
5,2,2007-05-03,67,48,58,M,40,50,7,0,...,HZ,M,M,M,0.00,29.46,30.12,12.9,6,13.2
6,1,2007-05-04,66,49,58,4,41,50,7,0,...,RA,0,M,0.0,T,29.31,30.05,10.4,8,10.8
7,2,2007-05-04,78,51,M,M,42,50,M,M,...,,M,M,M,0.00,29.36,30.04,10.1,7,10.4
8,1,2007-05-05,66,53,60,5,38,49,5,0,...,,0,M,0.0,T,29.4,30.1,11.7,7,12.0
9,2,2007-05-05,66,54,60,M,39,50,5,0,...,,M,M,M,T,29.46,30.09,11.2,7,11.5


In [7]:
weather_data.replace('M', 0, inplace=True)
weather_data.replace('T', 0, inplace=True)
weather_data.replace(' T', 0, inplace=True)
weather_data.replace('  T', 0, inplace=True)

# no information in weather_data.SnowFall, dropping
# no information in weather_data.Depth, dropping
# no information in weather_data.Water1, dropping
weather_data.drop(columns=['Water1', 'SnowFall', 'Depth'], axis=1, inplace=True)

In [8]:
# we only care about Shower(SH), drizzle(DZ), rain(RA)
# Thunderstorm(TS), Mist(BR) = Each of these will reduce the number of mosquitoes
patterns = ['SH', 'DZ', 'RA', 'TS', 'BR']

# iterate through all the CodeSum to find the columns with the patterns list
weather_data['CodeSum'] = weather_data['CodeSum'].apply(lambda x: [t for t in x.split('0') 
                                                                   if t in patterns])
# converts everything from the lists that were created in the previous code into strings
weather_data['CodeSum'] = weather_data['CodeSum'].apply(lambda x: 
                                                x if not isinstance(x, list) else x[0] if len(x) else '')

weather_data.CodeSum.value_counts()

      2520
RA     296
BR     110
TS      10
DZ       8
Name: CodeSum, dtype: int64

In [9]:
weather_data = weather_data.apply(pd.to_numeric, errors='ignore')

In [10]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 19 columns):
Station        2944 non-null int64
Date           2944 non-null object
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null int64
Depart         2944 non-null int64
DewPoint       2944 non-null int64
WetBulb        2944 non-null int64
Heat           2944 non-null int64
Cool           2944 non-null int64
Sunrise        2944 non-null object
Sunset         2944 non-null object
CodeSum        2944 non-null object
PrecipTotal    2944 non-null float64
StnPressure    2944 non-null float64
SeaLevel       2944 non-null float64
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null float64
dtypes: float64(5), int64(10), object(4)
memory usage: 437.1+ KB


In [11]:
weather_date = weather_data[['Date', 'CodeSum']]

In [12]:
weather_data = weather_data.groupby(np.arange(len(weather_data))//2).mean()

In [13]:
weather_data = weather_data.merge(weather_date, right_index=True, left_index=True)

In [14]:
weather_data.drop('Station', axis=1, inplace=True)

In [15]:
weather_data.head()

Unnamed: 0,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Date,CodeSum
0,83.5,51.0,67.5,7.0,51.0,56.5,0.0,2.5,0.0,29.14,29.82,2.2,26.0,9.4,2007-05-01,
1,59.5,42.5,51.5,-1.5,42.0,47.0,13.5,0.0,0.0,29.41,30.085,13.15,3.0,13.4,2007-05-01,
2,66.5,47.0,57.0,1.0,40.0,49.0,8.0,0.0,0.0,29.425,30.12,12.3,6.5,12.55,2007-05-02,BR
3,72.0,50.0,29.0,2.0,41.5,50.0,3.5,0.0,0.0,29.335,30.045,10.25,7.5,10.6,2007-05-02,
4,66.0,53.5,60.0,2.5,38.5,49.5,5.0,0.0,0.0,29.43,30.095,11.45,7.0,11.75,2007-05-03,


In [16]:
train_data.describe()

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
count,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0
mean,35.687797,41.841139,-87.699908,7.819532,12.853512,0.052446
std,24.339468,0.112742,0.096514,1.452921,16.133816,0.222936
min,10.0,41.644612,-87.930995,3.0,1.0,0.0
25%,12.0,41.732984,-87.76007,8.0,2.0,0.0
50%,33.0,41.846283,-87.694991,8.0,5.0,0.0
75%,52.0,41.95469,-87.627796,9.0,17.0,0.0
max,98.0,42.01743,-87.531635,9.0,50.0,1.0


In [17]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
Date                      10506 non-null object
Address                   10506 non-null object
Species                   10506 non-null object
Block                     10506 non-null int64
Street                    10506 non-null object
Trap                      10506 non-null object
AddressNumberAndStreet    10506 non-null object
Latitude                  10506 non-null float64
Longitude                 10506 non-null float64
AddressAccuracy           10506 non-null int64
NumMosquitos              10506 non-null int64
WnvPresent                10506 non-null int64
dtypes: float64(2), int64(4), object(6)
memory usage: 985.0+ KB


In [18]:
train_data['WnvPresent'].value_counts(normalize=True)

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64

In [19]:
weather_data.columns

Index(['Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool',
       'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Date', 'CodeSum'],
      dtype='object')

In [20]:
train_data.nunique()

Date                       95
Address                   138
Species                     7
Block                      64
Street                    128
Trap                      136
AddressNumberAndStreet    138
Latitude                  138
Longitude                 138
AddressAccuracy             4
NumMosquitos               50
WnvPresent                  2
dtype: int64

In [21]:
weather_data.nunique()

Tmax           117
Tmin           102
Tavg           112
Depart          41
DewPoint       101
WetBulb         93
Heat            55
Cool            50
PrecipTotal    213
StnPressure    199
SeaLevel       204
ResultSpeed    391
ResultDir       71
AvgSpeed       358
Date           736
CodeSum          5
dtype: int64

In [23]:
# Split up the weather stations.
# weather_data = pd.get_dummies(weather_data, columns=['Station'])

In [24]:
train_data = train_data.merge(weather_data, on="Date")

In [25]:
def date_separate(df):
    df = df.copy()
    df['Year'] = pd.DatetimeIndex(df['Date']).year
    df['Month'] = pd.DatetimeIndex(df['Date']).month
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

In [26]:
train_data = date_separate(train_data)

In [27]:
train_data.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,CodeSum,Year,Month,Day
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0.08,29.42,30.1,6.7,21.5,7.6,,2007,5,29
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0.25,29.325,30.01,7.7,24.0,8.65,,2007,5,29
2,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0.08,29.42,30.1,6.7,21.5,7.6,,2007,5,29
3,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,0.25,29.325,30.01,7.7,24.0,8.65,,2007,5,29
4,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,...,0.08,29.42,30.1,6.7,21.5,7.6,,2007,5,29


In [28]:
# carriers of the virus are Pipiens and Restuans
train_data.Species.value_counts()

CULEX PIPIENS/RESTUANS    5352
CULEX PIPIENS             3948
CULEX RESTUANS            2442
CULEX TERRITANS            256
CULEX SALINARIUS           118
CULEX TARSALIS               4
Name: Species, dtype: int64

In [36]:
train_data.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Tmax', 'Tmin', 'Tavg', 'Depart',
       'DewPoint', 'WetBulb', 'Heat', 'Cool', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'CodeSum', 'Year',
       'Month', 'Day'],
      dtype='object')

In [29]:
# # You can get by without probability estimates, but if you need them, use calibration 
# sklearn.calibration.CalibratedClassifierCV

# # No matter what you do for training, always test on the natural (stratified) distribution your classifier is going to operate upon
# sklearn.cross_validation.StratifiedKFold

NameError: name 'sklearn' is not defined

In [31]:
rfc = RandomForestClassifier(class_weight={1:9})

In [38]:
X = train_data.drop('WnvPresent', axis=1)
y = train_data['WnvPresent']

In [47]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9090 entries, 6068 to 8078
Data columns (total 29 columns):
Date                      9090 non-null object
Address                   9090 non-null object
Species                   9090 non-null object
Block                     9090 non-null int64
Street                    9090 non-null object
Trap                      9090 non-null object
AddressNumberAndStreet    9090 non-null object
Latitude                  9090 non-null float64
Longitude                 9090 non-null float64
AddressAccuracy           9090 non-null int64
NumMosquitos              9090 non-null int64
Tmax                      9090 non-null float64
Tmin                      9090 non-null float64
Tavg                      9090 non-null float64
Depart                    9090 non-null float64
DewPoint                  9090 non-null float64
WetBulb                   9090 non-null float64
Heat                      9090 non-null float64
Cool                      9090 non-nul

In [46]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((9090, 29), (9090,), (3030, 29), (3030,))

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [44]:
rfc.fit(X_train, y_train)

ValueError: could not convert string to float: 