In [1]:
import numpy as np
import pandas as pd
import regex as re

In [150]:
train_data   = pd.read_csv('./assets/train.csv')
test_data    = pd.read_csv('./assets/test.csv')
weather_data = pd.read_csv('./assets/weather.csv')
spray_data   = pd.read_csv('./assets/spray.csv')

In [151]:
train_data.shape, test_data.shape, weather_data.shape, spray_data.shape

((10506, 12), (116293, 11), (2944, 22), (14835, 4))

In [152]:
train_data.tail()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
10501,2013-09-26,"5100 West 72nd Street, Chicago, IL 60638, USA",CULEX PIPIENS/RESTUANS,51,W 72ND ST,T035,"5100 W 72ND ST, Chicago, IL",41.763733,-87.742302,8,6,1
10502,2013-09-26,"5800 North Ridge Avenue, Chicago, IL 60660, USA",CULEX PIPIENS/RESTUANS,58,N RIDGE AVE,T231,"5800 N RIDGE AVE, Chicago, IL",41.98728,-87.666066,8,5,0
10503,2013-09-26,"1700 North Ashland Avenue, Chicago, IL 60622, USA",CULEX PIPIENS/RESTUANS,17,N ASHLAND AVE,T232,"1700 N ASHLAND AVE, Chicago, IL",41.912563,-87.668055,9,1,0
10504,2013-09-26,"7100 North Harlem Avenue, Chicago, IL 60631, USA",CULEX PIPIENS/RESTUANS,71,N HARLEM AVE,T233,"7100 N HARLEM AVE, Chicago, IL",42.009876,-87.807277,9,5,0
10505,2013-09-26,"4200 West 65th Street, Chicago, IL 60621, USA",CULEX PIPIENS/RESTUANS,42,W 65TH ST,T235,"4200 W 65TH ST, Chicago, IL",41.776428,-87.627096,8,1,0


In [153]:
test_data.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [154]:
# seems to have two different stations.  What is the location of each station that they have different information
# what is the optimal weather for mosquitos for west Nile?

# In the northern United States, West Nile is spread to humans by a mosquito known as Culex pipiens. 
# Its population depends on the number of spring and summer days above 81 degrees Fahrenheit. 
weather_data.tail(10)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
2934,1,2014-10-27,77,51,64,16,51,58,1,0,...,,0,M,0.0,0.00,28.92,29.66,12.0,19,12.9
2935,2,2014-10-27,79,54,67,M,52,59,0,2,...,RA,M,M,M,0.02,29.0,29.67,12.7,19,13.6
2936,1,2014-10-28,68,45,57,10,38,47,8,0,...,,0,M,0.0,T,29.15,29.85,14.8,26,15.6
2937,2,2014-10-28,66,48,57,M,40,48,8,0,...,RA,M,M,M,0.03,29.23,29.85,14.0,26,14.6
2938,1,2014-10-29,49,36,43,-4,32,40,22,0,...,,0,M,0.0,0.00,29.36,30.06,9.5,29,9.9
2939,2,2014-10-29,49,40,45,M,34,42,20,0,...,,M,M,M,0.00,29.42,30.07,8.5,29,9.0
2940,1,2014-10-30,51,32,42,-4,34,40,23,0,...,,0,M,0.0,0.00,29.34,30.09,5.1,24,5.5
2941,2,2014-10-30,53,37,45,M,35,42,20,0,...,RA,M,M,M,T,29.41,30.1,5.9,23,6.5
2942,1,2014-10-31,47,33,40,-6,25,33,25,0,...,RA SN,0,M,0.1,0.03,29.49,30.2,22.6,34,22.9
2943,2,2014-10-31,49,34,42,M,29,36,23,0,...,RA SN BR,M,M,M,0.04,29.54,30.2,21.7,34,22.6


In [155]:
# we only care about Shower(SH), drizzle(DZ), rain(RA)
# Thunderstorm(TS), Mist(BR) = Each of these will reduce the number of mosquitoes
patterns = ['SH', 'DZ', 'RA', 'TS', 'BR']

# iterate through all the CodeSum to find the columns with the patterns list
weather_data['CodeSum'] = weather_data['CodeSum'].apply(lambda x: [t for t in x.split('0') 
                                                                   if t in patterns])
# converts everything from the lists that were created in the previous code into strings
weather_data['CodeSum'] = weather_data['CodeSum'].apply(lambda x: 
                                                x if not isinstance(x, list) else x[0] if len(x) else '')

weather_data.CodeSum.value_counts()

      2520
RA     296
BR     110
TS      10
DZ       8
Name: CodeSum, dtype: int64

In [134]:
weather_data.replace('M', 0, inplace=True)
weather_data.replace('T', 0, inplace=True)
weather_data.replace(' T', 0, inplace=True)
weather_data.replace('  T', 0, inplace=True)

In [24]:
# no information in weather_data.SnowFall, dropping
# no information in weather_data.Depth, dropping
# no information in weather_data.Water1, dropping
weather_data.drop(columns=['Water1', 'SnowFall', 'Depth'], axis=1, inplace=True)

In [None]:
train_data.describe()

In [None]:
train_data.info()

In [None]:
train_data['WnvPresent'].value_counts(normalize=True)

In [135]:
weather_data.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [None]:
train_data.nunique()

In [None]:
weather_data.nunique()

In [None]:
train_data = date_separate(train_data)

In [19]:
# Split up the weather stations.
weather_data = pd.get_dummies(weather_data, columns=['Station'])

In [25]:
train_data = train_data.merge(weather_data, on="Date")

In [27]:
def date_separate(df):
    df = df.copy()
    df['Year'] = pd.DatetimeIndex(df['Date']).year
    df['Month'] = pd.DatetimeIndex(df['Date']).month
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

In [31]:
train_data = date_separate(train_data)

In [32]:
train_data.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Station_1,Station_2,Year,Month,Day
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,29.39,30.11,5.8,18,6.5,1,0,2007,5,29
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,29.44,30.09,5.8,16,7.4,0,1,2007,5,29
2,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,29.39,30.11,5.8,18,6.5,1,0,2007,5,29
3,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,29.44,30.09,5.8,16,7.4,0,1,2007,5,29
4,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,...,29.39,30.11,5.8,18,6.5,1,0,2007,5,29


In [34]:
# carriers of the virus are Pipiens and Restuans
train_data.Species.value_counts()

CULEX PIPIENS/RESTUANS    9504
CULEX RESTUANS            5480
CULEX PIPIENS             5398
CULEX TERRITANS            444
CULEX SALINARIUS           172
CULEX TARSALIS              12
CULEX ERRATICUS              2
Name: Species, dtype: int64

In [35]:
weather_data.columns

Index(['Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Station_1',
       'Station_2'],
      dtype='object')