In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
# Load the raw weather observations from the project's assets folder.
weather_data = pd.read_csv('./assets/weather.csv')

In [8]:
# Size of the raw table as (rows, columns).
weather_data.shape

(2944, 22)

In [9]:
# The data seems to come from two different weather stations (see the Station column).
# Open questions:
#   - Where is each station located, and why do their readings differ?
#   - What weather is optimal for the mosquitoes that carry West Nile virus?
#
# Background: in the northern United States, West Nile is spread to humans by a
# mosquito known as Culex pipiens. Its population depends on the number of spring
# and summer days above 81 degrees Fahrenheit.
weather_data.head(5)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [10]:
# The raw file uses sentinel strings in otherwise numeric columns: 'M'
# (presumably "missing") and several spacing variants of 'T' (presumably
# "trace") -- TODO confirm against the data dictionary. Every exact match is
# replaced with the number 0.
# NOTE(review): mapping 'M' to 0 makes a missing reading indistinguishable from
# a genuine zero (e.g. a missing pressure becomes 0) -- confirm this is intended.
for sentinel in ('M', 'T', ' T', '  T'):
    weather_data = weather_data.replace(sentinel, 0)

# Water1, SnowFall and Depth carry no information, so they are removed.
uninformative_columns = ['Water1', 'SnowFall', 'Depth']
weather_data = weather_data.drop(columns=uninformative_columns)

In [11]:
# Only these weather codes are kept: shower (SH), drizzle (DZ), rain (RA),
# thunderstorm (TS) and mist (BR). Each of these is expected to reduce the
# number of mosquitoes.
# NOTE(review): only whole space-delimited tokens are matched, so a token that
# glues codes together (e.g. 'TSRA') or carries a modifier would be discarded.
# The original author flagged this cell as imperfect -- check the raw CodeSum
# values before relying on it.
patterns = ['SH', 'DZ', 'RA', 'TS', 'BR']


def _keep_known_codes(code_sum):
    """Return the tokens of `code_sum` that appear in `patterns`, space-joined.

    Token order and duplicates are preserved; a value with no matching token
    yields the empty string.
    """
    kept = []
    for code in code_sum.split(' '):
        if code in patterns:
            kept.append(code)
    return ' '.join(kept)


# Rewrite every CodeSum entry so that it contains only the codes of interest.
weather_data['CodeSum'] = weather_data['CodeSum'].apply(_keep_known_codes)

weather_data.CodeSum.value_counts()

            1692
RA BR        406
RA           366
BR           245
RA DZ BR      81
TS RA BR      37
RA DZ         23
TS BR         23
TS RA         22
DZ BR         21
TS            20
DZ             8
Name: CodeSum, dtype: int64

In [15]:
# Exclude the rows recorded by Station 2, then drop the Station column.
# The filter and the drop are chained so that a new frame is produced; the
# previous form ran an in-place drop on a boolean-filtered slice, which makes
# pandas emit SettingWithCopyWarning.
weather_data = (
    weather_data
    .loc[weather_data['Station'] != 2]
    .drop(columns='Station')
)

weather_data.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-01,83,50,67,14,51,56,0,2,448,1849,,0.0,29.1,29.82,1.7,27,9.2
2,2007-05-02,59,42,51,-3,42,47,14,0,447,1850,BR,0.0,29.38,30.09,13.0,4,13.4
4,2007-05-03,66,46,56,2,40,48,9,0,446,1851,,0.0,29.39,30.12,11.7,7,11.9
6,2007-05-04,66,49,58,4,41,50,7,0,444,1852,RA,0.0,29.31,30.05,10.4,8,10.8
8,2007-05-05,66,53,60,5,38,49,5,0,443,1853,,0.0,29.4,30.1,11.7,7,12.0


In [17]:
def _to_numeric_or_keep(column):
    """Convert `column` to a numeric dtype, or return it unchanged if that fails.

    This reproduces the behaviour of ``pd.to_numeric(column, errors='ignore')``
    without using the ``errors='ignore'`` option, which recent pandas releases
    deprecate in favour of catching the exception explicitly.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Not parseable as numbers (e.g. dates or weather codes): keep as-is.
        return column


# Attempt the conversion column by column; non-numeric columns stay untouched.
weather_data = weather_data.apply(_to_numeric_or_keep)

In [13]:
# Column dtypes and non-null counts.
# NOTE(review): the saved output under this cell reports 2944 rows and still
# lists the Station column, so it was captured before the Station filter was
# executed (execution count 13, shown after counts 15 and 17). Re-run the
# notebook top to bottom to refresh it.
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 19 columns):
Station        2944 non-null int64
Date           2944 non-null object
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null int64
Depart         2944 non-null int64
DewPoint       2944 non-null int64
WetBulb        2944 non-null int64
Heat           2944 non-null int64
Cool           2944 non-null int64
Sunrise        2944 non-null object
Sunset         2944 non-null object
CodeSum        2944 non-null object
PrecipTotal    2944 non-null float64
StnPressure    2944 non-null float64
SeaLevel       2944 non-null float64
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null float64
dtypes: float64(5), int64(10), object(4)
memory usage: 437.1+ KB


In [28]:
def _hhmm_to_minutes(clock):
    """Convert HHMM clock readings (e.g. 1849 for 18:49) to minutes after midnight.

    `clock` is a Series of integers, or of strings that parse as integers.
    """
    clock = pd.to_numeric(clock)
    return (clock // 100) * 60 + clock % 100


# Sunrise/Sunset values such as 448 and 1849 are read as HHMM clock times
# (04:48, 18:49). Subtracting the raw integers does not give a duration: it is
# off whenever the sunset minute is smaller than the sunrise minute (e.g.
# 1900 - 455 = 1445, although the real gap is 14 h 05 min). Both columns are
# therefore converted to minutes first, so Daylight is the length of the day
# in minutes (841 for 04:48 -> 18:49).
weather_data['Daylight'] = (
    _hhmm_to_minutes(weather_data['Sunset'])
    - _hhmm_to_minutes(weather_data['Sunrise'])
)

# The raw clock columns are no longer needed once Daylight is derived.
weather_data = weather_data.drop(columns=['Sunset', 'Sunrise'])

In [29]:
# Size after filtering and feature derivation; the saved output shows half of
# the raw row count (1472 of 2944).
weather_data.shape

(1472, 17)

In [30]:
# One-hot encode CodeSum. Each distinct code *combination* (e.g. 'RA DZ BR')
# becomes its own CodeSum_* indicator column, not each individual code.
# drop_first=True omits the indicator for the first category.
weather_data = pd.get_dummies(weather_data, columns=['CodeSum'], drop_first=True)

In [31]:
# Preview the frame with the CodeSum_* indicator columns in place.
weather_data.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,PrecipTotal,...,CodeSum_DZ,CodeSum_DZ BR,CodeSum_RA,CodeSum_RA BR,CodeSum_RA DZ,CodeSum_RA DZ BR,CodeSum_TS,CodeSum_TS BR,CodeSum_TS RA,CodeSum_TS RA BR
0,2007-05-01,83,50,67,14,51,56,0,2,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2007-05-02,59,42,51,-3,42,47,14,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2007-05-03,66,46,56,2,40,48,9,0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,2007-05-04,66,49,58,4,41,50,7,0,0.0,...,0,0,1,0,0,0,0,0,0,0
8,2007-05-05,66,53,60,5,38,49,5,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Persist the processed table to the current working directory.
# NOTE(review): to_csv writes the row index by default, so the file gains a
# leading unnamed column holding the original, non-contiguous row labels.
# NOTE(review): despite "avg" in the file name, no averaging across stations is
# visible in this notebook -- the rows come from filtering out Station 2.
weather_data.to_csv('weather_station_avg.csv')

In [33]:
# List the column names of the exported frame, including the CodeSum_* indicators.
weather_data.columns

Index(['Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed', 'Daylight', 'CodeSum_BR', 'CodeSum_DZ',
       'CodeSum_DZ BR', 'CodeSum_RA', 'CodeSum_RA BR', 'CodeSum_RA DZ',
       'CodeSum_RA DZ BR', 'CodeSum_TS', 'CodeSum_TS BR', 'CodeSum_TS RA',
       'CodeSum_TS RA BR'],
      dtype='object')