## Exploratory Data Analysis ##
Sensitivity (recall) seems the more appropriate metric to us, because false positives would not be especially detrimental to the city; however, we use accuracy as our metric, following the Kaggle competition.

In [1]:
import pandas as pd
import seaborn as sns
pd.core.common.is_list_like = pd.api.types.is_list_like
import pandas_datareader.data as web
from datetime import datetime
import matplotlib.pyplot as plt

In [13]:
# Load the four raw CSV files; all of them are read from ../data.
spray = pd.read_csv('../data/spray.csv')
weather = pd.read_csv('../data/weather.csv')
train = pd.read_csv('../data/train.csv')
# Path normalised from '..//data/test.csv' (stray double slash) to match the others.
test = pd.read_csv('../data/test.csv')

In [14]:
# Normalise train's column labels: lower-case them and swap spaces for underscores.
train.columns = train.columns.str.lower().str.replace(' ', '_', regex=False)

In [15]:
# Normalise test's column labels: lower-case them and swap spaces for underscores.
test.columns = test.columns.str.lower().str.replace(' ', '_', regex=False)

In [16]:
# Normalise spray's column labels: lower-case them and swap spaces for underscores.
spray.columns = spray.columns.str.lower().str.replace(' ', '_', regex=False)

In [17]:
# Normalise weather's column labels: lower-case them and swap spaces for underscores.
weather.columns = weather.columns.str.lower().str.replace(' ', '_', regex=False)

In [19]:
# Create the baseline submission file: the test ids with wnvpresent set to 0 on every row.
# test[['id']] (double brackets) keeps a one-column DataFrame. The previous
# test.loc[:, "id"] returned a Series, so baseline['wnvpresent'] = 0 appended a single
# element labelled 'wnvpresent' instead of adding a column, and the CSV had no
# wnvpresent column at all.
baseline = test[['id']].assign(wnvpresent=0)
# Written next to the input files; the directory case now matches the '../data' used for loading.
baseline.to_csv('../data/baselinesubmission.csv', index=False)

In [20]:
# Baseline on train: 1 minus the mean of wnvpresent.
# Assuming wnvpresent is a 0/1 flag (TODO confirm), this is the accuracy of always predicting 0.
1-train['wnvpresent'].mean()

0.9475537787930707

In [21]:
# Summarise spray: column dtypes and non-null counts (used to spot missing values).
spray.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14835 entries, 0 to 14834
Data columns (total 4 columns):
date         14835 non-null object
time         14251 non-null object
latitude     14835 non-null float64
longitude    14835 non-null float64
dtypes: float64(2), object(2)
memory usage: 463.7+ KB


In [None]:
# Show every spray row that has a missing value in at least one column.
spray.loc[spray.isna().any(axis='columns')]

In [None]:
# Drop the time column from spray because none of the other datasets contain it.

In [None]:
# Remove the time column from spray.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
spray = spray.drop(columns='time', errors='ignore')

In [None]:
# Preview the first rows of spray.
spray.head()

In [None]:
# Show spray's column labels.
spray.columns

In [None]:
# Summarise weather: column dtypes and non-null counts.
weather.info()

In [None]:
# Drop every row whose tavg is the missing-value marker 'M', because each of those rows also has additional missing values.

In [None]:
# Keep only the rows whose tavg is not the missing-value marker 'M'.
# .copy() makes the result an independent frame, so later column assignments on
# weather (e.g. converting tavg or date) do not raise SettingWithCopyWarning.
weather = weather[weather['tavg'] != 'M'].copy()

In [None]:
# Check the (rows, columns) size of weather.
weather.shape

In [None]:
# Convert the tavg column to integers.
weather = weather.astype({'tavg': int})

In [None]:
# Show the weather rows whose depart value is the marker 'M'.
weather.loc[weather['depart'].eq('M')]

In [None]:
# Relative frequency of each depart value (normalize=True gives proportions instead of counts).
weather['depart'].value_counts(normalize=True)

In [None]:
# Re-check weather's dtypes and non-null counts.
weather.info()

In [None]:
# Show every weather row that has a missing (NaN) value in at least one column.
weather.loc[weather.isna().any(axis='columns')]

In [None]:
# Summarise train: column dtypes and non-null counts.
train.info()

In [None]:
# Show every train row that has a missing (NaN) value in at least one column.
train.loc[train.isna().any(axis='columns')]

In [None]:
# Show train's column labels.
train.columns

In [None]:
#Converted dates to datetime.

In [22]:
# Parse the date column of every dataset into pandas datetimes.
for frame in (spray, weather, train, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Split weather into one frame per station and remove the station column from each.
# Chaining .drop() onto the filter returns a fresh frame, instead of dropping in place
# on a filtered slice of weather, which can trigger SettingWithCopyWarning.
weather1 = weather[weather['station'] == 1].drop(columns='station')
weather2 = weather[weather['station'] == 2].drop(columns='station')

In [None]:
# Join the two station frames on date; column names present in both frames
# receive pandas' default suffixes.
weather = pd.merge(weather1, weather2, on='date')

In [None]:
# Join the weather columns onto train by date and display the result.
# NOTE(review): the merged frame is not assigned to anything, so train itself is
# unchanged after this cell — confirm whether the join was meant to be kept.
train.merge(weather, on='date')

In [None]:
# Join the weather columns onto test by date and display the result.
# NOTE(review): the merged frame is not assigned to anything, so test itself is
# unchanged after this cell — confirm whether the join was meant to be kept.
test.merge(weather, on='date')

In [None]:
# Cast the species and trap columns to str in both train and test.
for frame in (train, test):
    for column in ('species', 'trap'):
        frame[column] = frame[column].astype(str)

In [None]:
# Inspect the trap column of test.
test.trap

In [None]:
# Preview the first rows of spray.
spray.head()

In [None]:
# Preview the first trap values in train.
train['trap'].head()

In [None]:
# Preview the first addressnumberandstreet values in train.
train['addressnumberandstreet'].head()

In [None]:
# Preview the first address values in train.
train['address'].head()

In [None]:
# Count how many train rows there are for each species value.
train['species'].value_counts()

In [None]:
# Count how many test rows there are for each species value (compare with train).
test['species'].value_counts()

In [None]:
# Preview the first values of the wnvpresent column in train.
train['wnvpresent'].head()

In [None]:
# Count how many train rows there are for each trap value.
train['trap'].value_counts()

In [None]:
# Count how many test rows there are for each trap value (compare with train).
test['trap'].value_counts()

In [None]:
### Idea: find major center points, such as the airport, and look at where WNV was present within a range of a few miles of them?

In [10]:
# Remove every "T" from the trap identifiers in test (e.g. leaving "002").
test['trap'] = test['trap'].str.replace("T", "", regex=False)

In [11]:
# Check the first trap values in test after the "T" characters were removed.
test.trap.head()


0    002
1    002
2    002
3    002
4    002
Name: trap, dtype: object

In [12]:
# NOTE(review): as recorded, this raised AttributeError because test has no 'year'
# column until the cell that builds it (further down) has been run.
test.year

AttributeError: 'DataFrame' object has no attribute 'year'

In [28]:
# Derive a year column holding the first four characters of each date's string form.
test['year'] = test['date'].astype(str).str[:4]

In [29]:
# Display the derived year column (string values, dtype object).
test.year



0         2008
1         2008
2         2008
3         2008
4         2008
5         2008
6         2008
7         2008
8         2008
9         2008
10        2008
11        2008
12        2008
13        2008
14        2008
15        2008
16        2008
17        2008
18        2008
19        2008
20        2008
21        2008
22        2008
23        2008
24        2008
25        2008
26        2008
27        2008
28        2008
29        2008
          ... 
116263    2014
116264    2014
116265    2014
116266    2014
116267    2014
116268    2014
116269    2014
116270    2014
116271    2014
116272    2014
116273    2014
116274    2014
116275    2014
116276    2014
116277    2014
116278    2014
116279    2014
116280    2014
116281    2014
116282    2014
116283    2014
116284    2014
116285    2014
116286    2014
116287    2014
116288    2014
116289    2014
116290    2014
116291    2014
116292    2014
Name: year, Length: 116293, dtype: object