# Project 4 - West Nile Virus Prediction

## Problem Statement
The intent of this project is to analyze weather data and GIS data and predicting the presence of the West Nile virus, for a given time, location, and species. 

##  General Approach
- [Data cleaning and imputation](#Data-Cleaning)
- [Data visualization](#Data-Visualization)
- [Feature selection](#Featuion-Selection)
- [Cross validation](#Cross-Validation)
- [Model fitting](#Model-Fitting)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from haversine import haversine

In [2]:
# Load dataset 
train = pd.read_csv('assets/train_cleaned.csv')
test = pd.read_csv('assets/test_cleaned.csv')
weather = pd.read_csv('assets/weather_cleaned.csv')
spray = pd.read_csv('assets/spray.csv')

### Data Cleaning and Imputation

#### Weather data

In [3]:
# verifying the dataframe
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Rain
0,1,2007-05-01,83,50,66.5,51,56,0.0,29.1,29.82,1.7,27,9.2,0
1,2,2007-05-01,84,52,68.0,51,57,0.0,29.18,29.82,2.7,25,9.6,0
2,1,2007-05-02,59,42,50.5,42,47,0.0,29.38,30.09,13.0,4,13.4,0
3,2,2007-05-02,60,43,51.5,42,47,0.0,29.44,30.08,13.3,2,13.4,0
4,1,2007-05-03,66,46,56.0,40,48,0.0,29.39,30.12,11.7,7,11.9,0


In [4]:
# checking the datatypes
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg           float64
DewPoint         int64
WetBulb          int64
PrecipTotal    float64
StnPressure    float64
SeaLevel       float64
ResultSpeed    float64
ResultDir        int64
AvgSpeed       float64
Rain             int64
dtype: object

In [5]:
# changing Date to DateTime 
weather['Date'] = pd.to_datetime(weather['Date'])

In [6]:
# verifying the change
weather.dtypes

Station                 int64
Date           datetime64[ns]
Tmax                    int64
Tmin                    int64
Tavg                  float64
DewPoint                int64
WetBulb                 int64
PrecipTotal           float64
StnPressure           float64
SeaLevel              float64
ResultSpeed           float64
ResultDir               int64
AvgSpeed              float64
Rain                    int64
dtype: object

#### Train data

In [7]:
train.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Month,Year,Weekday,Tot_Mos_Species
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",PIPIENS/RESTUANS,41,N OAK PARK AVE,2,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,5,2007,Tuesday,1
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",RESTUANS,41,N OAK PARK AVE,2,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,5,2007,Tuesday,1
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",RESTUANS,62,N MANDELL AVE,7,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,5,2007,Tuesday,1
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",PIPIENS/RESTUANS,79,W FOSTER AVE,15,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,5,2007,Tuesday,1
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",RESTUANS,79,W FOSTER AVE,15,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,5,2007,Tuesday,4


In [8]:
# function takes in coordinates from train data and output station number closest to it
def closest_point(point):
    station1, station2 = [41.995, -87.933], [41.786, -87.752]  # Fixed coordinates for two stations
    points = [station1, station2] 
    if cdist([point], points).argmin() == 0: return 1 # return index of closest point
    return 2

In [9]:
# Assign Station to train data based on station coordinates
train['Station'] = [closest_point(x) for x in train[['Latitude','Longitude']].values]
# test['Station'] = [closest_point(x) for x in test[['Latitude','Longitude']].values]

In [10]:
# Convert test and train data to datetime format
train['Date'] = pd.to_datetime(train['Date'])
# test['Date'] = pd.to_datetime(test['Date'])

### Data Visualization

### Feature Engineering

In [11]:
weather = weather.set_index('Date')

In [12]:
for c in ['PrecipTotal','Tavg','Tmin','Tmax','DewPoint']:
    for p in [14,28,60]:
        if c == 'PrecipTotal':
            weather[f'{c}_{str(p)}'] = weather[c].rolling(p, min_periods=1).sum()
        elif c == 'Tmin':
            weather[f'{c}_{str(p)}'] = weather[c].rolling(p, min_periods=1).min()
        elif c == 'Tmax':
            weather[f'{c}_{str(p)}'] = weather[c].rolling(p, min_periods=1).max()
        else:
            weather[f'{c}_{str(p)}'] = weather[c].rolling(p, min_periods=1).mean()

In [13]:
weather.head(20)

Unnamed: 0_level_0,Station,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,...,Tavg_60,Tmin_14,Tmin_28,Tmin_60,Tmax_14,Tmax_28,Tmax_60,DewPoint_14,DewPoint_28,DewPoint_60
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-01,1,83,50,66.5,51,56,0.0,29.1,29.82,1.7,...,66.5,50.0,50.0,50.0,83.0,83.0,83.0,51.0,51.0,51.0
2007-05-01,2,84,52,68.0,51,57,0.0,29.18,29.82,2.7,...,67.25,50.0,50.0,50.0,84.0,84.0,84.0,51.0,51.0,51.0
2007-05-02,1,59,42,50.5,42,47,0.0,29.38,30.09,13.0,...,61.666667,42.0,42.0,42.0,84.0,84.0,84.0,48.0,48.0,48.0
2007-05-02,2,60,43,51.5,42,47,0.0,29.44,30.08,13.3,...,59.125,42.0,42.0,42.0,84.0,84.0,84.0,46.5,46.5,46.5
2007-05-03,1,66,46,56.0,40,48,0.0,29.39,30.12,11.7,...,58.5,42.0,42.0,42.0,84.0,84.0,84.0,45.2,45.2,45.2
2007-05-03,2,67,48,57.5,40,50,0.0,29.46,30.12,12.9,...,58.333333,42.0,42.0,42.0,84.0,84.0,84.0,44.333333,44.333333,44.333333
2007-05-04,1,66,49,57.5,41,50,0.0,29.31,30.05,10.4,...,58.214286,42.0,42.0,42.0,84.0,84.0,84.0,43.857143,43.857143,43.857143
2007-05-04,2,78,51,64.5,42,50,0.0,29.36,30.04,10.1,...,59.0,42.0,42.0,42.0,84.0,84.0,84.0,43.625,43.625,43.625
2007-05-05,1,66,53,59.5,38,49,0.0,29.4,30.1,11.7,...,59.055556,42.0,42.0,42.0,84.0,84.0,84.0,43.0,43.0,43.0
2007-05-05,2,66,54,60.0,39,50,0.0,29.46,30.09,11.2,...,59.15,42.0,42.0,42.0,84.0,84.0,84.0,42.6,42.6,42.6


### Merge weather and train data

In [14]:
# Merge train data with weather based on date and station number
train = train.merge(weather, on=['Date','Station'])

In [15]:
# Create dummies for species columns
df = pd.get_dummies(train, columns=['Species'])

In [16]:
# Create month and day columns
df['Month'] = df['Date'].dt.month
df["Day"] = df['Date'].dt.dayofyear

### Finding location with high WnvPresent

In [17]:
train[train['WnvPresent'] == 1].groupby('Latitude')['Longitude'].value_counts().sort_values(ascending=False)

Latitude   Longitude 
41.974689  -87.890615    29
41.673408  -87.599862    15
41.954690  -87.800991    15
41.964242  -87.757639    14
41.743402  -87.731435    11
                         ..
41.800737  -87.711880     1
41.778748  -87.586427     1
41.772846  -87.740029     1
41.768388  -87.678649     1
41.644612  -87.604498     1
Name: Longitude, Length: 99, dtype: int64

In [18]:
wnvpresent_lat_lon  = [(41.974689, -87.890615), (41.673408, -87.599862)]

In [19]:
# Calculate the haversine distance from train data to coordiates with wnv persent area
for i,area in enumerate(wnvpresent_lat_lon):
    df['Dist_' + str(i)] = [haversine(row, (area),unit='mi') for row in zip(train['Latitude'],train['Longitude'])]
    

In [20]:
# Display numeric data
df._get_numeric_data().columns

Index(['Block', 'Trap', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Month', 'Year', 'Tot_Mos_Species',
       'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
       'Rain', 'PrecipTotal_14', 'PrecipTotal_28', 'PrecipTotal_60', 'Tavg_14',
       'Tavg_28', 'Tavg_60', 'Tmin_14', 'Tmin_28', 'Tmin_60', 'Tmax_14',
       'Tmax_28', 'Tmax_60', 'DewPoint_14', 'DewPoint_28', 'DewPoint_60',
       'Species_Others', 'Species_PIPIENS', 'Species_PIPIENS/RESTUANS',
       'Species_RESTUANS', 'Day', 'Dist_0', 'Dist_1'],
      dtype='object')

In [21]:
# Features to include in model
features = [
      'DewPoint',
       'PrecipTotal', 'StnPressure', 'ResultSpeed', 'ResultDir',
        'Rain', 'PrecipTotal_14', 'PrecipTotal_28', 'PrecipTotal_60',
       'DewPoint_14', 'DewPoint_28','DewPoint_60',
       'Species_Others', 'Species_PIPIENS', 'Species_PIPIENS/RESTUANS',
       'Species_RESTUANS',
        'Month', 'Day',
       'Dist_0', 'Dist_1']

In [22]:
df.columns

Index(['Date', 'Address', 'Block', 'Street', 'Trap', 'AddressNumberAndStreet',
       'Latitude', 'Longitude', 'AddressAccuracy', 'NumMosquitos',
       'WnvPresent', 'Month', 'Year', 'Weekday', 'Tot_Mos_Species', 'Station',
       'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
       'Rain', 'PrecipTotal_14', 'PrecipTotal_28', 'PrecipTotal_60', 'Tavg_14',
       'Tavg_28', 'Tavg_60', 'Tmin_14', 'Tmin_28', 'Tmin_60', 'Tmax_14',
       'Tmax_28', 'Tmax_60', 'DewPoint_14', 'DewPoint_28', 'DewPoint_60',
       'Species_Others', 'Species_PIPIENS', 'Species_PIPIENS/RESTUANS',
       'Species_RESTUANS', 'Day', 'Dist_0', 'Dist_1'],
      dtype='object')

In [23]:
X = df[features]
y = df['WnvPresent']

In [24]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X, y = smote.fit_resample(X, y) 

### Modeling

#### Random Forest Classifier

In [25]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [26]:
rfc = RandomForestClassifier(class_weight='balanced', max_features='sqrt',
                             min_samples_leaf=5, n_estimators=1000, n_jobs=-1, random_state=8)
rfc.fit(X, y)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [27]:
feature_import = rfc.feature_importances_
pd.DataFrame(columns=['feature', 'importance'], 
             data=list(zip(features, feature_import))).sort_values('importance',ascending=False).head(19)


Unnamed: 0,feature,importance
11,DewPoint_60,0.13223
15,Species_RESTUANS,0.10161
17,Dist_0,0.094915
18,Dist_1,0.079636
10,DewPoint_28,0.074179
16,Month,0.072753
14,Species_PIPIENS/RESTUANS,0.057918
9,DewPoint_14,0.051696
13,Species_PIPIENS,0.048151
7,PrecipTotal_28,0.0481


#### SVM

In [28]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [29]:
ss = StandardScaler()

In [30]:
X = ss.fit_transform(X)

In [31]:
svm = SVC(probability=True)

In [32]:
svm.fit(X,y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

# Test export

In [33]:
test['Station'] = [closest_point(x) for x in test[['Latitude','Longitude']].values]
test['Date'] = pd.to_datetime(test['Date'])
test = test.merge(weather, on=['Date','Station'])

In [34]:
test = pd.get_dummies(test, columns=['Species'])

In [35]:
test['Month'] = test['Date'].dt.month
test["Day"] = test['Date'].dt.dayofyear

In [36]:
# Repeat for test data
for i,area in enumerate(wnvpresent_lat_lon):
    test['Dist_' + str(i)] = [haversine(row, (area),unit='mi') for row in zip(test['Latitude'],test['Longitude'])]

In [37]:
# Make match above
X_test = test[features]


In [38]:
def preds(model, X_test):
    if model == 'rfc':
        pred = rfc.predict_proba(X_test)
        submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test['Id'], pred)))
        submission.WnvPresent = submission['WnvPresent'].map(lambda x: x[1])
        submission.to_csv('submission_rfc_new.csv',index=False)        
        
    elif model == 'svm':
        pred = svm.predict_proba(ss.transform(X_test))
        submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test['Id'], pred)))
        submission.WnvPresent = submission['WnvPresent'].map(lambda x: x[1])
        submission.to_csv('submission_svm_new.csv',index=False)        
    return pred

In [39]:
# Whatever model you decided on:
predictions = svm.predict(ss.transform(X_test))

In [40]:
predictions_rfc = preds('rfc', X_test)
predictions_svm = preds('svm', X_test)