# Kaggle project

Analyzing weather data and GIS data and predicting whether or not West Nile virus is present, for a given time, location, and species

In [529]:
import numpy as np
import pandas as pd

from datetime import timedelta

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_selection import RFECV
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics


pd.set_option('display.max_columns', 500)

In [363]:
# load datasets

weather = pd.read_csv('assets/weather.csv')
spray = pd.read_csv('assets/spray.csv')
train = pd.read_csv('assets/train.csv')
test = pd.read_csv('assets/test.csv')

In train and test data:
- NumMosquitos: number of mosquitoes caught in this trap
- WnvPresent: whether West Nile Virus was present in these mosquitos. 1 means WNV is present, and 0 means not present. 


## Join weather and spray info to a new training dataset

In [422]:
test.Species.unique()

array(['CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS',
       'CULEX SALINARIUS', 'CULEX TERRITANS', 'CULEX TARSALIS',
       'UNSPECIFIED CULEX', 'CULEX ERRATICUS'], dtype=object)

In [380]:
# check if spray
def r(x):
    return round(x * 1000)

sl = zip(spray.Latitude.map(r), spray.Longitude.map(r))

train['ll_loc'] = zip(train.Latitude.map(r), train.Longitude.map(r))

def check(x):
    if x in sl:
        return False
    else:
        return True

train = train[train.ll_loc.map(check)]

In [383]:
# dentifies which weather station is closest to the trap

station_1_lat = 41.995
station_1_long = -87.933
station_2_lat = 41.786
station_2_long = -87.752

train['lat_long_comb'] = zip(train['Latitude'],train['Longitude'])
test['lat_long_comb'] = zip(test['Latitude'],test['Longitude'])

def station(value):
    lat, lon = value
    dist_1 = np.sqrt(abs(station_1_lat - lat) + abs(station_1_long - lon))
    dist_2 = np.sqrt(abs(station_2_lat - lat) + abs(station_2_long - lon))
    if dist_1 > dist_2:
        return 2
    else:
        return 1

train['closest_station'] = train['lat_long_comb'].apply(station)
test['closest_station'] = test['lat_long_comb'].apply(station)

In [386]:
# deal with weather data
cols= ['Station', 'Date', 'Tavg', 'DewPoint', 'WetBulb', 
       'Heat', 'Cool', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'AvgSpeed']
weather_chosen = weather[cols]

def checkifM(x):
    if x == 'M':
        return np.nan
    else:
        return x
        
weather_chosen = weather_chosen.applymap(checkifM)
weather_chosen = weather_chosen.fillna(method='pad')

# convert data to float
col = [c for c in weather_chosen.columns.tolist() if c not in ['Date']]
weather_chosen[col] = weather_chosen[col].astype(float)

weather_chosen['Date'] = pd.to_datetime(weather_chosen['Date'])
weather_chosen.set_index('Date', inplace = True)
weather_chosen['Date'] = weather_chosen.index

w_1 = weather_chosen[weather_chosen.Station == 1]
w_2 = weather_chosen[weather_chosen.Station == 2]

In [387]:
# rebuild weather data
def rolling(value, col):
    date_index = w_1.index.get_loc(value)
    if date_index > 7:
        week_slice = w_1.loc[value - timedelta(days=6):value,col]
        week_slice_df = pd.DataFrame(week_slice, columns = ['Date',col])
        avg = np.mean(week_slice_df[col])
        return avg

for colu in w_1.columns:
    newColumn = 'Rolling' + colu
    w_1[newColumn] = w_1['Date'].apply(rolling, col=colu)

for colu in w_2.columns:
    newColumn = 'Rolling' + colu
    w_2[newColumn] = w_2['Date'].apply(rolling, col=colu)    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [428]:
training = train.copy()
testing = test.copy()

In [429]:
# left join traing
training_left_1 = training[training.closest_station == 1]
training_left_2 = training[training.closest_station == 2]

testing_left_1 = testing[testing.closest_station == 1]
testing_left_2 = testing[testing.closest_station == 2]


training_left_1['Date'] = pd.to_datetime(training_left_1['Date'])
training_left_2['Date'] = pd.to_datetime(training_left_2['Date'])

testing_left_1['Date'] = pd.to_datetime(testing_left_1['Date'])
testing_left_2['Date'] = pd.to_datetime(testing_left_2['Date'])

right_1 = w_1.iloc[8:, 10:-1]
right_2 = w_2.iloc[8:, 10:-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [475]:
training_station_1 = pd.merge(training_left_1, right_1, how='left', on='Date')
training_station_2 = pd.merge(training_left_2, right_2, how='left', on='Date')

testing_station_1 = pd.merge(testing_left_1, right_1, how='left', on='Date')
testing_station_2 = pd.merge(testing_left_2, right_2, how='left', on='Date')

training = training_station_1.append(training_station_2)
testing = testing_station_1.append(testing_station_2)

In [476]:
training.drop(['Trap', 'Address', 'Street', 'AddressNumberAndStreet', 'll_loc', 'Date',
         'lat_long_comb', 'closest_station', 'RollingStation', 'NumMosquitos'], axis=1, inplace=True)
dummies = pd.get_dummies(training.Species, drop_first=True)
training = pd.concat([dummies, training], axis=1)
training.drop('Species', axis=1, inplace=True)

In [477]:
testing.drop(['Trap', 'Address', 'Street', 'AddressNumberAndStreet', 'Id', 'Date',
         'lat_long_comb', 'closest_station', 'RollingStation'], axis=1, inplace=True)
dummies = pd.get_dummies(testing.Species, drop_first=True)
testing = pd.concat([dummies, testing], axis=1)
testing.drop(['Species', 'UNSPECIFIED CULEX'], axis=1, inplace=True)

In [478]:
print testing.shape, training.shape

(116293, 19) (9543, 20)


## Feature selection

In [523]:
columns = training.columns
columns = [x for x in columns if x not in ['WnvPresent']]

X = training[columns]
X.Longitude = X.Longitude.map(np.abs)

y = df.WnvPresent.values

In [526]:
skb_f = SelectKBest(f_classif, k=5)
skb_chi2 = SelectKBest(chi2, k=5)

# train the selector on data
skb_f.fit(X, y)
skb_chi2.fit(X, y)

# examine results
kbest = pd.DataFrame([columns, list(skb_f.scores_), list(skb_chi2.scores_)], 
                     index=['feature','f_classif','chi2 score']).T.sort_values('f_classif', ascending=False)
kbest

Unnamed: 0,feature,f_classif,chi2 score
0,CULEX PIPIENS,90.287,66.211
11,RollingDewPoint,89.5961,46.8695
2,CULEX RESTUANS,78.1767,57.5322
12,RollingWetBulb,66.6097,26.5039
15,RollingStnPressure,42.0878,0.00990922
16,RollingSeaLevel,37.6438,0.00920372
13,RollingHeat,37.3699,130.021
17,RollingResultSpeed,30.4363,8.80291
10,RollingTavg,29.2229,13.0994
8,Longitude,27.7864,0.00305188


In [527]:
lr = LogisticRegression()
selector = RFECV(lr, step=1, cv=10)
selector = selector.fit(X, y)

print selector.support_
print selector.ranking_

rfecv_columns = np.array(columns)[selector.support_]
rfecv_columns

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


array(['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS',
       'CULEX SALINARIUS', 'CULEX TARSALIS', 'CULEX TERRITANS', 'Block',
       'Latitude', 'Longitude', 'AddressAccuracy', 'RollingTavg',
       'RollingDewPoint', 'RollingWetBulb', 'RollingHeat', 'RollingCool',
       'RollingStnPressure', 'RollingSeaLevel', 'RollingResultSpeed',
       'RollingAvgSpeed'], 
      dtype='|S22')

In [530]:
# Feature elimination using the lasso penalty
ss = StandardScaler()
Xs = ss.fit_transform(X)

lrcv = LogisticRegressionCV(penalty='l1', Cs=100, cv=10, solver='liblinear')
lrcv.fit(Xs, y)

KeyboardInterrupt: 

In [None]:
lrcv.C_

In [None]:
# What are the best coefficients according to a model using lasso?
coeffs = pd.DataFrame(lrcv.coef_, columns=X.columns)
coeffs_t = coeffs.transpose()
coeffs_t.columns = ['lasso_coefs']
coeffs_abs = coeffs_t.abs().sort_values('lasso_coefs', ascending=False)
coeffs_abs

## Build model

In [479]:
print training.WnvPresent.value_counts()
baseline = float(9061) / (9061 + 482)
baseline

0    9061
1     482
Name: WnvPresent, dtype: int64


0.9494917740752384

Without feature selection

In [480]:
columns = training.columns
columns = [x for x in columns if x not in ['WnvPresent']]

X = training[columns]
y = df.WnvPresent.values

In [495]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

In [500]:
# fit training data into model
knn = KNeighborsClassifier(n_neighbors=1)
model = knn.fit(Xs, y)

# evaluate
print model.score(Xs, y), np.mean(cross_val_score(knn, Xs, y, cv=5))

0.973488420832 0.83306777754


In [485]:
# predict testing data
result = model.predict_proba(testing)