# Kaggle project

Analyzing weather data and GIS data and predicting whether or not West Nile virus is present, for a given time, location, and species

In [1]:
import numpy as np
import pandas as pd

from datetime import timedelta

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics


pd.set_option('display.max_columns', 500)

In [2]:
# load datasets

weather = pd.read_csv('assets/weather.csv')
spray = pd.read_csv('assets/spray.csv')
train = pd.read_csv('assets/train.csv')
test = pd.read_csv('assets/test.csv')

In train and test data:
- NumMosquitos: number of mosquitoes caught in this trap
- WnvPresent: whether West Nile Virus was present in these mosquitos. 1 means WNV is present, and 0 means not present. 


## Join weather and spray info to a new training dataset

In [3]:
test.Species.unique()

array(['CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS',
       'CULEX SALINARIUS', 'CULEX TERRITANS', 'CULEX TARSALIS',
       'UNSPECIFIED CULEX', 'CULEX ERRATICUS'], dtype=object)

In [4]:
# check if spray
def r(x):
    return round(x * 1000)

sl = zip(spray.Latitude.map(r), spray.Longitude.map(r))

train['ll_loc'] = zip(train.Latitude.map(r), train.Longitude.map(r))

def check(x):
    if x in sl:
        return False
    else:
        return True

train = train[train.ll_loc.map(check)]

In [5]:
# dentifies which weather station is closest to the trap

station_1_lat = 41.995
station_1_long = -87.933
station_2_lat = 41.786
station_2_long = -87.752

train['lat_long_comb'] = zip(train['Latitude'],train['Longitude'])
test['lat_long_comb'] = zip(test['Latitude'],test['Longitude'])

def station(value):
    lat, lon = value
    dist_1 = np.sqrt(abs(station_1_lat - lat) + abs(station_1_long - lon))
    dist_2 = np.sqrt(abs(station_2_lat - lat) + abs(station_2_long - lon))
    if dist_1 > dist_2:
        return 2
    else:
        return 1

train['closest_station'] = train['lat_long_comb'].apply(station)
test['closest_station'] = test['lat_long_comb'].apply(station)

In [6]:
# deal with weather data
cols= ['Station', 'Date', 'Tavg', 'DewPoint', 'WetBulb', 
       'Heat', 'Cool', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'AvgSpeed']
weather_chosen = weather[cols]

def checkifM(x):
    if x == 'M':
        return np.nan
    else:
        return x
        
weather_chosen = weather_chosen.applymap(checkifM)
weather_chosen = weather_chosen.fillna(method='pad')

# convert data to float
col = [c for c in weather_chosen.columns.tolist() if c not in ['Date']]
weather_chosen[col] = weather_chosen[col].astype(float)

weather_chosen['Date'] = pd.to_datetime(weather_chosen['Date'])
weather_chosen.set_index('Date', inplace = True)
weather_chosen['Date'] = weather_chosen.index

w_1 = weather_chosen[weather_chosen.Station == 1]
w_2 = weather_chosen[weather_chosen.Station == 2]

In [7]:
# rebuild weather data
def rolling(value, col):
    date_index = w_1.index.get_loc(value)
    if date_index > 7:
        week_slice = w_1.loc[value - timedelta(days=6):value,col]
        week_slice_df = pd.DataFrame(week_slice, columns = ['Date',col])
        avg = np.mean(week_slice_df[col])
        return avg

for colu in w_1.columns:
    newColumn = 'Rolling' + colu
    w_1[newColumn] = w_1['Date'].apply(rolling, col=colu)

for colu in w_2.columns:
    newColumn = 'Rolling' + colu
    w_2[newColumn] = w_2['Date'].apply(rolling, col=colu)    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
training = train.copy()
testing = test.copy()

In [9]:
# left join traing
training_left_1 = training[training.closest_station == 1]
training_left_2 = training[training.closest_station == 2]

testing_left_1 = testing[testing.closest_station == 1]
testing_left_2 = testing[testing.closest_station == 2]


training_left_1['Date'] = pd.to_datetime(training_left_1['Date'])
training_left_2['Date'] = pd.to_datetime(training_left_2['Date'])

testing_left_1['Date'] = pd.to_datetime(testing_left_1['Date'])
testing_left_2['Date'] = pd.to_datetime(testing_left_2['Date'])

right_1 = w_1.iloc[8:, 10:-1]
right_2 = w_2.iloc[8:, 10:-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
training_station_1 = pd.merge(training_left_1, right_1, how='left', on='Date')
training_station_2 = pd.merge(training_left_2, right_2, how='left', on='Date')

testing_station_1 = pd.merge(testing_left_1, right_1, how='left', on='Date')
testing_station_2 = pd.merge(testing_left_2, right_2, how='left', on='Date')

training = training_station_1.append(training_station_2)
testing = testing_station_1.append(testing_station_2)

In [11]:
training.drop(['Trap', 'Address', 'Street', 'AddressNumberAndStreet', 'll_loc', 'Date',
         'lat_long_comb', 'closest_station', 'RollingStation', 'NumMosquitos'], axis=1, inplace=True)
dummies = pd.get_dummies(training.Species, drop_first=True)
training = pd.concat([dummies, training], axis=1)
training.drop('Species', axis=1, inplace=True)

In [12]:
testing.drop(['Trap', 'Address', 'Street', 'AddressNumberAndStreet', 'Id', 'Date',
         'lat_long_comb', 'closest_station', 'RollingStation'], axis=1, inplace=True)
dummies = pd.get_dummies(testing.Species, drop_first=True)
testing = pd.concat([dummies, testing], axis=1)
testing.drop(['Species', 'UNSPECIFIED CULEX'], axis=1, inplace=True)

In [13]:
print testing.shape, training.shape

(116293, 19) (9543, 20)


## Feature selection

In [92]:
columns = training.columns
columns = [x for x in columns if x not in ['WnvPresent']]

X = training[columns]
X.Longitude = X.Longitude.map(np.abs)

y = training.WnvPresent.values

In [132]:
# Feature elimination using the lasso penalty
lr = LogisticRegression()

ss = StandardScaler()
Xs = ss.fit_transform(X)
XX_t = ss.transform(testing)

selector = RFECV(lr, step=1, cv=10)
selector = selector.fit(Xs, y)
XX = selector.transform(Xs)
print XX.shape

XX_testing = selector.transform(XX_t)
print XX_testing.shape

(9543, 7)
(116293, 7)


## Build model

In [133]:
model = lr.fit(XX, y)
print model.score(XX, y), np.mean(cross_val_score(logit, XX, y, cv=10))
result = model.predict_proba(XX_testing)

re = pd.DataFrame(result, columns=['WnvPresent', 'no'])
re.index += 1
re.to_csv('result.csv', columns=['WnvPresent'])
re

0.949491774075 0.949492014608


Unnamed: 0,WnvPresent,no
1,0.939345,0.060655
2,0.979281,0.020719
3,0.911038,0.088962
4,0.979281,0.020719
5,0.999127,0.000873
6,0.979281,0.020719
7,0.979281,0.020719
8,0.979281,0.020719
9,0.939345,0.060655
10,0.979281,0.020719
