In [17]:
import numpy as np
import pandas as pd
import re
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
# from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import confusion_matrix as cm
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# data comes from Kaggle competition.  
train_data   = pd.read_csv('./assets/train.csv')
test_data    = pd.read_csv('./assets/test.csv', index_col='Id')
weather_data = pd.read_csv('./assets/weather_station_avg.csv')

In [4]:
# train_data = pd.merge(train_data, weather_data, how='inner', on='Date', sort=False, validate='m:1')
train_data = train_data.merge(weather_data, on='Date')
test_data  = test_data.merge(weather_data, on="Date")

train_data.shape, test_data.shape

((10506, 39), (116293, 37))

In [5]:
# to sort the values of the bugs that can have west nile from the ones that don't.
def processing_species(df):
    for bug in df:
        if bug == 'CULEX PIPIENS/RESTUANS':
            return 'wnv_bug'
        elif bug == 'CULEX PIPIENS':
            return 'wnv_bug'
#         elif bug == 'CULEX RESTUANS':
#             return 'wnv_bug'
        elif bug == 'UNSPECIFIED CULEX':  # Assuming that the unspecified bug is a virus carrier.
            return 'wnv_bug'
        else:
            return 'no_wnv_bug'

train_data['Species'] = train_data[['Species']].apply(processing_species, axis=1)
test_data['Species']  = test_data[['Species']].apply(processing_species, axis=1)

In [6]:
# Break down to see if specific days are the most useful. My assumptions
def date_separate(df):
    df = df.copy()
    df['Year'] = pd.DatetimeIndex(df['Date']).year
    df['Month'] = pd.DatetimeIndex(df['Date']).month
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

train_data = date_separate(train_data)
test_data  = date_separate(test_data)

In [7]:
# Removing date, because we 
def processing(df):
    df.drop(['Date', 
             'Address', 
             'Street', 
             'AddressNumberAndStreet', 
             'Unnamed: 0',
             ], axis=1, inplace=True)
    df['Trap'] = [x.strip('TABCabc') for x in df['Trap']]
    df['Trap'].astype(int)
    df = pd.get_dummies(df, columns=['Species'])
    return df

train_data = processing(train_data)
test_data  = processing(test_data)

In [8]:
# the difference in shapes is fine because we have the Target still in the Training Data.
train_data.shape, test_data.shape

((10506, 38), (116293, 36))

In [9]:
target   = train_data.NumMosquitos
features = train_data.drop(['WnvPresent', 'NumMosquitos'], axis=1) 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42, stratify=target)

In [18]:
# Using these three different models for classification.
ss  = StandardScaler()

lin = LinearRegression()

In [19]:
# pipeline to streamline the process.
pipe_lin = Pipeline([
    ('ss', ss),
    ('lin', lin)
])

In [20]:
%%time
pipes    = [ 
    pipe_lin, 
]
pipe_idx = {0: 'Linear Regression'}

for idx, pipe in enumerate(pipes):
    pipe.fit(X_train, y_train)
    print('\nScore Train/Test: %s' % pipe_idx[idx])
    print(pipe.score(X_train, y_train))
    print(pipe.score(X_test, y_test))
#     print('Best params: %s' % pipe.best_params_)


Score Train/Test: Linear Regression
0.2136819210086861
0.20365302708183106
CPU times: user 576 ms, sys: 50.1 ms, total: 626 ms
Wall time: 624 ms


In [22]:
feat = pipe_lin.named_steps['lin']

importance = feat.coef_

# Daylight and months correlate to being the most important features.  Clearly based on time of year
# In addition with latitude and longitude to target the specific
sorted(list(zip(importance, features.keys())), reverse=True)

[(119624119757557.97, 'Tavg'),
 (45219768428659.49, 'Heat'),
 (7600142470068.219, 'CodeSum_TS'),
 (1008906699894.123, 'CodeSum_RA DZ'),
 (8.92885553117782, 'Daylight'),
 (6.811251270407045, 'Month'),
 (3.815944668076004, 'Tmax'),
 (3.697151025205107, 'Tmin'),
 (3.571317900905999, 'StnPressure'),
 (2.351179097036793, 'DewPoint'),
 (1.958373614577606, 'Day'),
 (1.9055359103492804, 'CodeSum_TS BR'),
 (1.6590098561822562, 'AvgSpeed'),
 (1.5665264853705845, 'Depart'),
 (1.3280808397915165, 'CodeSum_TS RA'),
 (1.3217033111263747, 'Trap'),
 (0.7143693619673206, 'CodeSum_RA'),
 (0.6036502673652226, 'CodeSum_RA BR'),
 (0.5072846265500657, 'CodeSum_BR'),
 (0.2572358638131105, 'Year'),
 (0.17013511714762725, 'CodeSum_TS RA BR'),
 (0.05821936103722561, 'SeaLevel'),
 (-0.13897114691969406, 'CodeSum_RA DZ BR'),
 (-0.29006246122608037, 'CodeSum_DZ BR'),
 (-0.9133690925223336, 'ResultDir'),
 (-1.0982220296237133, 'ResultSpeed'),
 (-1.417154066678333, 'Block'),
 (-2.4116946240847783, 'PrecipTotal'),
 (

In [23]:
predictions = pipe_lin.predict(test_data)

In [24]:
test_data['NumMosquitos'] = predictions
test_data.to_csv('test_data_nummosquitos.csv', index=False)