In [91]:
#!/usr/bin/env python
# Libraries
import pandas as pd
import numpy as np
import math
from sklearn.externals import joblib

# Load the raw trap observations to score and the daily weather readings.
test = pd.read_csv('../data/input/test.csv')
weather = pd.read_csv('../data/input/weather.csv')

# Parse the Date column of both frames into datetimes.
weather['Date'] = pd.DatetimeIndex(weather['Date'])
test['Date'] = pd.to_datetime(test['Date'])

# PrecipTotal carries leading spaces; strip them so the sentinel values below match exactly.
weather['PrecipTotal'] = weather['PrecipTotal'].str.strip()

# Across the whole frame, cells equal to 'T' become 0.005 and cells equal to 'M' become NaN
# (presumably the "trace" and "missing" markers of the weather file -- TODO confirm).
weather = weather.replace('T', 0.005).replace('M', np.nan)

# Keep every weather column except the ones listed here.
weather_excluded = ['Depth', 'Water1', 'SnowFall', 'Depart', 'Heat', 'Cool', 'Sunrise', 'Sunset']
weather_keep = [name for name in weather.columns if name not in weather_excluded]
weather = weather[weather_keep]

'''
Formula for calculating compass bearing between two lat/lon tuples.  Credit:  https://gist.github.com/jeromer/2005586
Corrected output error in which returned bearing (in degrees) needed to be subtracted from 360 in order to be correct.
Validated this change on https://www.sunearthtools.com/tools/distance.php and with manual orienteering using Google
maps in place of a physical map.

Other than that change at the end of the formula, the code was taken from the gist page linked above.
'''


def compass_bearing(loc1, loc2):
    """
    Calculates the compass bearing from the first point towards the second.
    The formula used is the following:
        θ = atan2(sin(Δlong).cos(lat2),
                  cos(lat1).sin(lat2) − sin(lat1).cos(lat2).cos(Δlong))
    Here Δlong is computed as loc1 - loc2 and the normalized angle is then
    subtracted from 360; the two sign flips cancel, so the result is the
    bearing measured clockwise from north (east = 90, south = 180, west = 270).
    :Parameters:
      - `loc1`: The tuple (or tuple subclass, e.g. a namedtuple) holding the
        latitude/longitude of the first point, in decimal degrees
      - `loc2`: The tuple (or tuple subclass) holding the latitude/longitude
        of the second point, in decimal degrees
    :Returns:
      The bearing in degrees, in the half-open interval (0, 360].  A bearing
      of exactly zero (due north, or two identical points) is reported as
      360.0 rather than 0.0.
      NOTE(review): that convention is kept on purpose so the values stay
      identical to what this function has always produced -- presumably the
      saved model was trained on features built the same way; confirm before
      normalizing to [0, 360).
    :Returns Type:
      float
    :Raises:
      TypeError if either argument is not a tuple.
    """
    # isinstance (rather than an exact type() comparison) also accepts tuple
    # subclasses such as namedtuples; lists and other sequences are still rejected.
    if not (isinstance(loc1, tuple) and isinstance(loc2, tuple)):
        raise TypeError("Only tuples are supported as arguments")

    lat1 = math.radians(loc1[0])
    lat2 = math.radians(loc2[0])

    diff_long = math.radians(loc1[1] - loc2[1])

    x = math.sin(diff_long) * math.cos(lat2)
    y = math.cos(lat1) * math.sin(lat2) - (math.sin(lat1)
            * math.cos(lat2) * math.cos(diff_long))

    # math.atan2 yields an angle in [-180°, +180°]; shift it into [0°, 360°)
    # and mirror it (360 - angle) to undo the reversed sign of diff_long.
    signed_bearing = math.degrees(math.atan2(x, y))
    bearing = 360 - ((signed_bearing + 360) % 360)

    return bearing

'''
Calculate distance in KM between two sets of coordinates (lat/lon tuples).  Uses Haversine formula.  Credit to https://gist.github.com/rochacbruno/2883505 and http://www.movable-type.co.uk/scripts/latlong.html

This formula works as-is.  I tested it using Google Maps distance calculator as a validator.
'''

def distance(loc1, loc2):
    """Great-circle distance in kilometres between two (lat, lon) pairs.

    Uses the Haversine formula with an Earth radius of 6371 km.  Both
    arguments are 2-item sequences of decimal degrees: (latitude, longitude).
    """
    earth_radius_km = 6371
    (lat_a, lon_a), (lat_b, lon_b) = loc1, loc2

    # Half of each angular difference, in radians.
    half_dlat = math.radians(lat_b - lat_a) / 2
    half_dlon = math.radians(lon_b - lon_a) / 2

    sin_half_dlat = math.sin(half_dlat)
    sin_half_dlon = math.sin(half_dlon)

    # Haversine term: square of half the chord length between the points.
    haversine = sin_half_dlat * sin_half_dlat + math.cos(math.radians(lat_a)) \
        * math.cos(math.radians(lat_b)) * sin_half_dlon * sin_half_dlon

    # Central angle between the two points, in radians.
    central_angle = 2 * math.atan2(math.sqrt(haversine), math.sqrt(1 - haversine))

    return earth_radius_km * central_angle

"""
Mapping of each trap to its nearest weather station--compares distance in km between a trap and each airport weather station, and returns the station number ('Station' column) of the nearest weather station.

Weather station info from Kaggle:

Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

"""

# (latitude, longitude) of each weather station, keyed by station number.
station_coords = {
    1: (41.995, -87.933),
    2: (41.786, -87.752),
}

# Pair every row's Latitude and Longitude into a single (lat, lon) tuple.
test['Coordinates'] = test[['Latitude', 'Longitude']].apply(tuple, axis=1)

def assign_station(i):
    """Return the number (1 or 2) of the weather station closest to `i`.

    `i` is a (lat, lon) tuple.  Station 1 is chosen only when it is strictly
    closer; equal distances resolve to station 2.
    """
    km_to_station_1 = distance(station_coords[1], i)
    km_to_station_2 = distance(station_coords[2], i)
    return 1 if km_to_station_1 < km_to_station_2 else 2

# Tag every row with the number of its nearest weather station.
test['Weather_Station'] = test['Coordinates'].apply(assign_station)

# rename the extra trap: rows recorded at these exact coordinates get their own trap name
for alt_coords, alt_name in [
    ((41.992478000000006, -87.862994999999998), 'T009Alt'),
    ((41.763733000000002, -87.742301999999995), 'T035Alt'),
]:
    test.loc[test['Coordinates'] == alt_coords, 'Trap'] = alt_name


# setting up column to build out weather types
# Each key is used twice below: as the regular expression handed to str.contains and as
# the name of the resulting indicator column.  The two keys containing '+' escape it for
# the regex; they are written as raw strings so the backslash is kept without relying on
# the invalid '\+' string escape (same key text as before, no interpreter warning).
weather_types = {
    r'\+FC': 'TORNADO/WATERSPOUT', 'FC': 'FUNNEL CLOUD', 'TS': 'THUNDERSTORM', 'GR': 'HAIL', 'RA': 'RAIN',
    'DZ': 'DRIZZLE', 'SN': 'SNOW', 'SG': 'SNOW GRAINS', 'GS': 'SMALL HAIL ANDOR SNOW PELLETS', 'PL': 'ICE PELLETS',
    'IC': 'ICE CRYSTALS', r'FG\+': 'HEAVY FOG', 'FG': 'FOG', 'BR': 'MIST', 'UP': "UNKNOWN PRECIPITATION", 'HZ': 'HAZE', 'FU': 'SMOKE',
    'VA': 'VOLCANIC ASH', 'DU': 'WIDESPREAD DUST', 'DS': 'DUSTSTORM', 'PO': 'SAND_DUST WHIRLS',
    'SA': 'SAND', 'SS': 'SANDSTORM', 'PY': 'SPRAY', 'SQ': 'SQUALL', 'DR': 'LOW DRIFTING', 'SH': 'SHOWER', 'FZ': 'FREEZING',
    'MI': 'SHALLOW', 'PR': 'PARTIAL', 'BC': 'PATCHES', 'BL': 'BLOWING', 'VC': 'VICINITY',
}

# builds out the identifying weather features
# One 0/1 column per code: 1 where CodeSum matches the code's pattern, 0 otherwise
# (rows where CodeSum is missing count as "no match").
for code in weather_types:
    weather[code] = weather.CodeSum.str.contains(code, na=False).astype('int64')

# CodeSum has been fully expanded into indicator columns; drop the raw text.
weather = weather.drop('CodeSum', axis=1)

# Attach to every weather row the readings of each of the six preceding days.
# The first 12 rows are skipped -- presumably six days x two stations, so that every
# remaining row has six earlier days to join against (TODO confirm the row layout).
# .copy() makes n_weather an independent frame, so adding the helper date columns below
# does not write into a slice of `weather`.
n_weather = weather.iloc[12:,:].copy()
for i in range(1,7): # Hard-coded range of the last 6 days
    n_date = "_date-"+str(i)
    # Date of the observation i days earlier; used as the join key for that lag.
    n_weather[n_date] = n_weather.Date-pd.DateOffset(i)
    # Inner join on (lagged date, station); the lagged copy's columns get the '_date-i' suffix.
    n_weather = pd.merge(left=n_weather,right=weather,left_on=[n_date,'Station'],right_on=['Date','Station'],suffixes =('',n_date))

# Creating de-duped df of traps and coordinates.
# drop_duplicates() returns a new frame, so nothing is modified in place on a
# column slice of `test`.
traps_master = test[['Trap', 'Coordinates']].drop_duplicates()

# Create list of column names to use with for loops below
trap_cols = traps_master.Trap.tolist()

# Create dictionary of Trap IDs and corresponding lat/lon coordinate tuples
trap_dict = traps_master.set_index('Trap')['Coordinates'].to_dict()

# For every trap name, the list of distances (km) from each trap in trap_dict to it.
# List positions follow the iteration order of trap_dict.
distance_dict = {
    trap: [distance(trap_dict[other], trap_dict[trap]) for other in trap_dict]
    for trap in trap_cols
}


# For every trap name, the list of compass bearings from each trap in trap_dict to it.
# List positions follow the iteration order of trap_dict.
bearing_dict = {
    trap: [compass_bearing(trap_dict[other], trap_dict[trap]) for other in trap_dict]
    for trap in trap_cols
}

def _relative_feature_frame(values_by_trap, reference_traps, suffix):
    """Turn {trap: [one value per reference trap]} into a frame with one row per trap.

    :Parameters:
      - `values_by_trap`: dict mapping a trap name to a list holding one value
        for each entry of `reference_traps`, in the same order.
      - `reference_traps`: iterable of the trap names the list positions refer to.
      - `suffix`: text appended to each reference trap name to form a column label.
    :Returns:
      DataFrame indexed by trap name, with one labelled column per reference
      trap plus a 'Trap' column repeating the index so it can be merged on 'Trap'.
    """
    frame = pd.DataFrame.from_dict(values_by_trap, orient='index')
    # Assign a flat list of labels; wrapping the list in another list would ask
    # pandas for a MultiIndex instead of plain column names.
    frame.columns = [trap + suffix for trap in reference_traps]
    frame['Trap'] = frame.index
    return frame


# Creates df of relative distances.  To be merged with a left join into test df.
# The value lists were built by iterating trap_dict, so the labels come from trap_dict too.
dist_df = _relative_feature_frame(distance_dict, trap_dict, '_dist')
distance_labels = dist_df.index.tolist()
distance_cols = [label for label in dist_df.columns if label != 'Trap']
# dist_df.to_csv('../data/relative_locations.csv')


# Creates df of relative bearings.  To be merged with a left join into test df.
bearing_df = _relative_feature_frame(bearing_dict, trap_dict, '_bearing')
bearing_labels = bearing_df.index.tolist()
bearing_cols = [label for label in bearing_df.columns if label != 'Trap']
# bearing_df.to_csv('../data/relative_bearings.csv')

# Left joins keep every test row; the weather join is pandas' default inner join on
# (Date, station number).
test = pd.merge(test, dist_df, how='left', left_on='Trap', right_on='Trap')
test = pd.merge(test, bearing_df, how='left', left_on='Trap', right_on='Trap')
test = pd.merge(test,n_weather,left_on=['Date','Weather_Station'],right_on=['Date','Station'])

# Integer label per species.  The distinct names are sorted first so the mapping is the
# same on every run; iterating a bare set of strings gives an order that can change
# from one interpreter session to the next.
species = sorted(set(test.Species))
species_labels = {name: label for label, name in enumerate(species)}
test['species_labels'] = test['Species'].map(species_labels)

# Convert the date to the epoch (integer nanoseconds floor-divided down to whole seconds)
test['Epoch'] = test.Date.astype(np.int64) // 10**9

# test.to_csv('~/DropBox/DSI/test_transformed.csv')

In [93]:
# Use the sample submission as the output template, without its WnvPresent column.
output_test = pd.read_csv('../data/input/sampleSubmission.csv').drop('WnvPresent', axis=1)


In [96]:
cols = [i for i in test.columns]  # column names as they stand before the NaN fill below

# deal with NaNs
# For each column that contains NaNs: take the mean Tavg over the rows where that column
# is missing, average the column over the rows whose Tavg equals that mean, and write the
# result into the missing cells via a temporary 'to_fill' column.
# NOTE(review): the match on Tavg is an exact equality against a computed mean; if no row
# has exactly that Tavg, mean_i_val is NaN and the column stays unfilled -- confirm intent.
# NOTE(review): this relies on Tavg being numeric at this point -- verify its dtype.
for i in test.columns[test.isnull().sum()>0]: # for all the columns with nans
    mean_avg_temp = test.loc[test[i].isnull()==True,'Tavg'].mean() # what is the mean Tavg
    mean_i_val = test.loc[test.Tavg==mean_avg_temp,i].mean() # what is i val for that Tavg
    test.loc[test[i].isnull()==True,'to_fill'] = mean_i_val  # fill value only on the rows missing column i
    test[i] = test[i].fillna(test.to_fill)
    test.drop('to_fill',axis=1,inplace=True)  # remove the temporary column again

# remove date columns
# Drop the '_date-N' join-key columns and any column whose name contains 'Date_date-'.
# Only columns that are actually present are selected, so no exception handling is
# needed and a genuine error is no longer silently swallowed by a bare `except`.
dates = ['_date-'+str(i) for i in range(1,7)]
lag_date_cols = [
    name for name in test.columns
    if isinstance(name, str) and (name in dates or 'Date_date-' in name)
]
test = test.drop(lag_date_cols, axis=1)
    
# label encode Species and Trap
# NOTE(review): both encoders are fitted on this frame's own values; the integer codes
# only line up with training if training saw the same categories -- confirm.
from sklearn.preprocessing import LabelEncoder,normalize,StandardScaler
le = LabelEncoder()
test['mosquito'] = le.fit_transform(test.Species)
test['trap_e'] = le.fit_transform(test.Trap)  # `le` ends up fitted on Trap

# Feature columns: everything from position 12 onwards, plus three location fields.
cols = list(test.columns)
X_cols = cols[12:] + ['Latitude', 'Longitude', 'AddressAccuracy']

# set up y
# y = train.dropna()['WnvPresent']

In [113]:
# Keep only rows with no missing feature, then standardise each feature column.
# NOTE(review): dropping rows can change the row count relative to `test` -- confirm the
# predictions still line up with the submission template.
# NOTE(review): the scaler is fitted on this data rather than reused from training -- confirm.
features_complete = test[X_cols].dropna()
X_values = features_complete.values # convert to numpy array
ss = StandardScaler()
X_std = ss.fit(X_values).transform(X_values)

## Modeling start

In [100]:
# Restore the fitted estimator from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
model = joblib.load('../models/grad_boost_426.pkl')

In [101]:
# Display the loaded estimator and its hyperparameters.
model

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [102]:
# Preview the first rows of the selected feature columns.
test[X_cols].head()

Unnamed: 0,Weather_Station,T152_dist,T153_dist,T019_dist,T018_dist,T156_dist,T157_dist,T154_dist,T155_dist,T013_dist,...,SG_date-6,PO_date-6,PL_date-6,SN_date-6,Epoch,mosquito,trap_e,Latitude,Longitude,AddressAccuracy
0,1,13.965004,6.194487,8.496345,13.042652,20.8408,18.071554,5.675603,19.935975,3.67863,...,0,0,0,0,1213142400,2,1,41.95469,-87.800991,9
1,1,13.965004,6.194487,8.496345,13.042652,20.8408,18.071554,5.675603,19.935975,3.67863,...,0,0,0,0,1213142400,3,1,41.95469,-87.800991,9
2,1,13.965004,6.194487,8.496345,13.042652,20.8408,18.071554,5.675603,19.935975,3.67863,...,0,0,0,0,1213142400,1,1,41.95469,-87.800991,9
3,1,13.965004,6.194487,8.496345,13.042652,20.8408,18.071554,5.675603,19.935975,3.67863,...,0,0,0,0,1213142400,4,1,41.95469,-87.800991,9
4,1,13.965004,6.194487,8.496345,13.042652,20.8408,18.071554,5.675603,19.935975,3.67863,...,0,0,0,0,1213142400,6,1,41.95469,-87.800991,9


In [108]:
# NOTE(review): the recorded run of this cell failed because X_std had 618 features
# while the loaded model expects 593; X_std has to be restricted to the model's
# training columns before this can succeed.
model_results = model.predict_proba(X_std)
# Column 1 of predict_proba -- presumably the probability of WnvPresent == 1 (TODO confirm class order).
model_wvn_preds = list(model_results[:, 1])
# Work on a copy so the template frame `output_test` is not modified as a side effect.
model_submission = output_test.copy()
model_submission['WnvPresent'] = model_wvn_preds
model_submission.to_csv('../submissions/grad_boost_426.csv',index=False)

ValueError: Number of features of the model must match the input. Model n_features is 593 and input n_features is 618 

In [110]:
# Feature names in the order listed for the model (593 entries) -- presumably the
# columns the saved model was trained on; TODO confirm against the training notebook.
# The list is assembled from its parts instead of being spelled out entry by entry.

# Trap ids, in the order their '_dist' and '_bearing' columns appear.
_model_trap_ids = [
    'T152', 'T153', 'T019', 'T018', 'T156', 'T157', 'T154', 'T155', 'T013', 'T012',
    'T011', 'T159', 'T017', 'T016', 'T015', 'T014', 'T151', 'T080', 'T081', 'T082',
    'T083', 'T084', 'T085', 'T086', 'T088', 'T089', 'T158', 'T149', 'T148', 'T009Alt',
    'T145', 'T144', 'T147', 'T146', 'T141', 'T143', 'T142', 'T903', 'T092', 'T091',
    'T090', 'T097', 'T096', 'T095', 'T094', 'T099', 'T222', 'T138', 'T046', 'T128',
    'T114', 'T027', 'T025', 'T228', 'T229', 'T900', 'T224', 'T225', 'T226', 'T227',
    'T220', 'T221', 'T028', 'T223', 'T044', 'T045', 'T129', 'T047', 'T040', 'T043',
    'T048', 'T049', 'T031', 'T030', 'T033', 'T035', 'T034', 'T037', 'T036', 'T039',
    'T236', 'T235', 'T233', 'T232', 'T231', 'T230', 'T054', 'T051', 'T050', 'T209',
    'T200', 'T206', 'T062', 'T063', 'T060', 'T061', 'T066', 'T067', 'T065', 'T069',
    'T135', 'T054C', 'T238', 'T215', 'T212', 'T219', 'T218', 'T150', 'T115', 'T079',
    'T078', 'T075', 'T074', 'T077', 'T076', 'T071', 'T070', 'T073', 'T072', 'T094B',
    'T100', 'T103', 'T102', 'T107', 'T008', 'T009', 'T162', 'T161', 'T160', 'T001',
    'T002', 'T003', 'T004', 'T005', 'T006', 'T007', 'T237',
]

# Daily weather measurements, followed by the weather-code indicator columns.
_model_weather_numeric = [
    'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'PrecipTotal',
    'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
]
_model_weather_codes = [
    'HZ', 'VA', 'VC', 'GS', 'GR', 'FG\\+', 'BC', 'BL', 'FZ', 'UP', 'FC',
    'DZ', 'BR', 'FG', 'IC', 'DU', 'DR', 'DS', 'FU', 'PR', 'SS', 'SQ',
    'PY', 'MI', 'TS', 'SH', 'RA', '\\+FC', 'SA', 'SG', 'PO', 'PL', 'SN',
]
_model_weather_daily = _model_weather_numeric + _model_weather_codes

model_columns = (
    ['Weather_Station']
    + [trap + '_dist' for trap in _model_trap_ids]
    + [trap + '_bearing' for trap in _model_trap_ids]
    + ['Station']
    + _model_weather_daily
    # The same weather fields again for each of the six preceding days.
    + [name + '_date-' + str(lag) for lag in range(1, 7) for name in _model_weather_daily]
    + ['Trap_Mosquitos_today', 'Trap_Species_today', 'min_Wnv_species_today',
       'Epoch', 'mosquito', 'trap_e', 'Latitude', 'Longitude', 'AddressAccuracy']
)

In [111]:
# Number of feature names listed for the model.
len(model_columns)

593

In [117]:
# Candidate feature columns of the test frame: every column from position 12 onwards.
test_cols = list(test.columns[12:])
test_cols

['Weather_Station',
 'T152_dist',
 'T153_dist',
 'T019_dist',
 'T018_dist',
 'T156_dist',
 'T157_dist',
 'T154_dist',
 'T155_dist',
 'T013_dist',
 'T012_dist',
 'T011_dist',
 'T159_dist',
 'T017_dist',
 'T016_dist',
 'T015_dist',
 'T014_dist',
 'T151_dist',
 'T080_dist',
 'T081_dist',
 'T082_dist',
 'T083_dist',
 'T084_dist',
 'T085_dist',
 'T086_dist',
 'T090A_dist',
 'T088_dist',
 'T089_dist',
 'T090C_dist',
 'T218B_dist',
 'T158_dist',
 'T218A_dist',
 'T149_dist',
 'T148_dist',
 'T009Alt_dist',
 'T145_dist',
 'T144_dist',
 'T147_dist',
 'T146_dist',
 'T141_dist',
 'T143_dist',
 'T142_dist',
 'T128A_dist',
 'T903_dist',
 'T092_dist',
 'T091_dist',
 'T090_dist',
 'T097_dist',
 'T096_dist',
 'T095_dist',
 'T094_dist',
 'T099_dist',
 'T222_dist',
 'T138_dist',
 'T046_dist',
 'T128_dist',
 'T002B_dist',
 'T114_dist',
 'T002A_dist',
 'T027_dist',
 'T090B_dist',
 'T025_dist',
 'T228_dist',
 'T229_dist',
 'T900_dist',
 'T224_dist',
 'T225_dist',
 'T226_dist',
 'T227_dist',
 'T220_dist',
 'T

In [118]:
# Number of candidate feature columns in the test frame, for comparison with the model's count.
len(test_cols)

615