In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv('../data/dataframe.csv')

**Columns have been shifted**

In [3]:
df.columns

Index(['Date', 'Address', 'Species', 'Street', 'Trap',
       'AddressNumberAndStreet', 'NumMosquitos', 'WnvPresent', 'Day_length',
       'Tmax', 'Tmin', 'Tavg', 'Day_length_shift', 'Tavg_shift', 'ResultSpeed',
       'ResultSpeed_shift', 'ResultDir', 'AvgSpeed', 'ResultDir_shift',
       'AvgSpeed_shift', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
       'WetBulb', 'Cool', 'CodeSum', 'PrecipTotal', 'StnPressure'],
      dtype='object')

In [4]:
plt.figure(figsize=(15,15))
df.corr()

Unnamed: 0,NumMosquitos,WnvPresent,Day_length,Tmax,Tmin,Tavg,Day_length_shift,Tavg_shift,ResultSpeed,ResultSpeed_shift,...,AvgSpeed_shift,Sunset,Sunrise,Heat,Depart,DewPoint,WetBulb,Cool,PrecipTotal,StnPressure
NumMosquitos,1.0,0.19682,0.07114,0.15931,0.196325,0.187541,0.153381,0.124675,0.003658,-0.056444,...,-0.028559,0.088884,-0.044844,-0.130254,0.104274,0.159264,0.173568,0.183511,-0.041789,0.02242
WnvPresent,0.19682,1.0,-0.084682,0.048244,0.074048,0.064256,0.070725,0.088897,-0.046298,-0.088235,...,-0.098491,-0.068451,0.105227,-0.05474,0.055436,0.085883,0.083225,0.058101,-0.00222,0.022815
Day_length,0.07114,-0.084682,1.0,0.179942,0.238243,0.219781,0.251879,-0.087166,0.088805,0.299927,...,0.327499,0.992948,-0.986366,-0.236917,-0.256982,0.138682,0.121697,0.175268,0.033399,-0.125237
Tmax,0.15931,0.048244,0.179942,1.0,0.785216,0.950719,0.472377,0.278268,-0.046229,-0.06405,...,-0.016125,0.220295,-0.119714,-0.740174,0.820943,0.745688,0.772449,0.89258,0.018245,-0.069216
Tmin,0.196325,0.074048,0.238243,0.785216,1.0,0.938078,0.565114,0.312613,0.018924,-0.118545,...,-0.125385,0.279003,-0.176084,-0.681273,0.714957,0.898852,0.819149,0.903877,0.104099,-0.116735
Tavg,0.187541,0.064256,0.219781,0.950719,0.938078,1.0,0.547359,0.313879,-0.019379,-0.095168,...,-0.069731,0.262313,-0.155594,-0.753615,0.814752,0.865062,0.840425,0.950617,0.064626,-0.097022
Day_length_shift,0.153381,0.070725,0.251879,0.472377,0.565114,0.547359,1.0,0.460263,-0.156871,-0.0684,...,-0.050967,0.30228,-0.175856,-0.527872,0.156968,0.469396,0.471793,0.464724,0.030486,-0.025099
Tavg_shift,0.124675,0.088897,-0.087166,0.278268,0.312613,0.313879,0.460263,1.0,-0.255903,-0.158015,...,-0.161182,-0.048668,0.138564,-0.325041,0.171394,0.257635,0.36522,0.256593,-0.162736,0.19464
ResultSpeed,0.003658,-0.046298,0.088805,-0.046229,0.018924,-0.019379,-0.156871,-0.255903,1.0,0.138466,...,0.099184,0.067839,-0.115829,0.032318,0.0019,-0.036898,-0.04794,-0.010058,0.026455,-0.041376
ResultSpeed_shift,-0.056444,-0.088235,0.299927,-0.06405,-0.118545,-0.095168,-0.0684,-0.158015,0.138466,1.0,...,0.930663,0.270873,-0.333232,0.015145,-0.125322,-0.188415,-0.166943,-0.117182,-0.227953,-0.041388


<Figure size 1080x1080 with 0 Axes>

In [5]:
df['Day_length'].index[84:]

RangeIndex(start=84, stop=10506, step=1)

In [6]:
np.corrcoef(df['Tavg'].rolling(3).mean().shift(14).dropna(), df['NumMosquitos'].drop(df.index[0:16]))

array([[1.        , 0.18465773],
       [0.18465773, 1.        ]])

### 'Grid Searching' Through Rolling & Shifted Feature Means

We created a function that will help us search through different permutations of rolling means, along with shifted days, to see which configuration will give us the highest correlation.

In [7]:
def hi_corr(f1, f2, rm_min=3, rm_max=7, min_val=1, max_val=30):
    corr_array = []
    new_array = []
    for k in range(rm_min, rm_max + 1):
        for i in range(min_val, max_val + 1):
            new_array.append(np.corrcoef(f1.rolling(k).mean().shift(i).dropna(), f2.drop(f2.index[:i+k-1]))[0,1])
            corr_array.append(np.corrcoef(f1.rolling(k).mean().shift(i).dropna(), f2.drop(f2.index[:i+k-1]))[0,1])
            new_array.append('Rolling Mean: {}'.format(k))
            new_array.append('Shifted Value: {}'.format(i))
    corr_max = max(corr_array)
    windex = new_array.index(corr_max)
    print(corr_max, new_array[windex + 1], new_array[windex + 2])
    return

Best Daylength Rolling Mean

In [20]:
hi_corr(df['Day_length'], df['NumMosquitos'], rm_min=1, rm_max=7, min_val=20, max_val=100)

0.09606920821956817 Rolling Mean: 4 Shifted Value: 75


### Temperature Related

Best Temperature Rolling Mean

In [10]:
hi_corr(df['Tavg'], df['NumMosquitos'], rm_min=1, rm_max=7, min_val=7, max_val=21)

0.1853162276195896 Rolling Mean: 7 Shifted Value: 12


Best Departure Rolling Mean

In [40]:
hi_corr(df['Depart'], df['NumMosquitos'], rm_min=1, rm_max=7, min_val=7, max_val=21)

0.1010909163192755 Rolling Mean: 4 Shifted Value: 15


### Wind Related

Best Wind Speed Rolling Mean

In [22]:
hi_corr(df['ResultSpeed'], df['NumMosquitos'], rm_min=1, rm_max=7, min_val=14, max_val=28)

-0.009308151755235217 Rolling Mean: 1 Shifted Value: 15


In [25]:
hi_corr(df['ResultDir'], df['NumMosquitos'], rm_min=1, rm_max=7, min_val=14, max_val=28)

-0.02612574399157251 Rolling Mean: 7 Shifted Value: 28


### Precipitation Related

Best Total Precipitation

In [31]:
hi_corr(df['PrecipTotal'], df['NumMosquitos'], rm_min=1, rm_max=100, min_val=7, max_val=100)

0.15863625225249642 Rolling Mean: 100 Shifted Value: 61


The best rolling mean values for `PrecipTotal` tend to be the max passed through it. Given that, it appears that an expanding mean for total precipitation may give us the best results.

Best Wetbulb Rolling Mean
>The lowest temperature that can be reached by evaporating water into the air. Note: the wet bulb temperature will always be less than or equal to the temperature. It feels more comfortable when wet-bulb temperature is low. [Source] (http://apollo.lsc.vsc.edu/classes/met130/notes/chapter4/wet_bulb.html)

In [32]:
hi_corr(df['WetBulb'], df['NumMosquitos'], rm_min=1, rm_max=100, min_val=7, max_val=100)

0.19604921383756166 Rolling Mean: 100 Shifted Value: 48


Best Dewpoint Rolling Mean
 > the atmospheric temperature (varying according to pressure and humidity) below which water droplets begin to condense and dew can form.

In [33]:
hi_corr(df['WetBulb'], df['NumMosquitos'], rm_min=1, rm_max=100, min_val=7, max_val=100)

0.19604921383756166 Rolling Mean: 100 Shifted Value: 48


In [9]:
# features = ['Tavg']

# new_df = df[['WnvPresent']].set_index(df['Date'])

# for f in features:
#     for roll in [1,2,3,4,5,6,7]:
#         columns = list(new_df.columns)
#         columns.append(f"{f}_with_roll_of_{roll}")
#         new_df = pd.concat([new_df, df[[f]].rolling(roll).mean().set_index(df['Date']).replace()], axis=1)
#         new_df.columns = columns
    
# sns.heatmap(new_df.corr())