## Prepare environment

In [1]:
import pandas as pd
from datetime import timedelta
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn import metrics
import bokeh.plotting as bp
import itertools
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.tools.plotting import autocorrelation_plot
import datetime
from fbprophet import Prophet
from sklearn.preprocessing import MinMaxScaler

## Acquire

In [2]:
df = pd.read_csv('south_central_moisture_data_1988_2017.csv')
df.head()

Unnamed: 0,final_date,soil_moisture,pdsi
0,1988-03-01,4.79,0.0
1,1988-03-08,4.67,0.0
2,1988-03-15,4.24,0.0
3,1988-03-22,5.59,0.0
4,1988-03-29,5.0,0.0


## Scale data

In [3]:
df = df[['final_date', 'soil_moisture']]

mmscaler = MinMaxScaler(feature_range=(0, 1))
mmscaler.fit(df[['soil_moisture']])
df.soil_moisture = mmscaler.transform(df[['soil_moisture']])

df.head()

Unnamed: 0,final_date,soil_moisture
0,1988-03-01,0.598248
1,1988-03-08,0.583229
2,1988-03-15,0.529412
3,1988-03-22,0.698373
4,1988-03-29,0.624531


## Set index

In [4]:
df.final_date = pd.to_datetime(df.final_date)
df = df.set_index('final_date')

In [5]:
df_agg = df.soil_moisture.resample('W').agg('mean')

In [6]:
df_agg.head()

final_date
1988-03-06    0.598248
1988-03-13    0.583229
1988-03-20    0.529412
1988-03-27    0.698373
1988-04-03    0.624531
Freq: W-SUN, Name: soil_moisture, dtype: float64

## Only get data from 1997-present to match PDSI data

In [7]:
df_agg = df_agg['1997':]

In [8]:
df_agg = df_agg.to_frame(name=None)

In [9]:
df_agg.head()

Unnamed: 0_level_0,soil_moisture
final_date,Unnamed: 1_level_1
1997-01-05,0.446809
1997-01-12,0.515645
1997-01-19,0.52816
1997-01-26,0.658323
1997-02-02,0.639549


Compute soil moisture mean from weekly aggregate means.

In [10]:
agg_mean = df_agg.soil_moisture.mean()
agg_mean

0.4830032772573354

Computer soil moisture standard deviation from weekly aggregate means.

In [11]:
agg_stdev = df_agg.soil_moisture.std()
agg_stdev

0.35018915819138896

Compute mean-(4xstdev) to get lower bound

In [12]:
df_agg["lower_bound"] = (agg_mean - (4*agg_stdev))

In [13]:
df_agg.head()

Unnamed: 0_level_0,soil_moisture,lower_bound
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1
1997-01-05,0.446809,-0.917753
1997-01-12,0.515645,-0.917753
1997-01-19,0.52816,-0.917753
1997-01-26,0.658323,-0.917753
1997-02-02,0.639549,-0.917753


In [14]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1105 entries, 1997-01-05 to 2018-03-04
Freq: W-SUN
Data columns (total 2 columns):
soil_moisture    1101 non-null float64
lower_bound      1105 non-null float64
dtypes: float64(2)
memory usage: 25.9 KB


There are four weeks missing data. These will be backfilled.

In [15]:
df_agg[df_agg.soil_moisture.isnull()]

Unnamed: 0_level_0,soil_moisture,lower_bound
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1
1999-02-28,,-0.917753
2004-02-29,,-0.917753
2010-02-28,,-0.917753
2016-02-28,,-0.917753


In [16]:
df_agg.soil_moisture.fillna(method='bfill', inplace=True)

In [17]:
df_agg.soil_moisture.isnull().sum()

0

Created a column for "soil moisture - lower bound"

In [18]:
df_agg["sm_lb"] = (df_agg.soil_moisture - df_agg.lower_bound)

In [19]:
df_agg.head()

Unnamed: 0_level_0,soil_moisture,lower_bound,sm_lb
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1997-01-05,0.446809,-0.917753,1.364562
1997-01-12,0.515645,-0.917753,1.433398
1997-01-19,0.52816,-0.917753,1.445914
1997-01-26,0.658323,-0.917753,1.576076
1997-02-02,0.639549,-0.917753,1.557303


Created a boolean column for whether or not this value falls below normal range

In [20]:
df_agg['outside_normal'] = (df_agg.sm_lb < 1)

In [21]:
df_agg.tail()

Unnamed: 0_level_0,soil_moisture,lower_bound,sm_lb,outside_normal
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-02-04,0.824781,-0.917753,1.742534,False
2018-02-11,0.79975,-0.917753,1.717503,False
2018-02-18,0.813517,-0.917753,1.73127,False
2018-02-25,0.798498,-0.917753,1.716251,False
2018-03-04,0.874844,-0.917753,1.792597,False


In [22]:
df_agg.outside_normal.sum()

195

In [23]:
df_agg.soil_moisture.count()

1105

In [24]:
(195/1105)*100

17.647058823529413

In [37]:
# df_agg['2011':]

In [26]:
sm_non_normal = df_agg[df_agg.sm_lb < 1]

In [39]:
sm_non_normal['2017':]

Unnamed: 0_level_0,soil_moisture,lower_bound,sm_lb,outside_normal
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


## PDSI df

In [28]:
pdsi = pd.read_csv('res_PDSI.csv')

In [29]:
pdsi.head()

Unnamed: 0,date,reservoir,PDSI
0,1997-09-30,285458.0,2.86
1,1997-10-31,282507.709677,2.88
2,1997-11-30,277624.3,2.71
3,1997-12-31,275259.387097,3.07
4,1998-01-31,272926.612903,3.02


In [30]:
pdsi = pdsi[['date', 'PDSI']]

In [31]:
pdsi.tail()

Unnamed: 0,date,PDSI
251,2018-08-31,-3.71
252,2018-09-30,1.5
253,2018-10-31,3.38
254,2018-11-30,3.58
255,2018-12-31,4.57


In [41]:
pdsi['drought'] = (pdsi.PDSI < (-2))

In [44]:
pdsi.sample(10)

Unnamed: 0,date,PDSI,drought
66,2003-03-31,-0.2,False
136,2009-01-31,-3.32,True
52,2002-01-31,-0.54,False
81,2004-06-30,2.28,False
159,2010-12-31,-1.84,False
5,1998-02-28,3.33,False
75,2003-12-31,-1.51,False
205,2014-10-31,-2.92,True
31,2000-04-30,-3.57,True
157,2010-10-31,-0.78,False


    Moderate Drought: PDSI < -2
    Severe Drought: PDSI < -3

In [32]:
drought = pdsi[pdsi.PDSI < (-2)]

In [33]:
drought.date = pd.to_datetime(drought.date)
drought = drought.set_index('date')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [40]:
drought['2018':]

Unnamed: 0_level_0,PDSI
date,Unnamed: 1_level_1
2018-05-31,-2.94
2018-06-30,-3.5
2018-07-31,-3.77
2018-08-31,-3.71


In [35]:
pdsi.PDSI.count()

256

In [36]:
(85/256)*100

33.203125

## Soil moisture monthly aggregation

In [59]:
df_monthly = df.soil_moisture.resample('M').agg('mean')

In [60]:
df_monthly = df_monthly['1997':]

In [61]:
df_monthly = df_monthly.to_frame(name=None)

In [62]:
df_monthly.head()

Unnamed: 0_level_0,soil_moisture
final_date,Unnamed: 1_level_1
1997-01-31,0.557697
1997-02-28,0.766792
1997-03-31,0.948436
1997-04-30,0.989675
1997-05-31,0.977222


Compute soil moisture mean from monthly aggregate means.

In [63]:
monthly_mean = df_monthly.soil_moisture.mean()
monthly_mean

0.48451829550717923

Computer soil moisture standard deviation from monthly aggregate means.

In [64]:
monthly_stdev = df_monthly.soil_moisture.std()
monthly_stdev

0.33799572004510314

Compute mean-(4xstdev) to get lower bound

In [65]:
df_monthly["lower_bound"] = (monthly_mean - (4*monthly_stdev))

In [66]:
df_monthly.head()

Unnamed: 0_level_0,soil_moisture,lower_bound
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1
1997-01-31,0.557697,-0.867465
1997-02-28,0.766792,-0.867465
1997-03-31,0.948436,-0.867465
1997-04-30,0.989675,-0.867465
1997-05-31,0.977222,-0.867465


In [67]:
df_monthly.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 254 entries, 1997-01-31 to 2018-02-28
Freq: M
Data columns (total 2 columns):
soil_moisture    254 non-null float64
lower_bound      254 non-null float64
dtypes: float64(2)
memory usage: 6.0 KB


Created a column for "soil moisture - lower bound"

In [68]:
df_monthly["sm_lb"] = (df_monthly.soil_moisture - df_monthly.lower_bound)

In [69]:
df_monthly.head()

Unnamed: 0_level_0,soil_moisture,lower_bound,sm_lb
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1997-01-31,0.557697,-0.867465,1.425162
1997-02-28,0.766792,-0.867465,1.634256
1997-03-31,0.948436,-0.867465,1.8159
1997-04-30,0.989675,-0.867465,1.857139
1997-05-31,0.977222,-0.867465,1.844686


Created a boolean column for whether or not this value falls below normal range

In [70]:
df_monthly['outside_normal'] = (df_monthly.sm_lb < 1)

In [71]:
df_monthly.tail()

Unnamed: 0_level_0,soil_moisture,lower_bound,sm_lb,outside_normal
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-10-31,0.627972,-0.867465,1.495437,False
2017-11-30,0.491114,-0.867465,1.358578,False
2017-12-31,0.649562,-0.867465,1.517027,False
2018-01-31,0.793492,-0.867465,1.660956,False
2018-02-28,0.821652,-0.867465,1.689117,False


In [72]:
df_monthly.outside_normal.sum()

58

In [73]:
df_monthly.soil_moisture.count()

254

In [74]:
(58/254)*100

22.83464566929134

In [77]:
df_monthly[df_monthly.sm_lb < 1]

Unnamed: 0_level_0,soil_moisture,lower_bound,sm_lb,outside_normal
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-07-31,0.092303,-0.867465,0.959767,True
1998-08-31,0.091114,-0.867465,0.958578,True
1999-09-30,0.083855,-0.867465,0.951319,True
1999-10-31,0.070713,-0.867465,0.938178,True
1999-11-30,0.045557,-0.867465,0.913022,True
1999-12-31,0.048498,-0.867465,0.915963,True
2000-08-31,0.069086,-0.867465,0.936551,True
2000-09-30,0.028473,-0.867465,0.895938,True
2000-10-31,0.11796,-0.867465,0.985425,True
2003-08-31,0.100626,-0.867465,0.96809,True
