## Prepare environment

In [4]:
import pandas as pd
from datetime import timedelta
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn import metrics
import bokeh.plotting as bp
import itertools
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.tools.plotting import autocorrelation_plot
import datetime
from fbprophet import Prophet
from sklearn.preprocessing import MinMaxScaler

## Acquire

In [34]:
# Load the raw soil-moisture / PDSI observations from the CSV file that sits
# next to the notebook, then preview the first rows.
df = pd.read_csv(filepath_or_buffer='south_central_moisture_data_1988_2017.csv')
df.head(n=5)

Unnamed: 0,final_date,soil_moisture,pdsi
0,1988-03-01,4.79,0.0
1,1988-03-08,4.67,0.0
2,1988-03-15,4.24,0.0
3,1988-03-22,5.59,0.0
4,1988-03-29,5.0,0.0


## Scale data

In [35]:
# Keep only the date and the measurement that is modelled from here on.
df = df.loc[:, ['final_date', 'soil_moisture']]

# Min-max scale soil moisture into [0, 1]. The fitted scaler stays bound to
# `mmscaler`; fit() returns the estimator itself, so construction and fitting
# are combined into a single expression.
mmscaler = MinMaxScaler(feature_range=(0, 1)).fit(df[['soil_moisture']])
df['soil_moisture'] = mmscaler.transform(df[['soil_moisture']])

df.head(n=5)

Unnamed: 0,final_date,soil_moisture
0,1988-03-01,0.598248
1,1988-03-08,0.583229
2,1988-03-15,0.529412
3,1988-03-22,0.698373
4,1988-03-29,0.624531


## Set index

In [36]:
# Parse the date strings into timestamps and make them the index in one chain,
# so the time-based resampling and slicing further down can be used.
df = (
    df
    .assign(final_date=pd.to_datetime(df['final_date']))
    .set_index('final_date')
)

In [37]:
# Collapse the readings into weekly bins ('W') and take the median of each bin.
df_agg = df['soil_moisture'].resample('W').median()

In [38]:
# Preview the first rows of the weekly median series.
df_agg.head()

final_date
1988-03-06    0.598248
1988-03-13    0.583229
1988-03-20    0.529412
1988-03-27    0.698373
1988-04-03    0.624531
Freq: W-SUN, Name: soil_moisture, dtype: float64

## Only get data from 1997-present to match PDSI data

In [39]:
# Label-based slice on the datetime index: keep every week from the start of
# 1997 through the end of the series.
df_agg = df_agg.loc['1997':]

In [40]:
# Promote the weekly Series to a one-column DataFrame so further columns can
# be added next to it. Calling to_frame() without arguments keeps the Series
# name ('soil_moisture') as the column label on every pandas version. Passing
# name=None explicitly is deprecated since pandas 1.5 and, from pandas 2.0 on,
# produces a column labelled None, which breaks the df_agg.soil_moisture
# lookups in the cells that follow.
df_agg = df_agg.to_frame()

In [41]:
# Preview the first rows after the conversion to a DataFrame.
df_agg.head()

Unnamed: 0_level_0,soil_moisture
final_date,Unnamed: 1_level_1
1997-01-05,0.446809
1997-01-12,0.515645
1997-01-19,0.52816
1997-01-26,0.658323
1997-02-02,0.639549


Compute soil moisture mean from weekly aggregate medians.

In [45]:
# Mean of the weekly medians (pandas skips missing weeks by default).
agg_mean = df_agg['soil_moisture'].mean()
agg_mean

0.4830032772573354

Compute soil moisture standard deviation from weekly aggregate medians.

In [46]:
# Sample standard deviation of the weekly medians (pandas skips missing weeks
# by default).
agg_stdev = df_agg['soil_moisture'].std()
agg_stdev

0.35018915819138896

Compute mean − (3 × stdev) to get the lower bound.

In [54]:
# Three standard deviations below the mean, stored as a constant column so it
# sits alongside the weekly series.
df_agg = df_agg.assign(lower_bound=agg_mean - 3 * agg_stdev)

In [55]:
# Preview the frame with the new lower_bound column; it holds the same value
# on every row because it is computed from two scalars.
# NOTE(review): soil_moisture was min-max scaled to [0, 1], so a bound of
# mean - 3*std that comes out negative cannot be crossed by any observation —
# confirm that this threshold is what the analysis intends.
df_agg.head()

Unnamed: 0_level_0,soil_moisture,lower_bound
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1
1997-01-05,0.446809,-0.567564
1997-01-12,0.515645,-0.567564
1997-01-19,0.52816,-0.567564
1997-01-26,0.658323,-0.567564
1997-02-02,0.639549,-0.567564


In [56]:
# Summarise index range, dtypes and non-null counts; the non-null count is
# what reveals weeks without a soil_moisture value.
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1105 entries, 1997-01-05 to 2018-03-04
Freq: W-SUN
Data columns (total 2 columns):
soil_moisture    1101 non-null float64
lower_bound      1105 non-null float64
dtypes: float64(2)
memory usage: 25.9 KB


Four weeks are missing soil moisture data. These gaps will be backfilled, i.e. each missing week takes the value of the next week that has one.

In [64]:
# List the weeks whose soil_moisture value is missing.
missing_weeks = df_agg['soil_moisture'].isna()
df_agg.loc[missing_weeks]

Unnamed: 0_level_0,soil_moisture,lower_bound
final_date,Unnamed: 1_level_1,Unnamed: 2_level_1
1999-02-28,,-0.567564
2004-02-29,,-0.567564
2010-02-28,,-0.567564
2016-02-28,,-0.567564


In [67]:
# Backfill: every missing week takes the value of the next week that has one.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df_agg.soil_moisture. The in-place form is
# chained assignment, which no longer updates df_agg once pandas copy-on-write
# is active, and fillna(method=...) is deprecated since pandas 2.1.
# Series.bfill() is the long-standing equivalent and works on old and new
# pandas versions alike.
df_agg['soil_moisture'] = df_agg['soil_moisture'].bfill()

In [68]:
# Sanity check: number of weeks still missing a value after the backfill.
df_agg['soil_moisture'].isna().sum()

0