# Smoothing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Generate an artificial (and noisy) signal

In [None]:
# Fix the seed so the noise (and everything derived from it) is reproducible.
np.random.seed(42)
# 365*5 Gaussian samples with mean 0 and standard deviation 1.5.
noise = np.random.normal(loc=0, scale=1.5, size=365 * 5)

In [None]:
# 5*365 evenly spaced points covering [0, 5], both endpoints included.
x = np.linspace(start=0, stop=5, num=5 * 365)

In [None]:
# Signal = linear trend + sinusoid with period 1/2 (in units of x) + noise.
period = 1/2
y = 2*x + np.sin(2*np.pi*x/period) + noise

In [None]:
# Quick look at the noisy signal before any smoothing.
plt.plot(x,y)

We'll turn it into a simple Pandas dataframe to use some convenient methods.

In [None]:
# Two-column frame holding the time axis and the noisy signal.
d = pd.DataFrame(data={'time': x, 'y': y})

In [None]:
# Display the frame: 'time' is still an ordinary column at this point.
d

In [None]:
# Promote 'time' from a column to the index, modifying d in place.
d.set_index(keys='time', inplace=True)

In [None]:
# Display the frame again: 'time' is now the index and 'y' the only column.
d

`rolling` allows us to do aggregation over a moving window.

In [None]:
# Window of 7 rows; nothing is aggregated until a method such as .mean() is called.
d.rolling(7)

In [None]:
# Mean over a sliding window of 7 rows, plotted against the index.
d.rolling(window=7).mean().plot()

In [None]:
# Two stacked panels: the raw data on top, its 7-row rolling mean below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 5))
weekly_mean = d.rolling(window=7).mean()
d.plot(ax=ax[0])
weekly_mean.plot(ax=ax[1])

In [None]:
# Raw data on top; 7- and 21-row rolling means overlaid on the bottom panel.
fig, ax = plt.subplots(2, 1, figsize=(8, 5))
d.plot(ax=ax[0])
d.rolling(7).mean().plot(ax=ax[1])
d.rolling(21).mean().plot(ax=ax[1])
# Both smoothed curves come from column 'y' and would otherwise share that
# legend label, so name them explicitly in plotting order.
ax[1].legend(['rolling 7', 'rolling 21'])

In [None]:
# Zoom in on the first 42 rows so the two window lengths are easy to compare.
fig, ax = plt.subplots(2, 1, figsize=(8, 5), sharex=True)
first_rows = d.iloc[0:42]
first_rows.plot(ax=ax[0])
first_rows.rolling(7).mean().plot(ax=ax[1])
first_rows.rolling(21).mean().plot(ax=ax[1])
# The top panel shows the unsmoothed data, so it must not be labelled as a
# rolling mean.
ax[0].legend(['raw y'])
ax[1].legend(['rolling 7', 'rolling 21'])

Other smoothing techniques:
* LOWESS : https://www.statsmodels.org/dev/generated/statsmodels.nonparametric.smoothers_lowess.lowess.html
* SciPy's UnivariateSpline : https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.UnivariateSpline.html

In [None]:
# Reminder of the data layout before trying the other smoothers.
d

In [None]:
# The index holds the 'time' values set earlier with set_index.
d.index

## LOWESS

In [None]:
import statsmodels.api as sm
# Short alias for the LOWESS smoother used in the cells below.
lowess = sm.nonparametric.lowess

In [None]:
# lowess takes the response (endog) first, then the predictor (exog).
z = lowess(endog=y, exog=x)

z here will be two-dimensional, containing the sorted x and the lowess y.

In [None]:
# Raw signal first, then the LOWESS curve (column 0 of z is x, column 1 the fit).
plt.plot(x, y)
smooth_x, smooth_y = z[:, 0], z[:, 1]
plt.plot(smooth_x, smooth_y)

You can specify the fraction of points over which to do the LOWESS in the method call.

In [None]:
# Use only 1% of the points for each local regression.
z = lowess(endog=y, exog=x, frac=0.01)

In [None]:
# Overlay the frac=0.01 LOWESS fit on the raw signal.
plt.plot(x, y)
# z.T has two rows (x, fitted y), which unpack into plot's x and y arguments.
plt.plot(*z.T)

## Splines

In [None]:
from scipy.interpolate import UnivariateSpline
from scipy.interpolate import BSpline

In [None]:
# Fit a smoothing spline to the noisy signal using the default settings.
spl = UnivariateSpline(x, y)

This is a callable spline object: pass it any x and it returns the corresponding smoothed value of y.

In [None]:
# Display the fitted spline object itself (not its values).
spl

In [None]:
# Evaluate the spline at the original x values and overlay it on the data.
plt.plot(x, y)
smoothed = spl(x)
plt.plot(x, smoothed)

Splines are commonly used for interpolation (as indeed you can see from the import statement).  Let's make a sparser set of data.

In [None]:
# Re-seed so this sparse example is reproducible on its own.
np.random.seed(42)
noise = np.random.normal(loc=0, scale=1.5, size=50)

# 50 evenly spaced samples on [0, 10].
x = np.linspace(start=0, stop=10, num=50)
# Noise-free cubic first; the observations are that curve plus the noise.
ytrue = 4 + 2*x - x**2 + 0.075*x**3
y = ytrue + noise

In [None]:
# Noisy observations as black dots, the noise-free cubic as a blue line.
plt.plot(x,y,'ko')
plt.plot(x,ytrue,'b')

In [None]:
# Refit the smoothing spline, this time on the sparse data set.
spl = UnivariateSpline(x, y)

In [None]:
# Dense grid so the spline is drawn as a smooth curve between the samples.
xnew = np.linspace(0, 10, 1000)
spline_curve = spl(xnew)

plt.plot(x, y, 'ko')
plt.plot(x, ytrue, 'b')
plt.plot(xnew, spline_curve)

In [None]:
# Change the smoothing factor of the existing spline to 100; spl is updated in place.
spl.set_smoothing_factor(100)

In [None]:
# Same comparison as before, now with the smoothing factor set to 100.
xnew = np.linspace(0, 10, 1000)
resmoothed_curve = spl(xnew)

plt.plot(x, y, 'ko')
plt.plot(x, ytrue, 'b')
plt.plot(xnew, resmoothed_curve)

In [None]:
from scipy.interpolate import make_interp_spline

# BSpline(t, c, k) expects a knot vector t and coefficients c, not raw data
# samples. make_interp_spline computes the knots and coefficients for us and
# returns a BSpline object that passes through every (x, y) point.
spl = make_interp_spline(x, y, k=3)
# k=3 is the polynomial degree of the spline (cubic, i.e. order 4)

plt.plot(x,y,'ko')
plt.plot(x,ytrue,'b')
# xnew stays inside [0, 10], the range of the data, so no extrapolation is needed.
xnew = np.linspace(0, 10, 1000)
plt.plot(xnew,spl(xnew))

For good measure, let's also compare the LOWESS on this data.

In [None]:
# LOWESS with the default span; pass an explicit fraction as the third
# argument, e.g. lowess(y, x, 0.2), to change it.
z = lowess(endog=y, exog=x)

In [None]:
# Observations, true curve, and the LOWESS fit (z columns: x, fitted y).
lowess_x, lowess_y = z[:, 0], z[:, 1]
plt.plot(x, y, 'ko')
plt.plot(x, ytrue, 'b')
plt.plot(lowess_x, lowess_y)

## Polynomial fit

In [None]:
# Fit a degree-3 polynomial to the noisy observations.
z = np.polyfit(x, y, deg=3)

In [None]:
# Display the fitted polynomial coefficients.
z

From before:
$y_{true} = 4 + 2x - x^2 + 0.075x^3$

`np.polyfit` returns the coefficients from highest order to lowest, so `z[0]` is the coefficient of $x^3$.

In [None]:
# First entry of the coefficient array returned by np.polyfit.
z[0]

In [None]:
xnew = np.linspace(0, 10, 1000)
# The cell that plots this evaluates z[0] with xnew**3 and z[3] as the
# constant, so unpack the coefficients in that order.
c3, c2, c1, c0 = z

plt.plot(x, y, 'ko')
plt.plot(x, ytrue, 'b')
plt.plot(xnew, c0 + c1*xnew + c2*xnew**2 + c3*xnew**3)

The advantage here?
-> We get an equation with known coefficients, which is much more interpretable than simply having a smooth curve.