# Let's decompose time series data

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from scipy.stats import boxcox
# Load the 'flights' example dataset through seaborn into a DataFrame.
df = sns.load_dataset('flights')
# Preview the first rows before any transformation is applied.
df.head()

In [None]:
def plot_p(df, column, title, ylabel, xlabel='Date', figsize=(12, 8)):
    '''
    Plot one column of a DataFrame against its index and show the figure.

    -----------
    Parameters:
    df: DataFrame that works as basis of the plot
    column: The column that is plotted on the y-axis
    title: The title of the plot
    ylabel: The label of the y-axis
    xlabel: The label of the x-axis (defaults to 'Date')
    figsize: (width, height) of the figure (defaults to (12, 8))
    '''
    # Label the Axes returned by pandas directly instead of relying on
    # pyplot's implicit "current axes" state.
    ax = df[column].plot(figsize=figsize)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.show()

In [None]:
# Join the month and year columns into '<month>-<year>' strings and parse
# them into timestamps.
df['Date'] = pd.to_datetime(
    df['month'].astype(str).str.cat(df['year'].astype(str), sep='-')
)
# Use the parsed dates as the row index (modifies df in place).
df.set_index('Date', inplace=True)
df.head()

In [None]:
# Plot the raw passenger counts against the index.
df['passengers'].plot(title='Passenger numbers over time', figsize=(12, 6))
plt.show()

## 1) Remove the trend

- Differencing - if the trend is linear
- Second order differencing - if the trend is quadratic (for an exponential trend, take the logarithm first and then difference)
- Take the rolling mean and subtract it

### Differencing

In [None]:
# First-order difference: each value minus the value one row earlier.
df['difference'] = df['passengers'] - df['passengers'].shift(1)
df.head()

In [None]:
# Plot the first-order differences.
df['difference'].plot(
    title='Change in numbers of passengers from t-1 to t',
    figsize=(12, 6),
)
plt.show()

### Second order differencing

In [None]:
# Second-order difference: difference the first-order differences once more.
df['2nddifference'] = df['difference'] - df['difference'].shift(1)
df.head()

In [None]:
# Plot the second-order differences.
df['2nddifference'].plot(figsize=(12,6), title='Change in changes of numbers of passengers from t-1 to t')
# Render the figure explicitly, as the other plotting cells do, instead of
# leaving the returned Axes object as the cell's output.
plt.show()

In [None]:
# Mean of the second-order differences.
df['2nddifference'].mean()

### Take the rolling mean and subtract it from the data

In [None]:
# Centered rolling mean over a window of 12 rows.
df['rolling'] = df['passengers'].rolling(window=12, center=True).mean()
df.head()

In [None]:
# Plot the rolling mean and label the Axes that pandas returns.
ax = df['rolling'].plot(figsize=(12, 6))
ax.set_title('Rolling mean of the number of passengers over the surrounding 12 months')
ax.set_xlabel('Date')
ax.set_ylabel('Number of passengers')
plt.show()

In [None]:
# Remove the trend estimate (rolling mean) from the observed values.
df['de-trended'] = df['passengers'].sub(df['rolling'])
df.head()

In [None]:
# Plot the series left after subtracting the rolling mean.
df['de-trended'].plot(figsize=(12,6), title='De-trended number of passengers')
# Render the figure explicitly, as the other plotting cells do, instead of
# leaving the returned Axes object as the cell's output.
plt.show()

### Take out the change in volatility by taking the logarithm of the time series

In [None]:
# Natural log of the passenger counts.
df['log_y'] = np.log(df['passengers'])
# First-order difference of the logged series.
df['log_diff'] = df['log_y'] - df['log_y'].shift(1)
df.head()

In [None]:
# Plot the differenced log series.
df['log_diff'].plot(
    title='Difference in the logarithm of passenger numbers',
    figsize=(12, 6),
)
plt.show()

### Take out seasonality

Demean each value by subtracting the seasonal mean

In [None]:
# Mean log-difference per calendar month, broadcast back onto every row of
# that month. `observed` is passed explicitly so the result does not depend on
# pandas' changing default for categorical groupers (presumably 'month' is
# categorical in this dataset -- TODO confirm); it has no effect otherwise.
df['monthly_mean'] = df.groupby('month', observed=True)['log_diff'].transform('mean')
# Subtract the seasonal (monthly) mean from each value.
df['de-seasonalized'] = df['log_diff'] - df['monthly_mean']

In [None]:
# Plot the de-seasonalized log differences.
df['de-seasonalized'].plot(figsize=(12,6), title='Deseasonalized differences in the log_passenger numbers')
# Render the figure explicitly, as the other plotting cells do, instead of
# leaving the returned Axes object as the cell's output.
plt.show()

## This is the time series we will actually do our analysis on!

1. We make predictions for the df['de-seasonalized']
2. We add back on the monthly_mean
3. We take the first value and create the whole log series
4. We exponentiate the values to arrive back at the actual values

In [None]:
# Preview the frame with all derived columns before reconstructing the series.
df.head()

In [None]:
# 2. Undo the de-seasonalizing step by adding the monthly mean back on
reconstruct = df['de-seasonalized'].add(df['monthly_mean'])
reconstruct

In [None]:
# 3. Take the whole series and recreate the whole log series
# The first entry has no predecessor to difference against, so seed it with
# the first log value; a cumulative sum can then rebuild the log series.
# Use .iloc for positional access: the Series is indexed by dates, and a bare
# integer key is only treated as a position through a deprecated fallback.
reconstruct.iloc[0] = df['log_y'].iloc[0]
reconstruct.head()

In [None]:
# Cumulatively sum the seeded log differences to rebuild the log series.
# Note: this rebinds `reconstruct`, so re-running the cell sums a second time.
reconstruct = reconstruct.cumsum()
reconstruct.head()

In [None]:
# 4. We exponentiate the values to arrive back at the actual values
# (np.exp inverts the np.log that produced log_y)
reconstruct = np.exp(reconstruct)
reconstruct.head()

## Introduce time series package

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

Additive model

In [None]:
# Decompose the passenger series with the additive model.
decompose = seasonal_decompose(df['passengers'], model='additive')
# plot() returns a Figure; render it with plt.show() rather than print(),
# which only writes the Figure's text representation.
decompose.plot()
plt.show()

In [None]:
# Decompose the passenger series with the multiplicative model.
decompose = seasonal_decompose(df['passengers'], model='multiplicative')
# plot() returns a Figure; render it with plt.show() rather than print(),
# which only writes the Figure's text representation.
decompose.plot()
plt.show()