# Analyze time series

In [None]:
!pip install -r requirements.txt --quiet

In [None]:
import pandas as pd
from timeseries.styler import style_dataframe

In [None]:
# Load the preprocessed flights data; the first CSV column is used as the row index.
flights = pd.read_csv('data/flights_preprocessed.csv', index_col=0)

In [None]:
# Convert the Date column to datetimes and make it the index of the frame.
flights = (
    flights
    .assign(Date=pd.to_datetime(flights["Date"]))
    .set_index("Date")
)

## Visualize time series

In [None]:
%%writefile timeseries/viz.py
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
import pandas as pd

def plot_time_series(data: pd.DataFrame,
                     columns: List[str],
                     title: str = "Time Series Plot",
                     xlab: str = "Date", ylab: str = "Values",
                     figsize: tuple = (12, 6),
                     colors: List[str] = None, linewidth: float = 2.0,
                     max_cols: int = 3, title_fontsize: int = 14,
                     axis_label_fontsize: int = 12, plot_grid: bool = True,
                     show_legend: bool = True) -> None:
    """Plot selected columns of ``data`` as lines against its index.

    Only the first ``max_cols`` entries of ``columns`` are drawn; any further
    entries are ignored. Requested columns that are missing from ``data`` are
    skipped with a printed warning.

    Args:
        data: Frame whose index supplies the x-axis values.
        columns: Names of the columns to draw, one line per column.
        title: Figure title.
        xlab: Label for the x-axis.
        ylab: Label for the y-axis.
        figsize: Figure size passed to ``plt.figure``.
        colors: Line colors, matched to ``columns`` by position. When omitted
            or empty, a seaborn "husl" palette with one color per entry in
            ``columns`` is used. A list shorter than the number of plotted
            columns is cycled.
        linewidth: Width of every line.
        max_cols: Maximum number of columns to draw.
        title_fontsize: Font size of the title.
        axis_label_fontsize: Font size of both axis labels.
        plot_grid: Whether to draw the grid.
        show_legend: Whether the legend is visible.
    """
    # Always open a new figure so that ``figsize`` is honoured whether or not
    # custom colors were supplied.
    plt.figure(figsize=figsize)

    if colors:
        color_pal = list(colors)
    else:
        color_pal = sns.color_palette("husl", len(columns))

    lines_drawn = 0
    for i, col in enumerate(columns[:max_cols]):
        if col not in data.columns:
            print(f"Warning: Column '{col}' not found in the DataFrame.")
            continue
        # Cycle through the palette so a short color list cannot raise
        # IndexError.
        sns.lineplot(x=data.index, y=data[col],
                     color=color_pal[i % len(color_pal)],
                     linewidth=linewidth, label=col)
        lines_drawn += 1

    plt.title(title, fontsize=title_fontsize)
    plt.xlabel(xlab, fontsize=axis_label_fontsize)
    plt.ylabel(ylab, fontsize=axis_label_fontsize)
    # Only touch the legend when at least one labelled line exists; calling
    # plt.legend() on an empty axes just emits a warning.
    if lines_drawn:
        plt.legend().set_visible(show_legend)
    plt.grid(plot_grid)
    plt.show()


In [None]:
from timeseries.viz import plot_time_series
# Draw the Passengers column as a single grey line, with the legend shown.
plot_time_series(data=flights, columns=["Passengers"], 
                 colors=['grey'], show_legend=True)

## Time Series Decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Multiplicative decomposition of Passengers with a seasonal period of 12
# observations (presumably monthly data with a yearly cycle — confirm).
decomp = seasonal_decompose(flights['Passengers'], 
                            model='multiplicative', 
                            period=12)

In [None]:
# Render the decomposition result using its built-in plot method.
decomp.plot()

## Non-Stationarity Checks


In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Run the Augmented Dickey-Fuller test on Passengers (missing values removed)
# and print the first two entries of the result: test statistic and p-value.
result = adfuller(flights['Passengers'].dropna())
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

In [None]:
# result[1] is the ADF p-value; below 0.05 the series is reported as
# stationary, otherwise as non-stationary.
print("The time series is stationary." if result[1] < 0.05
      else "The time series is non-stationary.")

## How to make a time series stationary

### Differencing

In [None]:
# First- and second-order differences of Passengers. No dropna() here:
# assigning a Series to a column re-aligns it on the frame's index, so the
# leading NaN rows (one for Diff1, two for Diff2) would reappear anyway.
flights['Diff1'] = flights['Passengers'].diff()
flights['Diff2'] = flights['Passengers'].diff().diff()

In [None]:
# Show the first rows of the original series next to both differenced series.
style_dataframe(flights[["Passengers", "Diff1", "Diff2"]].head())

In [None]:
import matplotlib.pyplot as plt
# Overlay both differenced series on one figure: first-order as a solid black
# line, second-order as a faint grey dash-dot line.
plt.figure(figsize=(10,5))
plt.plot(flights['Diff1'], 
         label="First-order Differenced Data", 
         color='black', linestyle='-')
plt.plot(flights['Diff2'], 
         label='Second-order Differenced Data', 
         color='grey', linestyle='-.', alpha=0.3)
plt.legend()
plt.title('Differencing on Passengers dataset')
plt.show()

### Log Transformation

In [None]:
import numpy as np
# Store the natural log of Passengers in a new column and preview the frame.
flights['log'] = np.log(flights['Passengers'])
style_dataframe(flights.head())

### Log Transformation with Differencing

In [None]:
# First-order difference of the log-transformed series. dropna() is omitted
# because column assignment re-aligns on the index and would restore the
# leading NaN anyway.
flights['log_diff'] = flights['log'].diff()
style_dataframe(flights.head())

## Check stationarity after differencing

In [None]:
def stationarity_check(data: pd.DataFrame, 
                       column:str, 
                       conf_level:float = 0.05, 
                       **kwargs):
    """Run an ADF test on one column and classify the series.

    Args:
        data: Frame containing the series to test.
        column: Name of the column to test; missing values are dropped first.
        conf_level: Threshold the p-value is compared against. A p-value
            strictly below it is labelled "Stationary series".
        **kwargs: Forwarded unchanged to ``adfuller``.

    Returns:
        A dict with keys "ADF Stat", "p-value" and "TS Type".
    """
    outcome = adfuller(data[column].dropna(), **kwargs)
    adf_stat, p_value = outcome[0], outcome[1]

    series_type = (
        "Stationary series" if p_value < conf_level
        else "Non-Stationary series"
    )
    return {
        "ADF Stat": adf_stat,
        "p-value": p_value,
        "TS Type": series_type,
    }

In [None]:
# Classify the second-order differenced series and show only the label.
stationarity_check(data=flights, column="Diff2")['TS Type']

## Detecting Autocorrelation and Partial Autocorrelation

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
import matplotlib.pyplot as plt
# Side-by-side ACF (left) and PACF (right) of Passengers for lags up to 50.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
plot_acf(flights['Passengers'].dropna(), lags=50, ax=axes[0])
axes[0].set_title("Autocorrelation (ACF)")
plot_pacf(flights['Passengers'].dropna(), lags=50, ax=axes[1])
axes[1].set_title("Partial Autocorrelation (PACF)")
plt.tight_layout()
plt.show()

## How to handle autocorrelation

- ARIMA (models autocorrelation through its AR and MA terms; its differencing step removes trend, and the seasonal variant, SARIMA, handles seasonality)
- Use an ML model with lagged features (see below):

### Lag features for ML Models

In [None]:
from typing import List
def create_lagged_columns(df: pd.DataFrame, 
                          column:str, 
                          lag_periods:List[int]):
    """Add one ``Lag_<n>`` column per requested lag and return the same frame.

    The frame is modified in place. Each new column holds ``column`` shifted
    by ``n`` rows, with the positions left empty by the shift filled with 0.

    Args:
        df: Frame to extend with lag columns.
        column: Name of the column to shift.
        lag_periods: Shift sizes; each one produces a ``Lag_<n>`` column.

    Returns:
        The same ``df`` object, now containing the lag columns.
    """
    for n_back in lag_periods:
        shifted = df[column].shift(periods=n_back)
        df[f'Lag_{n_back}'] = shifted.fillna(value=0)
    return df

In [None]:
# Add lag columns for 1, 2, 3, 12, 24 and 36 steps to flights (the helper
# modifies the frame in place) and preview the first rows.
style_dataframe(create_lagged_columns(df=flights, 
                      column='Passengers', 
                      lag_periods=[1,2,3,12,24,36]).head())

## Multiplicative vs Additive (Additional Section)

In [None]:
from modelviz.timeseries import analyze_and_preprocess_time_series

In [None]:
# Run the modelviz helper on Passengers; presumably it compares additive and
# multiplicative structure, per the section heading — confirm against modelviz.
results = analyze_and_preprocess_time_series(data=flights, column='Passengers')

In [None]:
# Display the object returned by the helper.
results

## Save full dataset

In [None]:
# Write the frame, including every column added in this notebook, to CSV.
flights.to_csv('data/flights_decomp.csv')