# Preprocessing time series data

In [None]:
# Install the packages listed in requirements.txt via the IPython %pip magic;
# --quiet reduces pip's console output.
%pip install -r requirements.txt --quiet

In [None]:
import pandas as pd
from timeseries.styler import style_dataframe
import numpy as np

In [None]:
# Load the prepared flights table, then render its first rows through the
# project's styling helper (the styled object is the cell's displayed value).
flights = pd.read_csv('data/flights_prepared.csv')
flights.head().pipe(style_dataframe)


In [None]:
# Number of rows in the flights table.
flights.shape[0]

In [None]:
# Mean passenger count of the untouched data, kept as a reference value for
# comparing the imputation strategies further down.
flights['Passengers'].mean()

## Inject Missing Data

In [None]:
# Seed NumPy's global RNG so the rows blanked out in the next cell are
# reproducible when the cells are run in order.
np.random.seed(42)
# 15% of the rows, truncated to a whole number.
num_missing = int(0.15 * flights.shape[0])

In [None]:
# Draw `num_missing` distinct row labels (sampling without replacement) ...
miss_idx = np.random.choice(flights.index, num_missing, replace=False)
# ... and overwrite their passenger counts with NaN, in place.
flights.loc[miss_idx, "Passengers"] = np.nan

## Handling Missing Data

In [None]:
# Per-column count of missing entries (`isnull` is pandas' alias of `isna`).
missing_vals = flights.isnull().sum()
missing_vals

In [None]:
# Mean passenger count after the gaps were injected; pandas skips NaN when
# averaging, so only the surviving observations contribute.
flights['Passengers'].mean()

### Forward Fill 

In [None]:
# Fill each gap with the most recent earlier observation. A gap at the very
# start of the table has no earlier value to copy and therefore stays NaN.
flights_ffilled = flights.ffill()
flights_ffilled['Passengers'].mean()

### Backward Fill

In [None]:
# Fill each gap with the next later observation. A gap at the very end of the
# table has no later value to copy and therefore stays NaN.
flights_bfilled = flights.bfill()
flights_bfilled['Passengers'].mean()

### Kalman Filter

In [None]:
# Independent (deep) copy of the gapped table to feed the Kalman imputer.
flights_kalman = flights.copy(deep=True)

In [None]:
from pykalman import KalmanFilter
import pandas as pd

def kalman_imputer(df,
                   column,
                   initial_state_mean=0,
                   n_dim_obs=1, **kalmanargs):
    """Fill the missing values of one column with Kalman-smoothed estimates.

    The column is treated as a single, regularly spaced series. Missing
    entries are handed to pykalman as *masked* observations, so the filter
    skips the measurement update at those time steps instead of being fed
    made-up values, and the original spacing of the series is kept while
    the model parameters are estimated with ``em``.

    Args:
        df: Input DataFrame. It is not modified; a copy is returned.
        column: Name of the column to impute. It is cast to float.
        initial_state_mean: Starting value passed to ``KalmanFilter``.
        n_dim_obs: Observation dimensionality passed to ``KalmanFilter``.
        **kalmanargs: Extra keyword arguments forwarded to ``KalmanFilter``.

    Returns:
        A copy of ``df`` in which the NaN entries of ``column`` are replaced
        by the smoothed state mean at the same position. Observed values are
        left exactly as they were. If the column has no missing values the
        copy is returned without fitting a filter.
    """
    df = df.copy()
    df[column] = df[column].astype(float)
    missing_mask = df[column].isna()

    if not missing_mask.any():
        return df

    # Mask the gaps rather than forward-filling or dropping them: a filled
    # value would be treated as a real measurement, a leading NaN would
    # survive ffill and poison the smoother, and dropping rows would squeeze
    # the time axis that the transition model is fitted on.
    observations = np.ma.masked_invalid(df[column].to_numpy())

    kf = KalmanFilter(initial_state_mean=initial_state_mean,
                      n_dim_obs=n_dim_obs, **kalmanargs)
    state_means, _ = kf.em(observations).smooth(observations)

    # state_means has one row per time step and one column per state
    # dimension; take the first state component so a 1-D array is written
    # into the column.
    # NOTE(review): this equals the expected observation only for the default
    # one-dimensional state with an identity observation model - confirm
    # before passing a custom observation matrix or n_dim_state.
    df.loc[missing_mask, column] = state_means[missing_mask.to_numpy(), 0]
    return df


In [None]:
# Impute the passenger gaps with the Kalman smoother and report the mean of
# the completed column for comparison with the other strategies.
kalman_df = kalman_imputer(flights_kalman, 'Passengers')
kalman_df['Passengers'].mean()

## Resampling time series data

In [None]:
# Parse the Date column and move it into the index so the frame can be
# resampled by calendar period. The guard keeps the cell re-runnable: once
# Date has become the index it is no longer a column, and repeating the
# conversion would raise KeyError.
if "Date" in flights.columns:
    flights["Date"] = pd.to_datetime(flights["Date"])
    flights.set_index("Date", inplace=True)
print(type(flights.index))

In [None]:
# Quarter-end totals. `flights` still contains the NaNs injected earlier and
# `sum()` skips NaN, so summing it directly would understate every quarter.
# Forward-fill first, matching the imputation used for the saved dataset.
# NOTE(review): a gap in the very first row has nothing to copy from and is
# still skipped by the sum.
flights_quarterly = flights.ffill().resample("QE").sum()
flights_quarterly

In [None]:
# Year-end totals. `flights` still contains the NaNs injected earlier and
# `sum()` skips NaN, so summing it directly would understate every year.
# Forward-fill first, matching the imputation used for the saved dataset.
# NOTE(review): a gap in the very first row has nothing to copy from and is
# still skipped by the sum.
flights_yearly = flights.ffill().resample("YE").sum()
flights_yearly

## Save preprocessed data

In [None]:
# Persist the forward-filled table as the preprocessed dataset.
# NOTE(review): flights_ffilled was created before Date became the index, so
# it still carries the default integer index, which to_csv writes as an
# unnamed first column - confirm downstream readers expect that.
flights_ffilled.to_csv(path_or_buf="data/flights_preprocessed.csv")