# ARIMA Imputation

## Load Data

In [None]:
## Libraries
# lightkurve
#import lightkurve as lk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import random
from statsmodels.tsa.arima.model import ARIMA
#import statsmodels.api as sm
import itertools
import warnings

In [7]:
## Data
# Load the raw light-curve table
df = pd.read_csv("0.Data/031381302.csv")

# Check
print(df.shape)
#df.info()

## Time Series of interest
# Keep only the flux column, indexed by observation time.
# sort_index() returns a new frame rather than sorting in place, so its
# result has to be assigned back for the ordering to take effect.
pdcsap = (
    df.loc[:, ["time", "pdcsap_flux"]]
    .set_index("time")
    .sort_index()
)

# Check (the trailing expression is displayed by the notebook)
print(pdcsap.shape)
pdcsap.info()
pdcsap.head()

(17719, 24)
(17719, 1)
<class 'pandas.core.frame.DataFrame'>
Float64Index: 17719 entries, 1437.978645522663 to 1464.2854684532865
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pdcsap_flux  17033 non-null  float64
dtypes: float64(1)
memory usage: 276.9 KB


Unnamed: 0_level_0,pdcsap_flux
time,Unnamed: 1_level_1
1437.978646,
1437.980034,
1437.981423,
1437.982812,
1437.984201,


## Additional Exploratory Analysis

In [15]:
from pmdarima.arima.utils import ndiffs

# Observed flux values only (missing samples removed)
y = pdcsap["pdcsap_flux"].dropna()

## Differencing order suggested by each unit-root test,
## printed in the order: ADF, KPSS, PP
for unit_root_test in ("adf", "kpss", "pp"):
    print(ndiffs(y, test=unit_root_test))

0
0
0


In [None]:
#

## Choosing Order
Hyperparameter Tuning

In [None]:
## Hyperparameters (Order)
# Define range of p, d, q values to search
p = range(0, 3)  # Autoregressive terms
d = [0]  # Differencing (0 since ADF test confirmed stationarity)
q = range(0, 3)  # Moving average terms

# Generate all possible combinations of p, d, q
pdq_combinations = list(itertools.product(p, d, q))

## Iteration
# Observed flux values; computed once instead of on every iteration.
# NOTE(review): dropping NaNs joins the samples on either side of a gap as if
# they were adjacent - confirm this is acceptable for order selection.
flux_observed = pdcsap["pdcsap_flux"].dropna()

# (order, AIC) for every order that could be fitted
aic_results = []

# Iterate over all combinations and fit ARIMA models
for order in pdq_combinations:
    try:
        model = ARIMA(flux_observed, order=order)
        model_fit = model.fit()
    except (ValueError, np.linalg.LinAlgError) as exc:
        # Best-effort search: skip orders that cannot be estimated, but say so
        # instead of hiding the failure (and never swallow KeyboardInterrupt).
        warnings.warn(f"ARIMA{order} could not be fitted and was skipped: {exc}")
        continue
    aic_results.append((order, model_fit.aic))

# Fail with a clear message rather than min()'s "empty sequence" error
if not aic_results:
    raise RuntimeError("No ARIMA order in the search grid could be fitted.")

# Find the best order with lowest AIC
best_order, best_aic = min(aic_results, key=lambda x: x[1])

# Display best order
best_order, best_aic

In [None]:
## SARIMA
# Search grid for the SARIMA order (p, d, q) x (P, D, Q, s)

# Non-seasonal part: AR terms, differencing, MA terms
# (d is fixed at 0 since ADF confirmed stationarity)
p, d, q = range(3), [0], range(3)

# Seasonal part: AR terms, differencing, MA terms
# (D is fixed at 0 since ADF confirmed stationarity)
P, D, Q = range(2), [0], range(2)

# Seasonal period from ACF analysis
s = [200]

# Cartesian product of all axes: one (p, d, q, P, D, Q, s) tuple per candidate
param_combinations = list(
    itertools.product(p, d, q, P, D, Q, s)
)

## Diagnostics

In [None]:
##

## Impute Missing Values

In [None]:
## Function to impute missing values with ARIMA model
def arima_impute(series, order=(5,1,0)):
    """
    Impute missing values in a time series using an ARIMA model.

    The model is fitted once on the whole series with its gaps left in place
    (the state-space ARIMA implementation treats NaN as a missing
    observation). Each missing position is then filled with the model's
    in-sample prediction for that position, i.e. a prediction made from the
    observations that precede the gap. Observed values are never modified.

    Parameters:
        series (pd.Series): Time series data with NaN values.
        order (tuple): ARIMA order (p, d, q).

    Returns:
        pd.Series: Time series with missing values imputed. A new Series is
            returned; the input is left untouched.

    Raises:
        ValueError: If the series has missing values but no observed value
            to fit the model on.
    """
    # Positional mask of the gaps (robust to duplicate or float index labels)
    missing = series.isna().to_numpy()

    # Nothing to impute: skip the (expensive) model fit entirely
    if not missing.any():
        return series.copy()

    if missing.all():
        raise ValueError(
            "Cannot fit an ARIMA model: series has no observed values."
        )

    # Work on a plain float array so predictions align by position
    values = series.to_numpy(dtype=float, copy=True)

    # Single fit on the gappy data; NaNs are handled as missing observations
    model_fit = ARIMA(values, order=order).fit()

    # One prediction per time step, aligned with `values`
    predicted = np.asarray(model_fit.predict(start=0, end=len(values) - 1))

    # Fill only the gaps
    values[missing] = predicted[missing]

    return pd.Series(values, index=series.index, name=series.name)

In [None]:
# ## Order
# order = (1, 0, 1)

# ## Invoke helper function
# pdcsap["pdcsap_101"] = arima_impute(pdcsap["pdcsap_flux"], order = order)

# ## Save the imputed dataset
# pdcsap.to_csv('imputed_data.csv')