### Import and Load Pre-processed Data

In [1]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt
import statsmodels.api as sm
from statsmodels.api import tsa
from pmdarima.arima import auto_arima

In [2]:
# load in file and convert to datetime index
df_main = pd.read_csv('capstone_df_main.csv')
df_main['DATE'] = pd.to_datetime(df_main['DATE'])
df_main.set_index('DATE', inplace=True)

### SARIMA

In [3]:
import warnings
warnings.simplefilter('ignore', category=FutureWarning)

Custom function is created with auto_arima function to perform a grid search of the best SARIMA model parameters.

Note that the input dataframe is not the differentiated dataframe but the original as the auto_arima function can find the best difference order during its search.

In [4]:
def smodel(n, periods):

    # setup the dataframes
    dataset = df_main.iloc[:,n]

    #train, test, split the data
    train_set = dataset.loc[:'2023-01-01']
    test_set = dataset.loc['2023-01-01':]

    # finding the best parameters of (p,d,q)x(P,D,Q)
    model = auto_arima(train_set, 
                        start_p=0, start_q=0, d=0,
                        max_p=3, max_q=5, max_d=3,
                        start_P=0, start_Q=0, D=0,
                        max_P=3, max_Q=5, max_D=3,
                        m=12,
                        seasonal=True,
                        trace=False,
                        test='adf',
                        suppress_warnings=True,
                        stepwise=True,                      
                        )
    
    df_train = pd.DataFrame(train_set)
    df_test = pd.DataFrame(test_set)

    # Forecast
    fitted, confint = model.predict(n_periods=periods, return_conf_int=True)

    return df_train, df_test, fitted, confint

From the custom function to find the best SARIMA model parameters, 2 years of data are forecasted for each destinations and the results are shown on the plot compared to the test data.

In [5]:
# forecast of 2 years with less than 24 months missing
df_fit=[]
names = df_main.columns

for i, dest in enumerate(names):
    train, test, fit, ci = smodel(i,24)
    df_fit.append(fit)

    fig = px.line(train, x=train.index, y=dest)

    fig.add_trace(
        go.Scatter(x=test.index , y=test[dest], name='Test', mode='lines', line_color='orange'))

    fig.add_trace(
        go.Scatter(x=fit.index , y=fit, name='Forecast', mode='lines', line_color='crimson'))

    fig.add_trace(
        go.Scatter(x=fit.index, y=ci[:,[0]].flatten(), name='Confidence Interval', fill=None, mode='lines', line_color='lightgray'))

    fig.add_trace(
        go.Scatter(x=fit.index, y=ci[:,[1]].flatten(), name='Confidence Interval', fill='tonexty', mode='lines', line_color='lightgray'))

    fig.update_layout(
        title=f'Forecast of {dest}',
        yaxis_title='PASSENGERS',
        width=1600,
        height=300
    )
    fig.show()

df_fit = pd.concat(df_fit, axis=1)
df_fit.columns = [names]


Traceback:
Traceback (most recent call last):
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\_auto_solvers.py", line 508, in _fit_candidate_model
    fit.fit(y, X=X, **fit_params)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 603, in fit
    self._fit(y, X, **fit_args)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 524, in _fit
    fit, self.arima_res_ = _fit_wrapper()
                           ^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 510, in _fit_wrapper
    fitted = arima.fit(
             ^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\statsmodels\tsa\statespace\mlemodel.py", line 704, in fit
    mlefit = super(MLEModel, self).fit(start_params, method=method,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Traceback:
Traceback (most recent call last):
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\_auto_solvers.py", line 508, in _fit_candidate_model
    fit.fit(y, X=X, **fit_params)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 603, in fit
    self._fit(y, X, **fit_args)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 524, in _fit
    fit, self.arima_res_ = _fit_wrapper()
                           ^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 510, in _fit_wrapper
    fitted = arima.fit(
             ^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\statsmodels\tsa\statespace\mlemodel.py", line 704, in fit
    mlefit = super(MLEModel, self).fit(start_params, method=method,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Traceback:
Traceback (most recent call last):
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\_auto_solvers.py", line 508, in _fit_candidate_model
    fit.fit(y, X=X, **fit_params)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 603, in fit
    self._fit(y, X, **fit_args)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 524, in _fit
    fit, self.arima_res_ = _fit_wrapper()
                           ^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 510, in _fit_wrapper
    fitted = arima.fit(
             ^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\statsmodels\tsa\statespace\mlemodel.py", line 704, in fit
    mlefit = super(MLEModel, self).fit(start_params, method=method,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Traceback:
Traceback (most recent call last):
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\_auto_solvers.py", line 508, in _fit_candidate_model
    fit.fit(y, X=X, **fit_params)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 603, in fit
    self._fit(y, X, **fit_args)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 524, in _fit
    fit, self.arima_res_ = _fit_wrapper()
                           ^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 510, in _fit_wrapper
    fitted = arima.fit(
             ^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\statsmodels\tsa\statespace\mlemodel.py", line 704, in fit
    mlefit = super(MLEModel, self).fit(start_params, method=method,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Traceback:
Traceback (most recent call last):
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\_auto_solvers.py", line 508, in _fit_candidate_model
    fit.fit(y, X=X, **fit_params)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 603, in fit
    self._fit(y, X, **fit_args)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 524, in _fit
    fit, self.arima_res_ = _fit_wrapper()
                           ^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 510, in _fit_wrapper
    fitted = arima.fit(
             ^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\statsmodels\tsa\statespace\mlemodel.py", line 704, in fit
    mlefit = super(MLEModel, self).fit(start_params, method=method,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Traceback:
Traceback (most recent call last):
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\_auto_solvers.py", line 508, in _fit_candidate_model
    fit.fit(y, X=X, **fit_params)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 603, in fit
    self._fit(y, X, **fit_args)
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 524, in _fit
    fit, self.arima_res_ = _fit_wrapper()
                           ^^^^^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\pmdarima\arima\arima.py", line 510, in _fit_wrapper
    fitted = arima.fit(
             ^^^^^^^^^^
  File "c:\Users\Joe\anaconda3\envs\capstone_joe_cha\Lib\site-packages\statsmodels\tsa\statespace\mlemodel.py", line 704, in fit
    mlefit = super(MLEModel, self).fit(start_params, method=method,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

### Evaluate SARIMA Results

In [15]:
# sample of the forecast results
df_fit.head()
forecast = df_fit.copy()

Unnamed: 0,"London, United Kingdom","Toronto, Canada","Tokyo, Japan","Cancun, Mexico","Mexico City, Mexico","Frankfurt, Germany","Seoul, South Korea","Paris, France","Vancouver, Canada","Sao Paulo, Brazil",...,"Lisbon, Portugal","Cartagena, Colombia","Morelia, Mexico","Amman, Jordan","Cali, Colombia","Addis Ababa, Ethiopia","Warsaw, Poland","Georgetown, Guyana","Helsinki, Finland","Queretaro, Mexico"
2023-02-01,449388.667323,461712.976413,224154.381717,535077.828263,259700.323714,154226.911618,209758.832086,211156.70972,214903.698424,121835.39437,...,45481.939045,20158.995684,21466.763385,10092.342411,15201.565479,15318.322351,15160.004682,15562.141999,11242.364293,13671.818344
2023-03-01,539588.308195,526001.340261,227589.176032,592613.446105,268513.548197,197179.741192,208626.725241,212133.271418,237898.249284,131744.313832,...,44031.61498,14964.413414,21005.888562,11853.907932,15616.995278,15536.604488,15997.655287,16844.82038,11503.329912,15303.438086
2023-04-01,618331.158195,542902.420658,233267.549547,496861.295011,246627.668431,229649.415313,216316.521617,200551.941386,240435.820705,126878.194725,...,45394.977039,16084.394246,20664.304429,12943.020446,16881.620217,17106.155229,17236.673128,17854.009647,11836.10059,15474.847441
2023-05-01,695709.869021,572120.135335,248534.548658,492357.738391,274740.485827,274815.34018,230113.538148,237249.863634,242398.77595,127618.33725,...,45603.537824,14215.936807,21116.52327,17496.465686,15428.910039,18852.978075,22842.456347,16456.312406,12346.152622,16507.945097
2023-06-01,785992.145255,591109.127178,266312.540902,526831.135428,283336.671123,293189.333806,238419.882921,242522.195558,266482.916692,123429.630878,...,43835.248887,11646.574767,22000.637962,22801.630824,17476.313043,25228.425904,29559.318561,16764.409978,13450.696173,18237.949944


The resulting forecast dataframe is reconfigured for ranking and plotting purposes.

In [8]:
forecast = forecast.stack()

In [9]:
df_forecast = forecast.sum(axis=1).reset_index().\
    rename(columns={'level_0':'Date','level_1':'Destination',0:'Passengers'})

Monthly top 10 destinations are ranked from the forecast dataset and plotted.

In [16]:
df_top=df_forecast.sort_values(by=['Date','Passengers'], ascending=[True,False]).reset_index(drop=True).\
    groupby(['Date']).apply(lambda x: x.head(10)).reset_index(drop=True)
df_top

Unnamed: 0,Date,Destination,Passengers
0,2023-02-01,"Cancun, Mexico",535077.828263
1,2023-02-01,"Toronto, Canada",461712.976413
2,2023-02-01,"London, United Kingdom",449388.667323
3,2023-02-01,"Mexico City, Mexico",259700.323714
4,2023-02-01,"Tokyo, Japan",224154.381717
...,...,...,...
235,2025-01-01,"Frankfurt, Germany",251601.701383
236,2025-01-01,"Seoul, South Korea",236686.151096
237,2025-01-01,"Vancouver, Canada",225923.904276
238,2025-01-01,"Paris, France",197698.134690


In [20]:
fig = px.line(
    df_top, x='Date', y='Passengers', color='Destination', facet_col='Destination',
    facet_col_wrap=5,
    # facet_row_spacing=0.07,
    height=600,
    width=2400,
    title='Popular Destinations'
              )
fig.for_each_annotation(lambda x: x.update(text=x.text.split('=')[-1]))
fig.show()

#### Conclusion of SARIMA Modeling
All the destinations forecasted to be in the top 10 within the next 24 months were previously in the monthly top 10 as seen from the EDA. Airlines with flights outbound from USA to these destinations should continue to focus on these destinations.

These forecasts should be taken with consideration as the impact from COVID has played a significant factor. Many of the destinations and even the total passenger volume plots show an upward trend, however, many of the forecasts and even the ones shown in 'Popular Destinations' show a stagnant or slightly declining trend. Furthermore, the confidence interval plotted with each forecast shows a wide range as the model prediction is casting a wide net due to the anomaly in the data. A sliver of positivity in this SARIMA model is that there are some instances where the forecast fits well against the test data but at a lower passenger volume.

### Bonus: SARIMAX

In [12]:
# df_smax = df_main.copy()
# flight frequency as exogenous variable for SARIMAX model

In [13]:
# def sxmodel(n, periods):

#     # setup the dataframes
#     dataset = df_smax.iloc[:,n]
#     train_set = dataset.loc[:'2023-01-01']
#     test_set = dataset.loc['2023-01-01':]

#     model = auto_arima(train_set, exogenous=df_smax[['Year']] ,
#                         start_p=0, start_q=0, d=0,
#                         max_p=3, max_q=5, max_d=2,
#                         start_P=0, start_Q=0, D=0,
#                         max_P=3, max_Q=5, max_D=2,
#                         m=12,
#                         seasonal=True,
#                         trace=False,
#                         test='adf',
#                         suppress_warnings=True,
#                         stepwise=True,                      
#                         )
    
#     df_train = pd.DataFrame(train_set)
#     df_test = pd.DataFrame(test_set)

#     # Forecast
#     fitted, confint = model.predict(n_periods=periods, return_conf_int=True)

#     # df_fitted = pd.DataFrame(fitted)
#     # df_confint = pd.DataFrame(confint)

#     return df_train, df_test, fitted, confint

In [14]:
# # forecast of 2 years with less than 24 months missing
# df_fit=[]
# names = df_smax.columns[0:-1]
# # fig = make_subplots(rows=len(names), cols=1, shared_xaxes=True, x_title='Years', y_title='Passengers', subplot_titles=names, vertical_spacing=0.1)

# for i, dest in enumerate(names):
#     train, test, fit, ci = smodel(i,24)
#     df_fit.append(fit)

#     fig = px.line(train, x=train.index, y=dest)
#     # fig.add_trace(
#     #     go.Scatter(x=train.index , y=train[dest], name='Train', mode='lines', line_color='lightskyblue'
#     #                #, legendgroup='train'
#     #                ),
#         # row=i+1,
#         # col=1)
#     fig.add_trace(
#         go.Scatter(x=test.index , y=test[dest], name='Test', mode='lines', line_color='orange'
#                    #, legendgroup='test'
#                    ))
#         # ,row=i+1,
#         # col=1)
#     fig.add_trace(
#         go.Scatter(x=fit.index , y=fit, name='Forecast', mode='lines', line_color='crimson'
#                    #, legendgroup='fit'
#                    ))
#         # ,row=i+1,
#         # col=1)
#     fig.add_trace(
#         go.Scatter(x=fit.index, y=ci[:,[0]].flatten(), name='Confidence Interval', fill=None, mode='lines', line_color='lightgray'
#                    #, legendgroup='ci'
#                    ))
#         # ,row=i+1,
#         # col=1)
#     fig.add_trace(
#         go.Scatter(x=fit.index, y=ci[:,[1]].flatten(), name='Confidence Interval', fill='tonexty', mode='lines', line_color='lightgray'
#                    # ,showlegend=False
#                    #, legendgroup='ci'
#                    ))
#         # ,row=i+1,
#         # col=1)


#     fig.update_layout(
#         title=f'Forecast of {dest}',
#         yaxis_title='PASSENGERS',
#         # showlegend=False,
#         width=1600,
#         height=300
#     )
#     fig.show()

# df_fit = pd.concat(df_fit, axis=1)
# df_fit.columns = [names]

### Advanced Modeling: LSTM/Prophet

Normalize the data with MinMax scale for LSTM