In [44]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.tsa.stattools as ts


In [2]:
# Load the raw Tourism Malaysia arrivals export; only the first 21 data rows are read.
# NOTE(review): data_dir is an absolute, machine-specific path - confirm it exists before re-running elsewhere.
data_dir = "C:/Users/benlc/OneDrive/Desktop/python_learn/tourism-demand-malaysia/data/raw/"
raw_csv_path = data_dir + "Tourist Arrivals _ Tourism Malaysia [2021-Aug-31 03.22 AM].csv"
tourism_raw = pd.read_csv(raw_csv_path, nrows=21)
tourism_raw.tail()

Unnamed: 0,Year,Destination,Jan - No. of Arrivals,Feb - No. of Arrivals,Mar - No. of Arrivals,Apr - No. of Arrivals,May - No. of Arrivals,Jun - No. of Arrivals,Jul - No. of Arrivals,Aug - No. of Arrivals,...,Mar - Percentage Change Year on Year,Apr - Percentage Change Year on Year,May - Percentage Change Year on Year,Jun - Percentage Change Year on Year,Jul - Percentage Change Year on Year,Aug - Percentage Change Year on Year,Sep - Percentage Change Year on Year,Oct - Percentage Change Year on Year,Nov - Percentage Change Year on Year,Dec - Percentage Change Year on Year
16,2016,Malaysia,2376166,2091098,2198716,2101280,2144119,2121396,2296615,2282173,...,-1.933966,1.416945,1.190671,12.018427,3.635569,4.565194,1.632556,11.696432,2.467758,2.45186
17,2017,Malaysia,2350270,2043215,2238184,2145734,2039016,2134647,2263478,2129013,...,1.795048,2.115568,-4.90192,0.624636,-1.442863,-6.711148,-1.226841,-11.067846,-2.249089,-7.981155
18,2018,Malaysia,2276750,2050613,2192855,1957248,1976981,2275921,2305324,2253534,...,-2.025258,-8.78422,-3.042399,6.618143,1.848748,5.848767,0.215592,1.690676,-0.902556,-3.413665
19,2019,Malaysia,2195684,2165933,2334613,2159517,2098267,2400561,2415097,2342438,...,6.464541,10.334357,6.13491,5.476464,4.761717,3.945092,-4.759241,-3.459024,-1.031589,-15.360934
20,2020,Malaysia,2164459,1397912,671084,7546,5411,6585,18660,11631,...,-71.255022,-99.65057,-99.742121,-99.725689,-99.22736,-99.503466,-99.192276,-99.44294,-99.420103,-99.46923


In [103]:
# Reshape the raw yearly table (one column per month) into a long monthly series.
# Columns are picked by position: column 0 becomes "year", columns 2-13 become months "1".."12".
list_column = [0, *range(2, 14)]
column_name = ["year"] + [str(month_no) for month_no in range(1, 13)]

# keep only the year + monthly columns and give them short names
arrivals_wide = tourism_raw.iloc[:, list_column]
arrivals_wide.columns = column_name

# one row per (year, month) pair
arrivals_long = arrivals_wide.melt(id_vars='year', var_name="month", value_name="tourist_arrival")

# assemble a first-of-month date from the year / month / day parts
arrivals_long['day'] = 1
arrivals_long['date'] = pd.to_datetime(arrivals_long[['year', 'month', 'day']])

# final tidy frame: chronological order, fresh 0..n-1 index
tourism_clean = (
    arrivals_long[['date', 'tourist_arrival']]
    .sort_values(by="date")
    .reset_index(drop=True)
)
tourism_clean.tail()

Unnamed: 0,date,tourist_arrival
247,2020-08-01,11631
248,2020-09-01,16131
249,2020-10-01,11315
250,2020-11-01,11420
251,2020-12-01,10568


In [104]:
## export to intermediate clean dataset
# tourism_clean.to_csv("C:/Users/benlc/OneDrive/Desktop/python_learn/tourism-demand-malaysia/data/interim/tourism_clean.csv")


In [105]:
# Index the monthly series by its date column.
# The guard plus reassignment (instead of inplace=True) keeps this cell safe to
# re-run: once "date" has been moved into the index, a second run is a no-op
# rather than a KeyError.
if "date" in tourism_clean.columns:
    tourism_clean = tourism_clean.set_index("date")
tourism_clean.head()

Unnamed: 0_level_0,tourist_arrival
date,Unnamed: 1_level_1
2000-01-01,731509
2000-02-01,786040
2000-03-01,737678
2000-04-01,916382
2000-05-01,894350


Exploratory data analysis
- Understand the existing data and identify which transformations or features are useful for 12-month-ahead forecasting

List of EDA tasks
1) target variable visualisation, stationarity check & distribution
2) ACF and PACF, decomposition & seasonality check - multiple correlation
3) external variable cross-correlation

In [112]:
# tourism_clean.head()
# tourism_clean.reset_index()


Unnamed: 0_level_0,tourist_arrival,tourist_arrival_rolmean,tourist_arrival_rolstd
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01,731509,,
2000-02-01,786040,,
2000-03-01,737678,,
2000-04-01,916382,,
2000-05-01,894350,,


In [113]:
## check stationary and visualise plot and histogram
def check_stationary(timeseries):
    """Run ``ts.adfuller`` on ``timeseries`` and print a labelled summary.

    The printed pandas Series contains the first four elements of the
    adfuller result (labelled test statistic, p-value, lags used and
    observations used) followed by one "Critical Value (<level>)" entry per
    item of the critical-values mapping. Nothing is returned.
    """
    adf_result = ts.adfuller(timeseries)
    critical_values = adf_result[4]

    labels = ['Test Statistic', 'p-value', 'Lags Used', 'Observations Used']
    values = list(adf_result[:4])
    for level, threshold in critical_values.items():
        labels.append('Critical Value (%s)' % level)
        values.append(threshold)

    print(pd.Series(values, index=labels))

# ADF summary for the monthly arrivals series (printed by the function).
# In the recorded run the p-value is about 0.71, so the unit-root null is not rejected.
check_stationary(tourism_clean['tourist_arrival'])

def display_time_series(df, y_col, window=12):
    """Plot ``y_col`` together with its rolling mean and rolling standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is named ``date``; after ``reset_index`` that column
        is used as the x axis. Every column of the frame is drawn as its own
        line. The frame passed in is NOT modified.
    y_col : str
        Column the rolling statistics are computed from.
    window : int, default 12
        Number of rows in the rolling window. The default of 12 spans one year
        of the monthly series used in this notebook.

    Returns
    -------
    None
        The Plotly Express figure is shown as a side effect.
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain the two rolling-stat columns.
    rolling = df[y_col].rolling(window=window)
    with_stats = df.assign(**{
        y_col + '_rolmean': rolling.mean(),
        y_col + '_rolstd': rolling.std(),
    })

    # long format: one row per (date, series name) so Plotly can colour by series
    long_df = with_stats.reset_index().melt(id_vars='date', var_name="type", value_name="value")

    fig = px.line(long_df, x="date", y="value", color="type")
    fig.show()

# Pass a copy so tourism_clean itself keeps only its original column(s), whatever the plotting helper does to its input.
display_time_series(tourism_clean.copy(), "tourist_arrival")

def display_histogram(df, x_col):
    """Show a Plotly Express histogram of column ``x_col`` of ``df``.

    Nothing is returned; the figure is displayed as a side effect.
    """
    px.histogram(df, x=x_col).show()

# Distribution of the monthly arrival counts.
display_histogram(tourism_clean, "tourist_arrival")






Test Statistic           -1.116766
p-value                   0.708340
Lags Used                12.000000
Observations Used       239.000000
Critical Value (1%)      -3.458011
Critical Value (5%)      -2.873710
Critical Value (10%)     -2.573256
dtype: float64


In [40]:
#line + (table + histogram)
def display_time_series_obj(df, y_col):
    """Build a Plotly line trace of ``df[y_col]`` against the frame's index.

    The trace is named after ``y_col`` so the legend identifies the plotted
    series; the previous fixed label 'markers' was misleading for a trace
    drawn with ``mode='lines'``.

    Returns the ``go.Scatter`` trace; it is not added to any figure here.
    """
    return go.Scatter(
        x=df.index,
        y=df[y_col],
        mode='lines',
        name=y_col,
    )

def display_histogram_obj(df, y_col):
    """Return a ``go.Histogram`` trace built from the values of ``df[y_col]``.

    The trace is only created here; adding it to a figure is left to the caller.
    """
    column_values = df[y_col]
    return go.Histogram(x=column_values)

# One-row figure: wide line chart on the left (70%), histogram on the right (30%).
fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])

# Traces in left-to-right panel order.
panel_traces = [
    display_time_series_obj(df=tourism_clean, y_col="tourist_arrival"),
    display_histogram_obj(df=tourism_clean, y_col='tourist_arrival'),
]
for panel_col, panel_trace in enumerate(panel_traces, start=1):
    fig.add_trace(panel_trace, row=1, col=panel_col)

fig.show()