# GDP Data Reading, Cleaning, & Preprocessing

In [1]:
# Importing modules/libraries

import pandas as pd

import plotly.graph_objects as go
from statsmodels.tsa.stattools import adfuller, kpss

import warnings
warnings.filterwarnings("ignore")

## Load and Read the GDP data

In [13]:
# Location of the quarterly South African GDP series (CSV file)
gdp_data_path = 'data/GDP_Data/ZA_GDP.csv'
# Parse the CSV into a DataFrame
gdp_data = pd.read_csv(filepath_or_buffer=gdp_data_path)
# Preview the top five rows
gdp_data.head(5)

Unnamed: 0,DATE,NGDPRSAXDCZAQ
0,2004-01-01,793035.1
1,2004-04-01,804117.4
2,2004-07-01,817265.7
3,2004-10-01,825993.5
4,2005-01-01,834390.5


## Checking the column names

In [14]:
# List the column labels to confirm what was loaded from the CSV
gdp_data.columns

Index(['DATE', 'NGDPRSAXDCZAQ'], dtype='object')

## List data types of each column

In [15]:
# Inspect the dtype pandas inferred for each column.
# Note: DATE comes back as `object` (plain strings), not as a datetime dtype.
gdp_data.dtypes

DATE              object
NGDPRSAXDCZAQ    float64
dtype: object

## Select the date and GDP column

In [16]:
# Keep only the date and GDP columns.
# The explicit .copy() makes `gdpts` an independent frame, so columns added to
# it in later cells never write to (or trigger a chained-assignment warning
# about) a selection of `gdp_data`. Such a warning would otherwise be hidden
# by the global warnings filter set in the import cell.
gdpts = gdp_data[['DATE', 'NGDPRSAXDCZAQ']].copy()
gdpts.head()

Unnamed: 0,DATE,NGDPRSAXDCZAQ
0,2004-01-01,793035.1
1,2004-04-01,804117.4
2,2004-07-01,817265.7
3,2004-10-01,825993.5
4,2005-01-01,834390.5


## Checking missing values in the dataframes

In [17]:
# Share of missing entries per column, expressed as a percentage
missing_values = gdpts.isna().mean().mul(100)
missing_values

DATE             0.0
NGDPRSAXDCZAQ    0.0
dtype: float64

## Remove missing values from a DataFrame  

In [18]:
# Discard every row that contains at least one NaN, then report (rows, columns)
gdpts = gdpts.dropna(axis=0, how='any')
gdpts.shape

(79, 2)

In [19]:
# Build the quarter-end labels from the actual observation dates instead of a
# synthetic range: a synthetic `pd.date_range(...)` silently mislabels every
# row after a missing or out-of-order quarter, and `freq='Q'` is a deprecated
# alias in recent pandas releases.
observation_dates = pd.to_datetime(gdpts['DATE'])

# Map each observation to the last day of the quarter it belongs to
# (e.g. 2004-01-01 -> 2004-03-31)
date_range = pd.DatetimeIndex(observation_dates.dt.to_period('Q').dt.end_time)

# Convert the quarter-end dates to the desired format (YYYY-MM)
formatted_dates = date_range.strftime('%Y-%m')

# Add the formatted dates as a new column. `assign` returns a new frame, so
# nothing is written into a possible slice of the raw data; the values are
# passed as a plain array so they are matched to rows by position.
gdpts = gdpts.assign(Date=formatted_dates.to_numpy())

# Display the first few rows of the DataFrame
gdpts.head()

Unnamed: 0,DATE,NGDPRSAXDCZAQ,Date
0,2004-01-01,793035.1,2004-03
1,2004-04-01,804117.4,2004-06
2,2004-07-01,817265.7,2004-09
3,2004-10-01,825993.5,2004-12
4,2005-01-01,834390.5,2005-03


In [21]:
# Promote the 'Date' labels to the row index; the column itself is kept for now
gdpts = gdpts.set_index('Date', drop=False)
gdpts.head()

Unnamed: 0_level_0,DATE,NGDPRSAXDCZAQ,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-03,2004-01-01,793035.1,2004-03
2004-06,2004-04-01,804117.4,2004-06
2004-09,2004-07-01,817265.7,2004-09
2004-12,2004-10-01,825993.5,2004-12
2005-03,2005-01-01,834390.5,2005-03


In [22]:
# Both date columns ('Date' and the raw 'DATE') are redundant now that the
# index carries the quarter labels, so remove them
gdpts = gdpts.drop(['Date', 'DATE'], axis=1)
gdpts.head()

Unnamed: 0_level_0,NGDPRSAXDCZAQ
Date,Unnamed: 1_level_1
2004-03,793035.1
2004-06,804117.4
2004-09,817265.7
2004-12,825993.5
2005-03,834390.5


## Rename columns

In [23]:
# Replace the series code with a readable column name
gdpts = gdpts.rename({'NGDPRSAXDCZAQ': 'GDP'}, axis='columns')
gdpts.head()

Unnamed: 0_level_0,GDP
Date,Unnamed: 1_level_1
2004-03,793035.1
2004-06,804117.4
2004-09,817265.7
2004-12,825993.5
2005-03,834390.5


## Create a copy of the original data

In [24]:
# Take an independent snapshot of the cleaned GDP levels
gdp_original = gdpts.copy(deep=True)
# Look at the most recent quarters
gdpts.tail()

Unnamed: 0_level_0,GDP
Date,Unnamed: 1_level_1
2022-09,1160699.9
2022-12,1148001.1
2023-03,1152669.1
2023-06,1157940.6
2023-09,1155074.2


## Plot for GDP

In [31]:
# Hover-label styling used by both the trace and the layout
hover_style = dict(bgcolor='white', font_size=12, font_family='Rockwell')

# Line-and-marker trace of the GDP level per quarter
gdp_trace = go.Scatter(
    x=gdpts.index,
    y=gdpts['GDP'],
    mode='markers+lines',
    name='GDP',
    marker=dict(color='blue'),
    line=dict(color='blue', width=2),
    showlegend=True,
    hoverlabel=dict(hover_style),
    hoverinfo='x+y',
    textposition='top center',
    textfont=dict(family='Rockwell', size=12, color='blue'),
    text='GDP',
)

# Assemble the figure from the single trace
fig = go.Figure(data=[gdp_trace])

# Title, axis labels, fixed canvas size, and unified hover along x
fig.update_layout(
    title="Quarterly GDP value over time [2004-01 - 2023-09]",
    xaxis_title='Date [Quarters]',
    yaxis_title='GDP',
    autosize=False,
    width=900,
    height=500,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    hoverlabel=dict(hover_style),
    hovermode='x unified',
)

# Render the figure
fig.show()

## Calculate GDP Growth Rate

In [32]:
# Calculate the quarter-on-quarter GDP growth rate.
# Series.pct_change() returns the fractional change between each element and
# the one before it:
#     pct_change = (current_element - previous_element) / previous_element
# The result is a fraction (e.g. 0.013975 is about 1.4%); it is NOT multiplied
# by 100. The first row has no predecessor, so its growth rate is NaN.

gdpts['GDP_GrowthRate'] = gdpts['GDP'].pct_change()
gdpts.shape

(79, 2)

In [39]:
# Show the frame with the new growth-rate column; the first quarter's growth
# rate is empty (NaN) because there is no earlier quarter to compare against.
gdpts

Unnamed: 0_level_0,GDP,GDP_GrowthRate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-03,793035.1,
2004-06,804117.4,0.013975
2004-09,817265.7,0.016351
2004-12,825993.5,0.010679
2005-03,834390.5,0.010166
...,...,...
2022-09,1160699.9,0.017740
2022-12,1148001.1,-0.010941
2023-03,1152669.1,0.004066
2023-06,1157940.6,0.004573


## Plot for GDP growth rate  

In [34]:
# Create a new figure
fig = go.Figure()

# Add the quarter-on-quarter growth rate as a line-and-marker trace.
# The trace name and text label now describe the growth rate; they were left
# over from the GDP-level plot and the name was misspelled.
fig.add_trace(go.Scatter(x=gdpts.index,
                         y=gdpts['GDP_GrowthRate'],
                         mode='markers+lines',
                         name='GDP Growth Rate',
                         marker=dict(color='blue'),
                         line=dict(color='blue', width=2),
                         showlegend=True,
                         hoverlabel=dict(bgcolor='white', font_size=12, font_family='Rockwell'),
                         hoverinfo='x+y',
                         textposition='top center',
                         textfont=dict(family='Rockwell', size=12, color='blue'),
                         text='GDP Growth Rate'))

# Set the title, x-label, and y-label of the plot.
# `GDP_GrowthRate` holds fractions (0.0140 means 1.40%), so the y-axis uses a
# percent tick format to agree with the "[%]" unit in its label. The first
# growth value belongs to 2004-06 because the first quarter has no predecessor.
fig.update_layout(title="Quarterly GDP growth rate over time [2004-06 - 2023-09]",
                  xaxis_title='Date [Quarters]',
                  yaxis_title='GDP Growth Rate [%]',
                  yaxis_tickformat='.2%',
                  autosize=False,
                  width=900,
                  height=500,
                  margin=dict(l=50, r=50, b=100, t=100, pad=4),
                  hoverlabel=dict(bgcolor='white', font_size=12, font_family='Rockwell'),
                  hovermode='x unified')

# Show the figure
fig.show()

## Check for stationarity

Stationarity:
    - The mean and standard deviation should be constant over time, and the series should show no seasonality.

How to check Stationarity:

    1) Visual 

    2) Global vs local test 

    3) Statistical Test: ADF/KPSS Test

- Augmented Dickey-Fuller (ADF) test is used to determine whether a given time series is stationary or not.

- The test uses an autoregressive model and optimizes an information criterion across multiple different lag values.

- A time series is said to be stationary if its statistical properties do not change over time.

- In other words, it has constant mean and variance, and covariance is independent of time.


- Null Hypothesis (H0): The series has a unit root, i.e. it has some time-dependent structure and is non-stationary. A test never "accepts" H0; we can only fail to reject it.

- Alternate Hypothesis (H1): The series has no unit root, i.e. it is stationary and does not have time-dependent structure. This is the conclusion we draw when the null hypothesis is rejected.


- So, if p-value < 0.05, you reject the null hypothesis and infer that the time series is stationary.

- The p-value is used in hypothesis testing to help you decide whether to reject the null hypothesis.
- It is the probability of obtaining a test statistic at least as extreme as the one observed, assuming the null hypothesis is true.

- If p-value < 0.05, you reject the null hypothesis. 

-  if the p-value of the test (obtained via regression) is less than the significance level (0.05 is commonly used), then the null hypothesis is rejected and the time series is considered to be stationary.

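- If p-value > 0.05, you fail to reject the null hypothesis: there is not enough evidence to conclude that the series is stationary (this is not the same as proving that it is non-stationary).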

- By setting autolag='AIC', adfuller chooses the number of lags that yields the lowest AIC.


In [35]:
# Series under test: the quarter-on-quarter GDP growth rate
timeseries = gdpts['GDP_GrowthRate']

# ADF test on the non-missing values; the lag length is selected by AIC.
# `result` stays bound because the summary cell below reads it.
result = adfuller(timeseries.dropna(), autolag='AIC')
adf_statistic, adf_p_value = result[0], result[1]

# Report the test statistic and its p-value
print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {adf_p_value}')

# Rejecting the unit-root null at the 5% level means the series is stationary
is_stationary = adf_p_value < 0.05

# Report the verdict
print("The series is stationary." if is_stationary else "The series is not stationary.")

ADF Statistic: -8.60114733104079
p-value: 6.863635742652406e-14
The series is stationary.


1) ADF Statistic: value of the test statistic. More negative values indicate stronger evidence against the null hypothesis.

2) p-value: A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis

3) Number of lags used: is the number of lags used in the regression when performing the ADF test.

4) Number of observations used for the ADF regression and calculation of the critical values

5) Critical values for the given significance level: If the computed ADF statistic is less than the critical value at a given significance level, we can reject the null hypothesis and infer that the time series is stationary.

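6) The maximized information criterion (AIC or BIC, depending on `autolag`); it is only returned when `autolag` is not None.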



In [36]:
# Row labels for the first four elements of the test result tuple
adf_summary_labels = [
    "Test Statistic",
    "p-value",
    "No Lags Used",
    "Number of Observations Used",
]

# One-column DataFrame pairing each label with its value.
# NOTE: this reads `result` from the most recently executed test cell, so run
# it directly after the ADF cell.
dfoutput = pd.DataFrame(result[:4], index=adf_summary_labels)

dfoutput

Unnamed: 0,0
Test Statistic,-8.601147
p-value,6.863636e-14
No Lags Used,1.0
Number of Observations Used,76.0


In this case:
 - the ADF statistic is less than the critical values, and
 - the p-value is less than 0.05, so we reject the null hypothesis and conclude that the time series is stationary.

### Kwiatkowski-Phillips-Schmidt-Shin test (KPSS):

- The KPSS test has the null hypothesis opposite to that of the ADF test. 

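- Null Hypothesis (H0): The process is stationary — around a constant level with regression='c' (the option used below), or around a deterministic trend with regression='ct'.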

- Alternate Hypothesis (H1): The series has a unit root (series is not stationary).

- A trend stationary process is one that fluctuates around a deterministic trend (like a steady increase or decrease) over time: once that trend is removed, the statistical properties of what remains (like the mean and variance) are constant over time.

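- Both tests can be used together to confirm whether a series is truly stationary. If both tests conclude that the series is stationary, you can be fairly confident in your result. If the tests disagree, the direction of the disagreement matters: when ADF indicates stationarity but KPSS does not, the series is likely to be difference stationary (meaning it can be made stationary by differencing the series a number of times); when KPSS indicates stationarity but ADF does not, the series is likely to be trend stationary (meaning it becomes stationary once the trend is removed).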

- regression='c' option means that the test includes a constant (or an intercept) in the regression. 

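- The possible options for the regression parameter differ between the two tests:

    - `kpss` accepts only 'c' (constant; tests stationarity around a level) and 'ct' (constant and trend; tests stationarity around a trend).
    - `adfuller` accepts 'c' and 'ct' as well as 'ctt' (constant, linear and quadratic trend) and 'n' (no constant or trend, also known as a test for a pure random walk; spelled 'nc' in older statsmodels releases).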

In [37]:
# Define the time series
timeseries = gdpts['GDP_GrowthRate']

# Run the KPSS test on the non-missing values.
# regression='c' tests the null hypothesis that the series is stationary
# around a constant level; nlags='auto' lets statsmodels pick the lag count.
result = kpss(timeseries.dropna(), regression='c', nlags='auto')

# Print the test statistic and the p-value
print(f'KPSS Statistic: {result[0]}')
print(f'p-value: {result[1]}')

# KPSS reverses the ADF logic: stationarity is the NULL hypothesis, so a
# p-value above 0.05 means we fail to reject stationarity.
is_stationary = result[1] > 0.05

# Print the result. This previously called `st.info(...)`, but `st` is never
# imported in this notebook, so the cell raised NameError on a fresh kernel.
if is_stationary:
    print("The series is stationary.")
else:
    print("The series is not stationary.")

KPSS Statistic: 0.39868763764104154
p-value: 0.07772084584437865
The series is stationary.


## Run both tests at once

In [38]:
# Frame whose columns are each tested for stationarity
dataframe = gdpts

# Message for every (ADF says stationary, KPSS says stationary) combination
verdict_templates = {
    (True, True): "Series '{column}' is stationary.",
    (False, False): "Series '{column}' is not stationary.",
    (True, False): "Series '{column}' is not stationary, differencing can be used to make it stationary.",
    (False, True): "Series '{column}' is trend stationary, trend needs to be removed.",
}

# Test one column at a time, ignoring its missing values
for column in dataframe:
    timeseries = dataframe[column].dropna()

    # ADF: a p-value below 0.05 rejects the unit-root null (stationary)
    adf_result = adfuller(timeseries, autolag='AIC')
    adf_is_stationary = adf_result[1] < 0.05

    # KPSS: a p-value above 0.05 fails to reject the stationarity null
    kpss_result = kpss(timeseries, regression='c', nlags='auto')
    kpss_is_stationary = kpss_result[1] > 0.05

    # Look up and print the verdict for this pair of outcomes
    outcome = (bool(adf_is_stationary), bool(kpss_is_stationary))
    print(verdict_templates[outcome].format(column=column))

Series 'GDP' is not stationary.
Series 'GDP_GrowthRate' is stationary.


### To make a non-stationary time series stationary:

- **Differencing**: This involves subtracting the previous observation from the current observation. Differencing can help stabilize the mean of a time series by removing changes in the level of a time series, and so eliminating (or reducing) trend and seasonality.

- **Seasonal Differencing**: This is a method where the observation from the same season in the previous cycle are subtracted from the current value. This method is useful when the time series is seasonal.

- **Transformation**: Transformations such as logarithm, square root, cube root, etc. can help to stabilize the variance of a time series. Log transformation is a commonly used technique which can help to reduce positive skewness.

- **Decomposition**: In this approach, both trend and seasonality are modeled separately and the remaining part of the series is returned.

- **Moving Average**: If a time series has a lot of noise, smoothing the time series could help to identify the underlying trend and seasonality. One way to smooth a time series is by taking a moving average, which means that for each time point, you take the average of the points on either side of it.



In [27]:
# Export the GDP-level snapshot (`gdp_original`, which holds only the 'GDP'
# column) to a CSV file; the 'Date' index is written as the first column.
gdp_original.to_csv('data/GDP_Data/SouthAfrica_GDP.csv')

`Hodrick-Prescott filter (HP filter)`: the HP filter provides a way to decompose a time series into its trend and cyclical components.

- The Hodrick-Prescott (HP) filter is a widely used technique in time series analysis for separating a time series into two components: a trend component and a cyclical component. 

- It's commonly applied in economics and finance to analyze economic data and extract underlying trends from noisy data.

    - identifying long-term patterns for forecasting purposes.

    - removing cyclical fluctuations for forecasting purposes.

- 'get_long_term_trend' is designed to compute the long-term trend of a given DataFrame containing time-series data. 

- `Hodrick-Prescott filter (HP filter)`: a widely used method for separating a time series into a trend component and a cyclical component.

- The function begins by taking the `natural logarithm (np.log)` of the input DataFrame. 

- This transformation is often applied to financial or economic data to `stabilize variance`.

- The function sets the `lambda (λ)` parameter for the HP filter. It controls the `smoothness` of the resulting trend.

- If freq is 'Q' (quarterly), lambda is set to 1600.

- If freq is 'M' (monthly), lambda is set to 1600 multiplied by 3^4, which equals 1600*81 = 129,600.
 
- This value implies a smoother trend for higher frequency data.


- it applies the HP filter (sm.tsa.filters.hpfilter) to extract the trend component. 

- The HP filter separates the time series into two components: `trend` and `cycle`. 
