In [2]:
import pandas as pd

# Load the dataset, list its columns, and preview the first rows.
DATA_PATH = "../data/data.csv"
df = pd.read_csv(DATA_PATH)
print(df.columns)
df.head()

Index(['id', 'Country', 'year', 'goal1', 'goal2', 'goal3', 'goal4', 'goal5',
       'goal6', 'goal7', 'goal8', 'goal9', 'goal10', 'goal11', 'goal12',
       'goal13', 'goal14', 'goal15', 'goal16', 'goal17'],
      dtype='object')


Unnamed: 0,id,Country,year,goal1,goal2,goal3,goal4,goal5,goal6,goal7,goal8,goal9,goal10,goal11,goal12,goal13,goal14,goal15,goal16,goal17
0,AFG,Afghanistan,2000,8.0,27,19,2,18,14,19,44,5,,34,96,99,,56,42,36
1,AFG,Afghanistan,2001,8.0,30,19,2,18,13,22,44,5,,31,96,99,,56,42,36
2,AFG,Afghanistan,2002,9.0,30,20,2,18,16,21,44,5,,32,95,99,,56,42,36
3,AFG,Afghanistan,2003,10.0,32,20,2,18,17,24,44,5,,32,95,98,,56,42,36
4,AFG,Afghanistan,2004,10.0,31,22,2,18,16,27,44,5,,33,96,98,,56,42,36


In [3]:
from statsmodels.tsa.stattools import adfuller

def stationary_test_adf(df, goals=None, alpha=0.05):
    """
    Classify each goal series of one country with the Augmented Dickey-Fuller test.

    Every goal column ends up in exactly one of four categories:
    missing values, constant, stationary, or non-stationary.

    Parameters:
    - df: DataFrame holding the rows of a single country. It must contain a
      'Country' column and one column per goal. Only the first distinct value
      of 'Country' is reported back.
    - goals: Names of the goal columns to test. Defaults to 'goal1' ... 'goal17'.
    - alpha: Significance level compared against the ADF p-value. Default is 0.05.

    Returns:
    - country: First distinct value found in df['Country'].
    - quantities: [n_stationary, n_non_stationary, n_constant, n_missing].
    - lists: The goal names behind each count, in the same order as `quantities`.
    """
    if goals is None:
        goals = [f'goal{i}' for i in range(1, 18)]

    country = df['Country'].unique()[0]
    list_stationary, list_non_stationary = [], []
    list_constant, list_missing_values_goal = [], []

    for goal in goals:
        series = df[goal]
        if series.isnull().sum() > 0:
            # Series with gaps are set aside instead of being tested.
            list_missing_values_goal.append(goal)
        elif len(series.unique()) == 1:
            # A constant series has zero variance, so it is never passed to adfuller.
            list_constant.append(goal)
        elif adfuller(series)[1] <= alpha:
            # The ADF null hypothesis is that the series has a unit root
            # (is non-stationary). A p-value at or below alpha rejects it.
            list_stationary.append(goal)
        else:
            # The unit-root hypothesis could not be rejected at this level.
            list_non_stationary.append(goal)

    lists = [list_stationary, list_non_stationary, list_constant, list_missing_values_goal]
    quantities = [len(goal_names) for goal_names in lists]
    return country, quantities, lists

In [4]:
import warnings
# This filter is global: RuntimeWarning stays silenced for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

N_GOALS = 17  # number of goal columns classified per country


def share_of_total(count, total):
    """Return `count` as a percentage of `total` (0.0 when `total` is zero)."""
    return count * 100 / total if total else 0.0


countries = df['Country'].unique()
total_country, total_missing_values_goal, total_constant = 0, 0, 0
# Collect (country, goal) pairs in plain lists and build each DataFrame once
# after the loop, instead of concatenating a new frame on every iteration.
stationary_records, non_stationary_records = [], []
for country in countries:
    country_data = df[df['Country'] == country]
    country, quantities, lists = stationary_test_adf(country_data)
    total_country += 1
    total_constant += quantities[2]
    total_missing_values_goal += quantities[3]
    stationary_records.extend((country, goal_name) for goal_name in lists[0])
    non_stationary_records.extend((country, goal_name) for goal_name in lists[1])

stationary_df = pd.DataFrame(stationary_records, columns=['country', 'goal'])
non_stationary_df = pd.DataFrame(non_stationary_records, columns=['country', 'goal'])

total_models = total_country * N_GOALS
# Every (country, goal) pair must land in exactly one of the four categories.
classified_models = (
    stationary_df.shape[0] + non_stationary_df.shape[0]
    + total_missing_values_goal + total_constant
)
assert classified_models == total_models, "category counts do not add up to the number of models"

print(f"{total_country} countries - {N_GOALS} goals - {total_models} models")
print(f"{'='*50}")
print(f"Total stationary: {stationary_df.shape[0]}, or {share_of_total(stationary_df.shape[0], total_models):.2f}% of the total.")
print(f"Total non-stationary: {non_stationary_df.shape[0]}, or {share_of_total(non_stationary_df.shape[0], total_models):.2f}% of the total.")
print(f"Total missing values: {total_missing_values_goal}, or {share_of_total(total_missing_values_goal, total_models):.2f}% of the total.")
print(f"Total constant: {total_constant}, or {share_of_total(total_constant, total_models):.2f}% of the total.")

167 countries - 17 goals - 2839 models
Total stationary: 526, or 18.53% of the total.
Total non-stationary: 2195, or 77.32% of the total.
Total missing values: 70, or 2.47% of the total.
Total constant: 48, or 1.69% of the total.


In [5]:
# `country_data` is the variable left over from the loop in the previous cell,
# so this previews the rows of the last country that loop processed.
country_data.head(5)

Unnamed: 0,id,Country,year,goal1,goal2,goal3,goal4,goal5,goal6,goal7,goal8,goal9,goal10,goal11,goal12,goal13,goal14,goal15,goal16,goal17
3984,ZWE,Zimbabwe,2000,74.0,47,27,63,58,58,35,63,22,37.0,82,95,96,,70,42,44
3985,ZWE,Zimbabwe,2001,74.0,47,24,65,56,57,36,64,22,37.0,79,95,97,,70,42,44
3986,ZWE,Zimbabwe,2002,68.0,45,24,63,57,57,37,64,22,37.0,80,95,97,,72,42,44
3987,ZWE,Zimbabwe,2003,55.0,46,23,63,58,57,39,64,22,37.0,79,96,97,,72,43,44
3988,ZWE,Zimbabwe,2004,49.0,49,22,63,58,57,40,64,22,37.0,80,96,97,,72,43,44


In [6]:
import pandas as pd

def filter_country_goal(df_to_filter_from, country, goal):
    """
    Select the rows of one country and keep only the columns for one goal.

    Parameters:
    - df_to_filter_from: DataFrame with at least 'Country', 'year' and `goal` columns.
    - country: Value matched against the 'Country' column.
    - goal: Name of the goal column to keep.

    Returns:
    - DataFrame with the columns ['Country', 'year', goal] for the matching rows.
    """
    is_requested_country = df_to_filter_from['Country'] == country
    return df_to_filter_from.loc[is_requested_country, ['Country', 'year', goal]]

# Treating non-stationarity
To transform non-stationary goals into stationary ones, I take the first difference of each series (each year's value minus the previous year's) using the `diff()` method, and drop the first value, which has no previous value to subtract from it. When differencing is not enough, the log of the ratio between consecutive years is tried as a fallback.


In [7]:
from statsmodels.stats.diagnostic import acorr_ljungbox

def mini_stationary_test_ljungbox(df, lags=10):
    """
    Run the Ljung-Box test on a single series and classify it by its p-value.

    The Ljung-Box null hypothesis is that the series has no autocorrelation up
    to the given lag. This notebook counts a series as "stationary" when that
    null hypothesis is not rejected at the 5% level.

    Parameters:
    - df: Series or single-column DataFrame holding one time series.
    - lags: Lag at which the Ljung-Box statistic is evaluated. Default is 10.

    Returns:
    - (1, 0) when the p-value is greater than 0.05 (no significant autocorrelation).
    - (0, 1) otherwise, including when the p-value is NaN.
    - (None, None) when the test raises an exception.
    """
    try:
        result = acorr_ljungbox(df, lags=[lags], return_df=True)
        p_value = result['lb_pvalue'].iloc[-1]
        return (1, 0) if p_value > 0.05 else (0, 1)
    except Exception:
        # Best effort: a series the test cannot handle is reported as undecided.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running loop can still be interrupted.
        return None, None

In [8]:
import numpy as np

def treating_non_stationarity_with_diff(df, goal):
    """Return the first difference of df[goal], without the leading NaN."""
    differenced = df[goal].diff()
    return differenced.dropna()
def treating_non_stationarity_with_log(df, goal):
    """
    Return the log of the ratio between consecutive values of df[goal].

    Only finite results are kept. The first entry (no previous value) is NaN,
    and a zero value in the series produces -inf or +inf; `dropna()` alone
    would keep the infinities, so all non-finite entries are removed.
    Dropping them can leave gaps in the returned index.
    """
    series = df[goal]
    # Non-finite results are filtered out below, so the floating-point
    # warnings that log(0) and log(negative) would emit are not useful here.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_ratio = np.log(series / series.shift(1))
    return log_ratio[np.isfinite(log_ratio)]

became_stationary = 0
still_non_stationary = 0
# Transformations are tried in this order; the first one that passes the
# Ljung-Box check counts the series as having become stationary.
transformations = (treating_non_stationarity_with_diff, treating_non_stationarity_with_log)

for country in non_stationary_df['country'].unique():
    is_current_country = non_stationary_df['country'] == country
    goals_for_country = non_stationary_df.loc[is_current_country, 'goal'].unique()
    for goal in goals_for_country:
        goal_data_filtered = filter_country_goal(df, country, goal)
        for transform in transformations:
            treated_country_goal = transform(goal_data_filtered, goal)
            stationary, non_stationary = mini_stationary_test_ljungbox(treated_country_goal)
            if stationary == 1:
                became_stationary += 1
                break
        else:
            # No transformation passed the check for this country/goal pair.
            still_non_stationary += 1

total_non_stationary = non_stationary_df.shape[0]
print(f"{'='*50}")
print(f"Total became stationary: {became_stationary}, or {became_stationary*100/total_non_stationary:.2f}% of the total.")
print(f"Total still non-stationary: {still_non_stationary}, or {still_non_stationary*100/total_non_stationary:.2f}% of the total.")

Total became stationary: 1961, or 89.34% of the total.
Total still non-stationary: 234, or 10.66% of the total.
