In [32]:
import pandas as pd
import statsmodels as sm
from scipy.signal import detrend
import matplotlib.pyplot as plt

In [2]:
# Load the per-country GDP table. Later cells rely on this file providing
# "Country Name", "Country Code" and "Year" columns.
gdp_csv_path = "gdp_pc.csv"
data = pd.read_csv(gdp_csv_path)

In [3]:
# Split the table into one frame per country so individual countries can be
# removed later.
# A single groupby pass replaces re-scanning the whole table once per country.
# sort=False keeps the countries in order of first appearance, which is the
# order iterating over .unique() produced.  Rows with a missing "Country Name"
# are skipped by groupby instead of producing an empty frame with no usable key.
countries_list = [
    group.drop(columns=["Country Code"])
    for _, group in data.groupby("Country Name", sort=False)
]

# Turn the list into a dict keyed by country name; the name column is dropped
# from the stored frame because the key already carries it.
_countries_dict = {
    country["Country Name"].iloc[0]: country.drop(columns=["Country Name"])
    for country in countries_list
}

In [4]:
# Keep only the countries that have at least min_len rows, then report how
# many were filtered out.
min_len = 50
len_before = len(_countries_dict)

countries_dict = {
    name: frame
    for name, frame in _countries_dict.items()
    if len(frame) >= min_len
}

len_after = len(countries_dict)
print(f"Removed {len_before - len_after} entries of {len_before}. New len is {len_after}")

Removed 117 entries of 256. New len is 139


In [58]:
# Find the date window shared by every country.
# The window runs from the latest first entry to the earliest last entry, and
# every series is then cut to fit it.
# NOTE: the first/last row of each frame is taken as its first/last year, so
# this relies on each country's rows being ordered by "Year".
first_years = []
last_years = []
for country in countries_dict.values():
    # Convert the column once and read both ends from the same list.
    year_values = country["Year"].to_list()
    first_years.append(year_values[0])
    last_years.append(year_values[-1])

# 1000 and 3000 are sentinel bounds: they seed the comparison and are the
# result when countries_dict is empty.
latest_first = max([1000, *first_years])
earlist_last = min([3000, *last_years])

print(f"The data will be drawn from {latest_first} to {earlist_last}")

# Build trimmed copies instead of dropping rows in place.  The frames in
# countries_dict are the very same objects stored in _countries_dict, so an
# in-place drop would shorten those as well, and re-running the min_len filter
# afterwards would see the already-trimmed series.
trimmed = {}
for key, country in countries_dict.items():
    year_col = country["Year"]
    outside_window = (year_col < latest_first) | (year_col > earlist_last)
    trimmed[key] = country.drop(index=country[outside_window].index)
countries_dict = trimmed

years = list(range(latest_first, earlist_last + 1))

The data will be drawn from 1967 to 2013


In [59]:
# Remove countries that do not have one entry for every year of the window.
# The expected count is the length of `years` (the full window), not the
# length of the longest series present: if every country shared the same gap
# the longest series would be short too and nothing would be flagged, while a
# frame with extra rows would cause every complete series to be deleted.
num_years = len(years)

del_keys = [
    key for key, country in countries_dict.items() if len(country) != num_years
]

# Collected first, deleted afterwards, so the dict is not resized while it is
# being iterated.
for key in del_keys:
    del countries_dict[key]

print(f"Deleted {len(del_keys)} of {len_after} entries")

Deleted 0 of 139 entries


In [69]:
# Make every country's series stationary by removing its trend: a first order
# polynomial is fitted with least squares and subtracted from the data.
# When breakpoints are specified, several polynomials are fitted to the data
# and removed from it instead of a single one.
# breakpoints = [10, 20, 30, 40]
breakpoints = None

# Only pass bp through when breakpoints were actually chosen.
detrend_kwargs = {} if breakpoints is None else {"bp": breakpoints}

# The series is read from the second column (position 1) of each frame --
# presumably the GDP value next to "Year"; TODO confirm the column layout.
countries_detrend = {
    key: detrend(country.values[:, 1], **detrend_kwargs)
    for key, country in countries_dict.items()
}

new_data = pd.DataFrame(countries_detrend, index=years)