In [79]:
import pandas as pd
import numpy as np
import statsmodels as sm
from statsmodels.tsa.api import VAR
from scipy.signal import detrend
import datetime as dt


from pandas.plotting import register_matplotlib_converters
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt

# Plot setup: register the pandas date converters with matplotlib, then apply
# the notebook-wide rc defaults (figure size and font size).
register_matplotlib_converters()
for rc_group, rc_options in (("figure", {"figsize": (12, 8)}), ("font", {"size": 13})):
    plt.rc(rc_group, **rc_options)

In [80]:
# Load the raw table from "gdp_pc.csv" (path is relative to the working directory).
# Presumably GDP per capita, going by the file name -- TODO confirm the source.
# Later cells read the "Country Name", "Country Code" and "Year" columns of this frame.
data = pd.read_csv("gdp_pc.csv")

In [81]:
# Split the table into one frame per country so individual countries can be
# removed later on.
#
# A single groupby pass replaces the previous "filter the whole table once per
# name" approach. `sort=False` keeps the groups in order of first appearance,
# i.e. the same order `data["Country Name"].unique()` produced. Rows without a
# country name are skipped by groupby instead of yielding an empty frame.
#
# countries_list  : per-country frames without the "Country Code" column
# _countries_dict : country name -> frame without "Country Code"/"Country Name"
countries_list = []
_countries_dict = {}
for name, group in data.groupby("Country Name", sort=False):
    frame = group.drop(columns=["Country Code"])
    countries_list.append(frame)
    _countries_dict[name] = frame.drop(columns=["Country Name"])

In [82]:
# Keep only the countries that have at least `min_len` rows; everything else
# is left behind in `_countries_dict`.
min_len = 50
len_before = len(_countries_dict)

countries_dict = {
    name: frame
    for name, frame in _countries_dict.items()
    if len(frame) >= min_len
}

len_after = len(countries_dict)
print(f"Removed {len_before - len_after} entries of {len_before}. New len is {len_after}")

Removed 117 entries of 256. New len is 139


In [83]:
# Find the date window shared by every remaining country: the latest first
# year and the earliest last year. Each series is then cut to that window.
# The first/last row of each frame is taken as its first/last year, so the rows
# are assumed to be in chronological order -- TODO confirm against the CSV.
latest_first = 1000  # lower sentinel, replaced by the largest first year seen
earlist_last = 3000  # upper sentinel, replaced by the smallest last year seen

for country in countries_dict.values():
    year_col = country["Year"]
    latest_first = max(latest_first, int(year_col.iloc[0]))
    earlist_last = min(earlist_last, int(year_col.iloc[-1]))

print(f"The data will be drawn from {latest_first} to {earlist_last}")

# Build new, trimmed frames instead of dropping rows in place: the frames in
# `countries_dict` are the very same objects stored in `_countries_dict`, so an
# in-place drop would silently trim the untrimmed dict as well and make the
# result depend on how often this cell has been run.
countries_dict = {
    key: country[
        (country["Year"] >= latest_first) & (country["Year"] <= earlist_last)
    ]
    for key, country in countries_dict.items()
}

# Every year in the shared window, both ends inclusive
years = list(range(latest_first, earlist_last + 1))

The data will be drawn from 1967 to 2013


In [84]:
# Remove countries that do not have exactly one row per year of the shared window.
#
# The expected length comes from `years` (the inclusive window built in the
# previous cell) rather than from the longest series present: taking the
# maximum would accept a gap that every country shares, would reject all
# complete series if a single one carried a duplicated year, and fails on an
# empty dict. The frame built from `years` further down needs this exact length.
num_years = len(years)
del_keys = [
    key for key, country in countries_dict.items() if len(country) != num_years
]

# Delete after the scan; a dict must not change size while it is being iterated
for key in del_keys:
    del countries_dict[key]

print(f"Deleted {len(del_keys)} of {len_after} entries")

Deleted 3 of 139 entries


In [85]:
# Detrend every country's series so that it is closer to a stationary process.
# scipy's `detrend` fits a straight line by least squares and subtracts it.
# With breakpoints the fit is piecewise: one line per segment between
# consecutive breakpoints (positions are row offsets into the series).
breakpoints = [10, 20, 30, 40]  # set to None for a single line over the whole series

# bp=0 is scipy's default and means "no extra breakpoints", so it stands in
# for the call without a `bp` argument.
countries_detrend = {
    name: detrend(
        frame.values[:, 1],  # second column of the per-country frame (the value series)
        bp=0 if breakpoints is None else breakpoints,
    )
    for name, frame in countries_dict.items()
}

# One column per country, one row per year of the shared window
detrended_data = pd.DataFrame(countries_detrend, index=years)

# Only look at countries, since the amount of data is too large otherwise.
# Every column from "Algeria" onwards is kept; the columns before it are
# presumably regional/aggregate entries -- TODO confirm against the CSV ordering.
#
# The earlier loop concatenated `detrended_data` (the full table) with the
# current column on every iteration, so it ended up with *all* columns plus a
# duplicate of the last one instead of the intended subset. A cumulative mask
# selects the subset directly: False before the first match, True from it on.
first_country = "Algeria"
from_first_country = (detrended_data.columns == first_country).cumsum() > 0
countries_data = detrended_data.loc[:, from_first_country].copy()

# Turn the integer year index into timestamps (1 January of each year)
countries_data.index = pd.to_datetime(
    [dt.date(year=int(val), month=1, day=1) for val in countries_data.index.values]
)

## Grouping of the data
There are 104 countries in the table, which is too many to generate the data we want from all of them at once. Instead, we are going to create 13 groups with 8 countries in each, which avoids a combinatorial explosion in the amount of data.

The code below is used to determine the groupings of the data by showing which countries share a high amount of mutual information.

In [237]:
from generator import varp

samples = 7
# Seed for the random choice of columns, so that Restart & Run All reproduces
# the same subset. Change the value to look at another subset, or set it to
# None for a fresh draw on every run.
sample_seed = 0

# Randomly pick `samples` country columns
subset = countries_data.sample(samples, axis=1, random_state=sample_seed)

# Fit a first-order VAR to the subset and pull out the lag coefficients and
# the residual covariance matrix
model = VAR(subset)
results = model.fit(1)
coefs = results.coefs
var = results.sigma_u.to_numpy()

# NOTE(review): simulate_VAR is project code that is not visible here; it is
# called with the fitted coefficients, the residual covariance and the number
# of rows in `countries_data`. Any randomness inside it is not controlled by
# `sample_seed` -- TODO confirm how it draws its noise.
sim_result, cov = varp.simulate_VAR(coefs, var, len(countries_data))

# Simulated series on the left, the real (detrended) subset on the right
fig, ax = plt.subplots(nrows=1, ncols=2)
ax[0].plot(sim_result, label=subset.columns)
ax[0].legend()
ax[0].set_title("Sim")
subset.plot(ax=ax[1])
ax[1].set_title("Real")
plt.show()

  self._init_dates(dates, freq)
