# Data generation for VAR data for ML

In [1]:
import pandas as pd
import numpy as np
import statsmodels as sm
from statsmodels.tsa.api import VAR
from scipy.signal import detrend
import datetime as dt

# Plotting setup.
# NOTE(review): the original author tagged this section "remove"; none of the
# cells visible here draw a figure -- confirm nothing else relies on it before
# deleting.
from pandas.plotting import register_matplotlib_converters
import matplotlib
# Select matplotlib's TkAgg backend.
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Default figure size and font size for every figure created afterwards.
plt.rc("figure", figsize=(12, 8))
plt.rc("font", size=13)

## Data
1. Import the data.
2. Remove entries that have fewer than 50 data points.
3. Limit the remaining entries to the time window in which all of them have data.
4. Remove trends from the data by fitting first-order polynomials to the data in certain ranges and subtracting them from the data.

In [2]:
data = pd.read_csv("gdp_pc.csv")

# Split the table into one frame per "Country Name" so that individual entries
# can be removed. A single groupby pass replaces re-filtering the whole table
# once per name; sort=False keeps the entries in order of first appearance.
countries_list = [
    group.drop(columns=["Country Code"])
    for _, group in data.groupby("Country Name", sort=False)
]

# Turn the list into a dict keyed by name; the name column itself is dropped
# from the stored frames.
_countries_dict = {
    country["Country Name"].iloc[0]: country.drop(columns=["Country Name"])
    for country in countries_list
}

# Remove entries that do not have at least min_len rows.
min_len = 50
len_before = len(_countries_dict)
countries_dict = {
    key: country
    for key, country in _countries_dict.items()
    if len(country) >= min_len
}

len_after = len(countries_dict)
print(f"Removed {len_before - len_after} entries of {len_before}. New len is {len_after}")

# Find the limits of the dates.
# The common window opens at the latest first year and closes at the earliest
# last year over all entries; every series is then cut to fit these dates.
# min()/max() of the "Year" column are used so the result does not depend on
# the rows being sorted and no sentinel start values are needed.
if not countries_dict:
    raise ValueError("No entries left to compute a common date window from")

latest_first = max(int(country["Year"].min()) for country in countries_dict.values())
earlist_last = min(int(country["Year"].max()) for country in countries_dict.values())

print(f"The data will be drawn from {latest_first} to {earlist_last}")

# Trim every entry to the window by building a filtered frame instead of
# dropping rows in place. list(...) snapshots the items so the dict values can
# be reassigned inside the loop.
for key, country in list(countries_dict.items()):
    in_window = (country["Year"] >= latest_first) & (country["Year"] <= earlist_last)
    countries_dict[key] = country[in_window]

years = list(range(latest_first, earlist_last + 1))

# Remove entries that have missing years in the range specified.
# The expected row count is the length of the window itself (`years`), not the
# length of the longest entry: an entry with more rows than the window would
# otherwise become the reference and cause every correct entry to be deleted.
num_years = len(years)
del_keys = [
    key for key, country in countries_dict.items() if len(country) != num_years
]

for key in del_keys:
    del countries_dict[key]

print(f"Deleted {len(del_keys)} of {len_after} entries")

# Detrend every entry so that each process can be treated as stationary.
# scipy.signal.detrend fits a first-order polynomial by least squares and
# subtracts it; when breakpoints are given, a separate polynomial is fitted to
# and removed from each segment between them.
breakpoints = [10, 20, 30, 40]
# breakpoints = None

# bp=0 is detrend's own default, i.e. a single fit over the whole series.
detrend_bp = 0 if breakpoints is None else breakpoints

# Column 1 of each per-entry frame is the series that gets detrended
# (presumably the value column next to "Year" -- confirm against the CSV header).
countries_detrend = {
    name: detrend(frame.values[:, 1], bp=detrend_bp)
    for name, frame in countries_dict.items()
}

detrended_data = pd.DataFrame(countries_detrend, index=years)

# Only look at countries, since the amount of data is too large otherwise:
# keep the columns from the first country onwards.
# The previous loop concatenated `detrended_data` (rather than the frame being
# built) with the current column on every pass, so its result was always the
# full table plus a duplicate of the last column -- which is why nothing was
# ever filtered out.
# NOTE(review): this assumes every column before "Algeria" is a non-country
# aggregate and every column from it onwards is a country -- confirm against
# the column order of the CSV.
first_country = "Algeria"
column_names = list(detrended_data.columns)
if first_country not in column_names:
    raise ValueError(
        f"Column {first_country!r} not found; cannot tell where the countries start"
    )
countries_data = detrended_data.iloc[:, column_names.index(first_country):].copy()

# Replace the integer year index with timestamps for 1 January of each year.
countries_data.index = pd.to_datetime(
    [dt.date(year=int(year), month=1, day=1) for year in countries_data.index.values]
)

Removed 117 entries of 256. New len is 139
The data will be drawn from 1967 to 2013
Deleted 3 of 139 entries


## Data setup

In [3]:
import pickle
from generator import varp
import itertools
from math import floor


def generate_data(data, model_order, samples, choose, N):
    """Fit VAR models on random groups of columns and pickle simulated data.

    The columns of ``data`` are shuffled once and cut into
    ``floor(n_columns / samples)`` disjoint groups of ``samples`` columns.
    For every ``choose``-sized combination of columns inside a group, a VAR
    model of order ``model_order`` is fitted; its coefficients and residual
    covariance are handed to ``varp.simulate_VAR``, and ``varp.calculate_MI``
    is evaluated for every pair of the chosen columns to fill a symmetric
    ``choose x choose`` matrix (the diagonal stays zero).

    The results are written to
    ``data/order_{model_order} samples_{samples} choose{choose}.pkl`` as a dict
    mapping ``"colA - colB - ..."`` to ``(sim_result, mis)``.

    Parameters
    ----------
    data : pandas.DataFrame
        One series per column. It is not modified.
    model_order : int
        Lag order passed to ``VAR.fit``.
    samples : int
        Number of columns per group.
    choose : int
        Number of columns per fitted model, taken from within a group.
    N : int
        Forwarded to ``varp.simulate_VAR`` (presumably the number of simulated
        samples -- confirm in ``generator.varp``).

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output directory

    save_str = f"order_{model_order} samples_{samples} choose{choose}"
    save_dict = dict()

    # Shuffle the columns once and take consecutive chunks, so the groups are
    # disjoint. The previous version discarded the result of `data.drop(...)`,
    # which meant every iteration sampled from the full table again and groups
    # could overlap (overwriting each other's keys in `save_dict`).
    shuffled = data.sample(frac=1, axis=1)
    iterations = floor(len(shuffled.columns) / samples)

    for i in range(iterations):
        group = shuffled.iloc[:, i * samples:(i + 1) * samples]
        cols = group.columns
        subset = group.to_numpy()

        for comb in itertools.combinations(range(samples), choose):
            picked = list(comb)

            model = VAR(subset[:, picked])
            result = model.fit(model_order)

            sim_result, cov = varp.simulate_VAR(result.coefs, result.sigma_u, N)

            # Symmetric matrix with one calculate_MI value per column pair.
            mis = np.zeros((choose, choose))
            for first, second in itertools.combinations(range(choose), 2):
                mi = varp.calculate_MI(first, second, cov)
                mis[first, second] = mi
                mis[second, first] = mi

            combi_key = " - ".join(cols[picked])
            save_dict[combi_key] = (sim_result, mis)

    # Make sure the output directory exists so the run does not fail at the
    # very end, after all models have been fitted.
    os.makedirs("data", exist_ok=True)
    with open(f"data/{save_str}.pkl", 'wb') as f:
        pickle.dump(save_dict, f)

## Generate the data

In [4]:
# One pickle file is produced per configuration; `choose` and `N` are shared.
run_configs = [
    {"model_order": 2, "samples": 5},
    {"model_order": 1, "samples": 7},
]

for config in run_configs:
    generate_data(data=countries_data, choose=3, N=500, **config)