In [4]:
import numpy as np
import pandas as pd
from filterpy.kalman import KalmanFilter

In [5]:
df = pd.read_csv("Cleaned Datasets/Merged_data.csv")
df = df.loc[df["Country"]=="UNITED STATES"]
df["Log GDP"] = np.log(df["GDP"])
df.head()

Unnamed: 0,Country,Code,Year,avg_years_of_schooling,GDP,GENC,Population,Male Population,Female Population,Annual Growth Rate %,...,"Under Age 5 Mortality, Both Sexes","Under Age 5 Mortality, Males","Under Age 5 Mortality, Females",Crude Death Rate,"Deaths, both sexes",Net Migration Rate,"Net international migrants, both sexes",Fertility Rate,Dependency Ratio,Log GDP
3469,UNITED STATES,USA,1960,8.9,543300000000.0,US,180671158,,,,...,,,,,,,,3.65,66.793151,27.020927
3470,UNITED STATES,USA,1965,9.82,743700000000.0,US,194302963,,,,...,,,,,,,,2.91,67.041566,27.334904
3471,UNITED STATES,USA,1970,10.61,1073303000000.0,US,205052174,,,,...,,,,,,,,2.47,61.517115,27.701762
3472,UNITED STATES,USA,1975,11.4,1684904000000.0,US,215973199,,,,...,,,,,,,,1.77,55.178314,28.15273
3473,UNITED STATES,USA,1980,11.94,2857307000000.0,US,227224681,110398730.0,116825951.0,,...,,,,,,,,1.83,50.946063,28.680901


In [6]:
df = df[df['Year'] >= 1990]

variables = ['Log GDP', 'avg_years_of_schooling','Fertility Rate']
df_var = df[variables]
df_var.index = pd.period_range(start='1990', end='2017', freq='Y')

In [47]:
# Define the function to calculate AIC
def calculate_aic(filtered_states, true_state, k, log_l):

    n = len(filtered_states)

    # AIC calculation
    aic = 2 * k * (1 + (k+1)/(n-k)) - 2 * log_l
    return aic


In [50]:
aic = np.inf
filtered_state = []

x_opt = 0
F_opt = 0
H_opt = 0

# Iterate to find best dimentions
for d in range(1, 10):
    log_likelihood = 0
    f = KalmanFilter(dim_x=d, dim_z=3)
    f.x = np.random.normal(size=(d,1))
    f.F = np.random.normal(size=(d,d))
    f.H = np.random.normal(size=(3,d))

    # Run Kalman filter
    for z in df_var.values:
        f.predict()
        f.update(z)
        log_likelihood += f.log_likelihood_of(z)
        filtered_state.append(f.x.flatten())

    # Compute and compare AIC
    curr_aic = calculate_aic(filtered_state, df, d, log_likelihood)
    if curr_aic < aic:
        aic = curr_aic
        x_opt = f.x
        F_opt = f.F
        H_opt = f.H

[[ 11.73455132]
 [  2.03456756]
 [-15.5567557 ]
 [  4.65263002]
 [-49.29675596]
 [  9.31841649]] [[-1.73997252  1.51580046 -0.38780169  0.5675808  -0.16583849  0.64925621]
 [ 0.86179841  0.4350011   0.65194905 -0.15795424  1.13550239  1.06122275]
 [-2.27680533 -1.04146816 -2.54783578 -1.33801212 -1.21315997  0.45078615]
 [-0.2302062  -0.01015885 -0.36735439 -1.66106287 -1.14888492  0.67170912]
 [-1.97927418 -1.2724342  -1.21923771 -1.26701063  1.46172966  1.53227969]
 [ 0.4942538  -0.86498705  1.54316552 -0.43275028 -1.31088049 -0.30476702]] [[-1.85696398  1.50616603  0.34057909  0.24586345 -0.82497983  1.2474067 ]
 [ 1.28721869  1.25511261  0.86371816 -0.07221588 -0.31025802 -0.50858606]
 [ 0.68120697  0.0154467  -0.49775589 -1.01813585  0.20290007 -0.25832258]]
