In [124]:
import numpy as np
import pandas as pd
from filterpy.kalman import KalmanFilter
import matplotlib.pyplot as plt

In [125]:
df = pd.read_csv("Cleaned Datasets/Merged_data.csv")
df = df.loc[df["Country"]=="UNITED STATES"]
df["Log GDP"] = np.log(df["GDP"])
df.head()

Unnamed: 0,Country,Code,Year,avg_years_of_schooling,GDP,GENC,Population,Male Population,Female Population,Annual Growth Rate %,...,"Under Age 5 Mortality, Both Sexes","Under Age 5 Mortality, Males","Under Age 5 Mortality, Females",Crude Death Rate,"Deaths, both sexes",Net Migration Rate,"Net international migrants, both sexes",Fertility Rate,Dependency Ratio,Log GDP
3469,UNITED STATES,USA,1960,8.9,543300000000.0,US,180671158,,,,...,,,,,,,,3.65,66.793151,27.020927
3470,UNITED STATES,USA,1965,9.82,743700000000.0,US,194302963,,,,...,,,,,,,,2.91,67.041566,27.334904
3471,UNITED STATES,USA,1970,10.61,1073303000000.0,US,205052174,,,,...,,,,,,,,2.47,61.517115,27.701762
3472,UNITED STATES,USA,1975,11.4,1684904000000.0,US,215973199,,,,...,,,,,,,,1.77,55.178314,28.15273
3473,UNITED STATES,USA,1980,11.94,2857307000000.0,US,227224681,110398730.0,116825951.0,,...,,,,,,,,1.83,50.946063,28.680901


In [126]:
df = df[df['Year'] >= 1990]

variables = ['Log GDP', 'avg_years_of_schooling','Fertility Rate']
df_var = df[variables]
df_var.index = pd.period_range(start='1990', end='2017', freq='Y')

In [137]:
df_var

Unnamed: 0,Log GDP,avg_years_of_schooling,Fertility Rate
1990,29.416619,12.32,2.07
1991,29.448794,12.4,2.06
1992,29.505946,12.5,2.04
1993,29.556518,12.6,2.01
1994,29.617145,12.7,1.99
1995,29.664386,12.7,1.97
1996,29.719561,12.7,1.97
1997,29.78017,12.7,1.97
1998,29.835201,12.7,1.99
1999,29.896026,12.7,2.0


In [127]:
# Define the function to calculate AIC
def calculate_aic(filtered_states, true_state, k, log_l):

    n = len(filtered_states)

    # AIC calculation
    aic = 2 * k * (1 + (k+1)/(n-k)) - 2 * log_l
    return aic


In [128]:
aic = np.inf
filtered_state = []

x_opt = 0
F_opt = 0
H_opt = 0

# Iterate to find best dimentions
for d in range(1, 10):
    log_likelihood = 0
    f = KalmanFilter(dim_x=d, dim_z=3)
    f.x = np.random.normal(size=(d,1))
    f.F = np.random.normal(size=(d,d))
    f.H = np.random.normal(size=(3,d))
    

    # Run Kalman filter
    for z in df_var.values:
        f.predict()
        f.update(z)
        log_likelihood += f.log_likelihood_of(z)
        filtered_state.append(f.x.flatten())

    # Compute and compare AIC
    curr_aic = calculate_aic(filtered_state, df, d, log_likelihood)
    if curr_aic < aic:
        aic = curr_aic
        x_opt = f.x
        F_opt = f.F
        H_opt = f.H

In [129]:
print(x_opt.shape, F_opt.shape, H_opt.shape)

(5, 1) (5, 5) (3, 5)


In [130]:
f = KalmanFilter(dim_x=x_opt.shape[0], dim_z=3)
f.x = x_opt
f.F = F_opt
f.H = H_opt

In [190]:
# Initialize array to store filtered state estimates
filtered_state_estimates = []
data = df_var.values

# Perform Kalman filtering
for measurement in data:
    # Predict step
    f.predict()

    # Update step
    f.update(measurement)

    # Save filtered state estimate
    filtered_state_estimates.append(H_opt@f.x.copy())

print(filtered_state_estimates)

np.exp(filter[:,0]).plot(figsize=(10,6), label="VARMAX Predicted GDP", marker='x', color='b', markersize=4)
plt.plot(np.exp(df_var['Log GDP']), label="Actual GDP", marker='o', color='g', markersize=4)
plt.title('Actual vs Forecast GDP United States')
plt.legend()
plt.grid(True)
plt.show()


[array([[30.60770527],
       [10.55258502],
       [ 0.89838968]]), array([[30.62136007],
       [10.62745436],
       [ 0.72923996]]), array([[30.61650225],
       [10.77741865],
       [ 0.88080092]]), array([[30.68461689],
       [10.88806142],
       [ 0.88171576]]), array([[30.75670262],
       [10.971466  ],
       [ 0.90382497]]), array([[30.80822744],
       [10.96886298],
       [ 0.89032435]]), array([[30.86569846],
       [10.96502269],
       [ 0.88420671]]), array([[30.92550839],
       [10.96655882],
       [ 0.87817789]]), array([[30.98127538],
       [10.96979695],
       [ 0.89001951]]), array([[31.04298604],
       [10.97482306],
       [ 0.88938924]]), array([[31.10640835],
       [10.98288886],
       [ 0.92536844]]), array([[31.14110567],
       [10.98767383],
       [ 0.89494016]]), array([[31.1768045 ],
       [10.97909334],
       [ 0.89070046]]), array([[31.22542599],
       [11.06897411],
       [ 0.91408209]]), array([[31.29117844],
       [11.07006165],
   

TypeError: 'type' object is not subscriptable

In [183]:
from scipy.optimize import minimize

# Data
data = df_var.values

# Define your state-space model
def state_space_model(params):
    F = params[:25].reshape((5, 5))  # State transition matrix
    Q = np.diag(params[25:50].reshape((5, 5)))        # Process noise covariance matrix
    H = params[50:65].reshape((3, 5))  # Observation matrix
    R = np.array([[params[-1]]])     # Measurement noise covariance matrix
    
    # Initial state estimate
    x_est = np.array([[0], [0], [0], [0], [0]])
    P_est = np.eye(5)  # Initial error covariance matrix
    
    log_likelihood = 0
    
    for measurement in data:
        # Prediction step
        x_pred = np.dot(F, x_est)
        P_pred = np.dot(F, np.dot(P_est, F.T)) + Q
        
        # Measurement update step
        y_pred = np.dot(H, x_pred)
        innovation = measurement - y_pred
        S = np.dot(H, np.dot(P_pred, H.T)) + R
        K = np.dot(P_pred, np.dot(H.T, np.linalg.inv(S)))
        
        # Update state estimate and error covariance
        x_est = x_pred + np.dot(K, innovation)
        P_est = P_pred - np.dot(K, np.dot(H, P_pred))
        
        # Compute log-likelihood
        log_likelihood += -0.5 * np.log(np.linalg.det(S)) - 0.5 * np.dot(innovation.T, np.dot(np.linalg.inv(S), innovation))
    
    return -log_likelihood


# Initial guess for parameters (F, Q, H, R)
initial_params = np.empty(66)
initial_params[:25] = (np.random.normal(size=(5,5)).reshape(-1))
initial_params[25:50] = ((np.eye(5).reshape(-1))*.1)
initial_params[50:65] = (np.random.normal(size=(3,5)).reshape(-1))
initial_params[65] = (np.random.normal(size=(1))*.1)


# Perform maximum likelihood estimation
result = minimize(state_space_model, initial_params, method='Nelder-Mead')

# Extract estimated parameters
estimated_params = result.x
F_est = estimated_params[:4].reshape((2, 2))
Q_est = np.diag(estimated_params[4:6])
H_est = estimated_params[6:8].reshape((1, 2))
R_est = np.array([[estimated_params[8]]])

print("Estimated F:")
print(F_est)
print("Estimated Q:")
print(Q_est)
print("Estimated H:")
print(H_est)
print("Estimated R:")
print(R_est)


[[nan nan nan]
 [nan nan nan]
 [nan nan nan]]


  log_likelihood += -0.5 * np.log(np.linalg.det(S)) - 0.5 * np.dot(innovation.T, np.dot(np.linalg.inv(S), innovation))


ValueError: The user-provided objective function must return a scalar value.