In [1]:
import pandas as pd
import math
import numpy as np

## Test 7.1: Fit a Normal Distribution From Data

In [2]:
# def fit_univariate_normal_from_data(x: pd.DataFrame):

#     mu = x.mean().item()
#     sigma = x.std().item()
#     return mu, sigma
    
def fit_normal_dist_from_data(x: pd.DataFrame):
    mu_vector = x.mean()
    covariance_matrix = x.cov()

    return mu_vector, covariance_matrix

In [3]:
x = pd.read_csv("testfiles/data/test7_1.csv")

In [9]:
y = np.concat([x,x], axis=1)

In [11]:
np.mean(y, axis=0)

(2,)

In [13]:
# x = x.to_numpy()
np.sqrt(np.cov(y, rowvar=False)) #unbiased by default

array([[0.04677994, 0.04677994],
       [0.04677994, 0.04677994]])

In [4]:
mu_vector, covariance_matrix = fit_normal_dist_from_data(x)

# get the univariate results for this data
mu = mu_vector.iloc[0]
sigma = math.sqrt(covariance_matrix.iloc[0,0])

In [6]:
print(mu, sigma)

0.04602573645286826 0.046779939563919654


In [5]:
# check that these values are correct (within a small margin due to precision)
error_epsilon = 1e-12
test_vals = pd.read_csv("testfiles/data/testout7_1.csv")
assert abs(mu - test_vals.loc[0, 'mu']) < error_epsilon, "Output mu and test mu do not match"
assert abs(sigma - test_vals.loc[0, 'sigma']) < error_epsilon, 'Output sigma and test sigma do not match'

## Test 7.2: Fit a T Distribution

In [30]:
x = pd.read_csv("testfiles/data/test7_2.csv")

#### Attempt to do t-fitting by hand

##### I found that since nu and sigma both deal with how wide our distribution is, any changes that would be made to nu often just get overpowered by changes to sigma. As such, my nu value was never being optimized away from the initial value

In [17]:
import numpy as np
from scipy.special import gamma, gammaln
from scipy.optimize import minimize

In [19]:
def t_log_likelihood(parameters, x):
    mu, sigma, nu = parameters
    log_likelihood = np.sum(
        gammaln((nu + 1) / 2) - gammaln(nu / 2) - np.log(sigma) - 0.5*np.log(np.pi*nu)
        - (nu+1)/2 * np.log(1 + ((x - mu)/sigma)**2 / nu)
    )

    return -log_likelihood

In [15]:
def fit_t_dist_from_data(x: pd.DataFrame):

    mu_guess = x.mean().item()
    sigma_guess = x.std().item()
    nu_guess = 10

    initial_params = [mu_guess, sigma_guess, nu_guess]

    # Minimize negative log-likelihood
    res = minimize(t_log_likelihood, initial_params, args=(x.values,),
                bounds=[(None, None), (1e-5, None), (1e-5, None)])

    mu, sigma, nu = res.x
    return mu, sigma, nu

In [18]:
mu, sigma, nu = fit_t_dist_from_data(x)

NameError: name 't_log_likelihood' is not defined

In [14]:
error_epsilon = 1e-5
test_vals = pd.read_csv("testfiles/data/testout7_2.csv")
print(mu, test_vals.loc[0, 'mu'])
print(sigma, test_vals.loc[0, 'sigma'])
print(nu, test_vals.loc[0, 'nu'])
assert abs(mu - test_vals.loc[0, 'mu']) < error_epsilon, "Output mu and test mu do not match"
assert abs(sigma - test_vals.loc[0, 'sigma']) < error_epsilon, 'Output sigma and test sigma do not match'
assert abs(nu - test_vals.loc[0, 'nu']) < error_epsilon, 'Output nu and test nu do not match'

0.046209336843006064 0.0459403982963629
0.04802717250457455 0.045442845180787
10.000000860912918 6.336874964943411


AssertionError: Output mu and test mu do not match

#### Package way to do t-fitting

In [28]:
from scipy.stats import t

# t =  x - mu / sqrt(unbiased_var/n)

In [27]:
nu, mu, sigma = t.fit(x)
mu, sigma, nu

(np.float64(0.04594038004735414),
 np.float64(0.04544287220830122),
 np.float64(6.336866997308613))

In [17]:
error_epsilon = 1e-5
test_vals = pd.read_csv("testfiles/data/testout7_2.csv")
assert abs(mu - test_vals.loc[0, 'mu']) < error_epsilon, "Output mu and test mu do not match"
assert abs(sigma - test_vals.loc[0, 'sigma']) < error_epsilon, 'Output sigma and test sigma do not match'
assert abs(nu - test_vals.loc[0, 'nu']) < error_epsilon, 'Output nu and test nu do not match'

## Test 7.3: T-regression

In [18]:
from statsmodels.miscmodels.tmodel import TLinearModel
import statsmodels.api as sm

In [19]:
x = pd.read_csv("testfiles/data/test7_3.csv")

In [20]:
def t_regression(X: pd.DataFrame, y:pd.Series, add_constant:bool = True, print_summary = False):

    if add_constant:
        X = sm.add_constant(X)
    
    model = TLinearModel(y, X)
    result = model.fit()

    alpha = result.params[0]
    betas = result.params[1:-2]
    nu = result.params[-2]
    sigma = result.params[-1]
    mu = 0.0 # mean is 0 if we included a constant (intercept term)

    if print_summary:
        result.summary()

    return alpha, betas, mu, sigma, nu

In [21]:
X, y = x.loc[:, ("x1", "x2", "x3")], x.loc[:, ("y")]

alpha, betas, mu, sigma, nu = t_regression(X, y, add_constant=True)

running Tmodel initialize
Optimization terminated successfully.
         Current function value: -1.378814
         Iterations: 281
         Function evaluations: 451


In [22]:
error_epsilon = 1e-4
test_vals = pd.read_csv("testfiles/data/testout7_3.csv")
assert abs(mu - test_vals.loc[0, 'mu']) < error_epsilon, "Output mu and test mu do not match"
assert abs(sigma - test_vals.loc[0, 'sigma']) < error_epsilon, 'Output sigma and test sigma do not match'
assert abs(nu - test_vals.loc[0, 'nu']) < error_epsilon, 'Output nu and test nu do not match'
assert abs(alpha - test_vals.loc[0, 'Alpha']) < error_epsilon, "Output mu and test mu do not match"
for i in range(0,len(betas)):
    assert abs(betas[i] - test_vals.loc[0, f'B{i+1}']) < error_epsilon, f'Output B{i+1} and test B{i+1} do not match'