In [1]:
import numpy as np
import pandas as pd
from scipy.stats import f as Fisher, chi2 as ChiSquare
from math import floor

In [2]:
def boxTest_CovarianceMatrices(populations, significant_level = 0.01):
    """Run Box's M test for equality of covariance matrices and print the result.

    The statistic is ``C = (1 - u) * M`` where

    * ``M = sum_i (n_i - 1) * (log|S_pooled| - log|S_i|)``
    * ``u = (sum_i 1/(n_i - 1) - 1/sum_i (n_i - 1))
            * (2p^2 + 3p - 1) / (6 (p + 1) (g - 1))``

    and ``C`` is compared with the upper ``significant_level`` quantile of a
    chi-square distribution with ``p (p + 1) (g - 1) / 2`` degrees of freedom.

    Parameters
    ----------
    populations : sequence of 2-D array-likes
        One ``(n_i, p)`` sample per population (rows are observations).
    significant_level : float, default 0.01
        Upper-tail probability used for the chi-square critical value.

    Returns
    -------
    None
        The statistic, degrees of freedom, critical value and conclusion are
        printed to stdout.

    Raises
    ------
    ValueError
        If fewer than two populations are given, the samples do not share the
        same number of columns, a sample has no more rows than columns, or a
        covariance matrix is not positive definite (its determinant is not
        strictly positive).
    """
    g = len(populations)
    if g < 2:
        raise ValueError("Box's M test needs at least two populations")

    samples = [np.asarray(population, dtype = float) for population in populations]
    p = samples[0].shape[1]
    if any(sample.ndim != 2 or sample.shape[1] != p for sample in samples):
        raise ValueError('all populations must be 2-D with the same number of columns')
    if any(sample.shape[0] <= p for sample in samples):
        raise ValueError('each population needs more observations than variables')

    def _log_det(matrix):
        # slogdet works on the log scale, so badly scaled data cannot overflow
        # or underflow the determinant the way log(det(...)) does.
        sign, log_abs_det = np.linalg.slogdet(matrix)
        if not sign > 0:
            raise ValueError('covariance matrix is singular or not positive definite')
        return log_abs_det

    # atleast_2d keeps the p == 1 case (where np.cov returns a scalar) working.
    covariances = [
        np.atleast_2d(np.cov(sample, rowvar = False, ddof = 1)) for sample in samples
    ]
    dofs = np.array([sample.shape[0] - 1 for sample in samples])
    total_dof = dofs.sum()

    S_pooled = sum(dof * S for dof, S in zip(dofs, covariances)) / total_dof
    log_det_pooled = _log_det(S_pooled)

    M = sum(
        dof * (log_det_pooled - _log_det(S)) for dof, S in zip(dofs, covariances)
    )
    u = (
        np.sum(1/dofs) - 1/total_dof
    ) * (2*p**2 + 3*p - 1)/(6 * (p + 1) * (g - 1))
    C = (1 - u)*M
    # p * (p + 1) is always even, so integer division is exact.
    v = p * (p + 1) * (g - 1) // 2
    critical_value = ChiSquare.ppf(1 - significant_level, df = v)

    print(f'>> Statistic = {C}')
    print(f'>> Chi Square degree of freedom = {v}')
    print(f'>> Critical value = {critical_value}')

    if C > critical_value:
        print('>> Conclusion : Reject H_0')
    else:
        print('>> Conclusion : Fail to reject H_0')

In [3]:
def meanVec(X):
    """Return the column means of ``X`` as a column vector.

    ``X`` is expected to be a 2-D array with one observation per row; the
    result is ``(1/n) * X^T 1`` and therefore has shape ``(X.shape[1], 1)``.
    """
    n_rows = X.shape[0]
    ones_column = np.ones((n_rows, 1))
    weight = 1 / n_rows
    return weight * np.dot(X.T, ones_column)

def covarianceMat(X):
    """Return the unbiased sample covariance matrix of ``X``.

    ``X`` is expected to be a 2-D array with one observation per row. The
    result equals ``1/(n - 1) * X^T (I - 11^T/n) X`` but is computed by
    centring the columns directly, so no ``n x n`` centering matrix is built
    (memory stays proportional to the size of ``X``).

    Raises
    ------
    ZeroDivisionError
        If ``X`` has exactly one row, because the ``n - 1`` divisor is zero.
    """
    number_samples = X.shape[0]
    # Computed with Python ints first so a single-row input still raises
    # ZeroDivisionError instead of silently returning NaNs.
    scale = 1/(number_samples - 1)
    centered = X - X.mean(axis = 0)
    return scale * centered.T.dot(centered)

In [4]:
# Columns in the file are separated by two spaces; a multi-character separator
# needs the Python parsing engine.
data = pd.read_csv('T6-10.csv', sep = '  ', engine = 'python')

In [5]:
# Preview the first rows to check that the file was parsed into columns.
data.head()

Unnamed: 0,x1,x2,x3,Truck_type
0,16.44,12.43,11.23,gasoline
1,7.19,2.7,3.92,gasoline
2,9.92,1.35,9.75,gasoline
3,4.24,5.78,7.78,gasoline
4,11.2,5.05,10.67,gasoline


In [6]:
# Split the rows by truck type and drop the last column from each subset.
is_gasoline = data['Truck_type'] == 'gasoline'
is_diesel = data['Truck_type'] == 'diesel'
data_gasoline = data.loc[is_gasoline].iloc[:, :-1]
data_diesel = data.loc[is_diesel].iloc[:, :-1]

In [7]:
# Renumber the rows of each subset from 0, discarding the old index labels.
data_gasoline = data_gasoline.reset_index(drop = True)
data_diesel = data_diesel.reset_index(drop = True)

## (a)

In [8]:
# Convert both subsets to plain NumPy arrays for the matrix computations.
X_gasoline, X_diesel = (
    frame.to_numpy() for frame in (data_gasoline, data_diesel)
)

In [9]:
# Sample mean vector and sample covariance matrix of each group.
mu_gasoline = meanVec(X_gasoline)
S_gasoline = covarianceMat(X_gasoline)
mu_diesel = meanVec(X_diesel)
S_diesel = covarianceMat(X_diesel)

In [10]:
# Compare the two groups' covariance matrices at the 1% level.
boxTest_CovarianceMatrices([X_gasoline, X_diesel], 0.01)

>> Statistic = 30.54428359843359
>> Chi Square degree of freedom = 6
>> Critical value = 16.811893829770927
>> Conclusion : Reject H_0


In [11]:
# Number of gasoline observations minus number of variables.
len(X_gasoline) - X_gasoline.shape[1]

33

In [12]:
# Number of diesel observations minus number of variables.
len(X_diesel) - X_diesel.shape[1]

20

In [13]:
# Quadratic form d' (S1/n1 + S2/n2)^{-1} d in the difference of mean vectors d,
# using the unpooled combination of the two sample covariance matrices.
mean_diff = mu_gasoline - mu_diesel
S_combined = 1/X_gasoline.shape[0] * S_gasoline + 1/X_diesel.shape[0] * S_diesel
S_combined_inv = np.linalg.inv(S_combined)
statistic = mean_diff.T.dot(S_combined_inv).dot(mean_diff).item()

In [14]:
# Upper-alpha chi-square quantile with one degree of freedom per variable.
alpha = 0.01
n_variables = X_gasoline.shape[1]
critical_value = ChiSquare.ppf(1 - alpha, df = n_variables)

In [15]:
# Show the value of the test statistic computed above.
statistic

43.176394993631774

In [16]:
# Show the chi-square critical value to compare against the statistic.
critical_value

11.344866730144373

In [17]:
# Sample covariance matrix of the gasoline group.
S_gasoline

array([[23.01336087, 12.366395  ,  2.90660897],
       [12.366395  , 17.54411071,  4.77308214],
       [ 2.90660897,  4.77308214, 13.96333421]])

In [18]:
# Sample covariance matrix of the diesel group.
S_diesel

array([[ 4.3623166 ,  0.75988715,  2.36209921],
       [ 0.75988715, 25.85123597,  7.68573221],
       [ 2.36209921,  7.68573221, 46.6543996 ]])

In [19]:
# Sample mean vector of the gasoline group (column vector).
mu_gasoline

array([[12.21861111],
       [ 8.1125    ],
       [ 9.59027778]])

In [20]:
# Sample mean vector of the diesel group (column vector).
mu_diesel

array([[10.10565217],
       [10.76217391],
       [18.16782609]])

## (b)

In [21]:
# Coefficient vector (S1/n1 + S2/n2)^{-1} (mean_gasoline - mean_diesel).
mean_diff = mu_gasoline - mu_diesel
S_combined = 1/X_gasoline.shape[0] * S_gasoline + 1/X_diesel.shape[0] * S_diesel
np.linalg.inv(S_combined).dot(mean_diff)

array([[ 4.04434873],
       [-1.55960644],
       [-3.5556085 ]])

## (c)

In [22]:
# Chi-square based intervals a'd +/- sqrt(chi2_p(alpha)) * sqrt(a'(S1/n1 + S2/n2)a)
# for each unit vector a, i.e. one interval per component of the mean difference d.
alpha = 0.01
p = 3
mean_diff = mu_gasoline - mu_diesel
S_combined = 1/X_gasoline.shape[0] * S_gasoline + 1/X_diesel.shape[0] * S_diesel
chi_radius = np.sqrt(ChiSquare.ppf(1 - alpha, df = p))
for a in np.eye(3):
    a = a.reshape((3, 1))
    epsilon = chi_radius * np.sqrt(a.T.dot(S_combined).dot(a).item())
    center = a.T.dot(mean_diff).item()
    lower = center - epsilon
    upper = center + epsilon
    print(a.T)
    print(lower, upper)

[[1. 0. 0.]]
-0.9536441646236367 5.179562039019771
[[0. 1. 0.]]
-6.925187911617055 1.6258400855301005
[[0. 0. 1.]]
-13.813277229023075 -3.3418193893344172


## (d)

In [23]:
# v = (p + p^2) / sum_i (1/n_i) * { tr[(S_i W / n_i)^2] + (tr[S_i W / n_i])^2 },
# with W = (S1/n1 + S2/n2)^{-1}; the result is truncated to an integer.
# `p` is the number of variables set in an earlier cell.
S_combined = 1/X_gasoline.shape[0] * S_gasoline + 1/X_diesel.shape[0] * S_diesel
common = np.linalg.inv(S_combined)
sizes_and_covariances = (
    (X_gasoline.shape[0], S_gasoline),
    (X_diesel.shape[0], S_diesel),
)
denom_v = 0
for n, S in sizes_and_covariances:
    scaled = 1/n * S.dot(common)
    denom_v += 1/n * (np.trace(scaled.dot(scaled)) + np.trace(scaled)**2)
v = int((p + p**2)/denom_v)

In [24]:
# Quadratic form d' (S1/n1 + S2/n2)^{-1} d in the difference of mean vectors d.
mean_diff = mu_gasoline - mu_diesel
S_combined = 1/X_gasoline.shape[0] * S_gasoline + 1/X_diesel.shape[0] * S_diesel
S_combined_inv = np.linalg.inv(S_combined)
statistic = mean_diff.T.dot(S_combined_inv).dot(mean_diff).item()

In [25]:
# Scaled F critical value: v*p/(v - p + 1) * F_{p, v - p + 1}(alpha).
alpha = 0.01
denominator_df = v - p + 1
critical_value = v*p/denominator_df * Fisher.ppf(1 - alpha, dfn = p, dfd = denominator_df)

In [26]:
# Show the test statistic (same quadratic form as in part (a)).
statistic

43.176394993631774

In [27]:
# Show the F-based critical value to compare against the statistic.
critical_value

13.414133109155795

## Repeat the analysis after removing gasoline rows 8 and 20

In [28]:
# Drop the gasoline rows labelled 8 and 20, then renumber the remaining rows.
rows_to_remove = [8, 20]
data_gasoline_remove = data_gasoline.drop(index = rows_to_remove).reset_index(drop = True)

In [29]:
# Rebuild the arrays; X_gasoline now comes from the reduced gasoline data.
X_gasoline, X_diesel = (
    frame.to_numpy() for frame in (data_gasoline_remove, data_diesel)
)

In [30]:
# Recompute the sample mean vectors and covariance matrices from the current arrays.
mu_gasoline = meanVec(X_gasoline)
S_gasoline = covarianceMat(X_gasoline)
mu_diesel = meanVec(X_diesel)
S_diesel = covarianceMat(X_diesel)

In [31]:
# Re-run the covariance-equality test on the current arrays at the 1% level.
boxTest_CovarianceMatrices([X_gasoline, X_diesel], 0.01)

>> Statistic = 21.162209248337017
>> Chi Square degree of freedom = 6
>> Critical value = 16.811893829770927
>> Conclusion : Reject H_0


In [32]:
# Sample covariance matrix of the gasoline group after the row removal.
S_gasoline

array([[ 9.02502103,  5.1557492 ,  3.20167059],
       [ 5.1557492 , 14.25869412,  4.31894492],
       [ 3.20167059,  4.31894492, 11.98734403]])

In [33]:
# Sample covariance matrix of the diesel group.
S_diesel

array([[ 4.3623166 ,  0.75988715,  2.36209921],
       [ 0.75988715, 25.85123597,  7.68573221],
       [ 2.36209921,  7.68573221, 46.6543996 ]])

In [34]:
# Number of remaining gasoline observations minus number of variables.
len(X_gasoline) - X_gasoline.shape[1]

31

In [35]:
# Number of diesel observations minus number of variables.
len(X_diesel) - X_diesel.shape[1]

20

In [36]:
# Quadratic form d' (S1/n1 + S2/n2)^{-1} d in the difference of mean vectors d,
# evaluated with the current (reduced) gasoline sample.
mean_diff = mu_gasoline - mu_diesel
S_combined = 1/X_gasoline.shape[0] * S_gasoline + 1/X_diesel.shape[0] * S_diesel
S_combined_inv = np.linalg.inv(S_combined)
statistic = mean_diff.T.dot(S_combined_inv).dot(mean_diff).item()

In [37]:
# v = (p + p^2) / sum_i (1/n_i) * { tr[(S_i W / n_i)^2] + (tr[S_i W / n_i])^2 },
# with W = (S1/n1 + S2/n2)^{-1}; the result is truncated to an integer.
# `p` is the number of variables set in an earlier cell.
S_combined = 1/X_gasoline.shape[0] * S_gasoline + 1/X_diesel.shape[0] * S_diesel
common = np.linalg.inv(S_combined)
sizes_and_covariances = (
    (X_gasoline.shape[0], S_gasoline),
    (X_diesel.shape[0], S_diesel),
)
denom_v = 0
for n, S in sizes_and_covariances:
    scaled = 1/n * S.dot(common)
    denom_v += 1/n * (np.trace(scaled.dot(scaled)) + np.trace(scaled)**2)
v = int((p + p**2)/denom_v)

In [38]:
# Scaled F critical value: v*p/(v - p + 1) * F_{p, v - p + 1}(alpha).
alpha = 0.01
denominator_df = v - p + 1
critical_value = v*p/denominator_df * Fisher.ppf(1 - alpha, dfn = p, dfd = denominator_df)

In [39]:
# Show the test statistic for the reduced data.
statistic

42.63735545790153

In [40]:
# Show the F-based critical value for the reduced data.
critical_value

13.647813885864254

In [41]:
# Chi-square critical value with one degree of freedom per variable
# (this overwrites the F-based value stored in critical_value).
n_variables = X_gasoline.shape[1]
critical_value = ChiSquare.ppf(1 - alpha, df = n_variables)

In [42]:
# Show the chi-square critical value.
critical_value

11.344866730144373