In [1]:
import numpy as np
import pandas as pd
import warnings
from scipy.stats import f, t, chi2
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('t-6.12',sep="\s", header=None)
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8
45,0.31,,4.77,,1.97,,30.4,,female
46,0.27,,5.16,,2.03,,39.46,,female
47,0.66,,11.05,,2.32,,39.34,,female
48,0.37,,5.23,,2.48,,34.86,,female
49,0.35,,5.37,,2.25,,35.07,,female


In [3]:
df=df.dropna(axis=1)
df.head()

Unnamed: 0,0,2,4,6,8
0,0.34,3.71,2.87,30.87,male
1,0.39,5.08,3.38,43.85,male
2,0.48,5.13,4.13,44.51,male
3,0.31,3.95,3.6,46.0,male
4,0.36,5.51,3.11,47.02,male


In [4]:
df.rename(columns={0:'RestVolO2(L/min)',
                   2:'RestVolO2(mL/kg/min)',
                   4:'MaxVolO2(L/min)',
                    6: 'MaxVolO2(mL/kg/min)',
                  8:'Sex'}, 
                 inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   RestVolO2(L/min)      50 non-null     float64
 1   RestVolO2(mL/kg/min)  50 non-null     float64
 2   MaxVolO2(L/min)       50 non-null     float64
 3   MaxVolO2(mL/kg/min)   50 non-null     float64
 4   Sex                   50 non-null     object 
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
X_male = df[df['Sex'] == 'male'].to_numpy()
X_female = df[df['Sex'] == 'female'].to_numpy()

In [7]:
X_male = X_male[:,0:-1]
X_female = X_female[:,0:-1]

In [8]:
mu_male = X_male.mean(axis=0)
mu_female = X_female.mean(axis=0)

In [9]:
S1 = np.cov(X_male.T.astype(float))
S2 = np.cov(X_female.T.astype(float))

In [10]:
S1

array([[7.12100000e-03, 7.00030000e-02, 3.14471667e-02, 1.50580333e-01],
       [7.00030000e-02, 1.14417900e+00, 1.47678167e-01, 3.43090850e+00],
       [3.14471667e-02, 1.47678167e-01, 4.55877333e-01, 3.30812183e+00],
       [1.50580333e-01, 3.43090850e+00, 3.30812183e+00, 5.52521457e+01]])

In [11]:
S2

array([[ 9.73233333e-03,  1.54087833e-01,  4.16800000e-03,
         2.97570000e-02],
       [ 1.54087833e-01,  2.78066100e+00, -3.94476667e-02,
         1.28069767e+00],
       [ 4.16800000e-03, -3.94476667e-02,  1.20509333e-01,
         1.09814900e+00],
       [ 2.97570000e-02,  1.28069767e+00,  1.09814900e+00,
         2.32608260e+01]])

In [12]:
S_pooled = 24/48 * S1 + 24/48 * S2

   Gỉa thuyết thống kê:

   $H_{0}: \Sigma_{1} = \Sigma_{2}$
   
   $H_{1}: \Sigma_{1} \neq \Sigma_{2}$

   Mức ý nghĩa $\alpha$ = 0.05

$C = (1-u)\left\{\left[ \sum_{l} (n_{l} - 1)\right]\ln|\textbf{S}_{pooled}| - \sum_{l}\left[(n_{l}-1)\ln|\textbf{S}_{l}|\right]\right\}$

trong đó

   v = $\frac{1}{2}p(p+1)(g-1)$
   
và

   u = $\left[\displaystyle \sum_{l}\dfrac{1}{(n_l - 1)} - \dfrac{1}{\displaystyle \sum_{l}(n_l - 1)}\right]\dfrac{2p^2 + 3p - 1}{6(p + 1)(g - 1)}$


In [13]:
p, g = 4, 2
n1 = n2 = 25
u = (1/(n1-1) + 1/(n2-1) - 1/(n1+n2-2))*(2*p**2 + 3*p - 1)/(6*(p+1)*(g-1))
v = (1/2)*p*(p+1)*(g-1)

In [14]:
C = (1-u)*((n1+n2-2)*np.log(np.linalg.det(S_pooled)) - (n1-1)*np.log(np.linalg.det(S1)) - (n2-1)*np.log(np.linalg.det(S2)))

$\chi^2_{p(p+1)(g - 1)/2}(\alpha)$  = $\chi^2_{10}(0.05) = $

In [15]:
chi2.ppf(1-0.05,10)

18.307038053275146

Kiểm định giả thuyết:

$H_{0}: \mu_{1} = \mu_{2}$

$H_{1}: \mu_{1} \neq \mu_{2}$

Ta có trung bình mẫu

In [16]:
print(mu_male)
print(mu_female)

[0.39720000000000005 5.329599999999999 3.6875999999999998 49.4204]
[0.31359999999999993 5.1788 2.3152 38.1548]


phép kiểm định $T^{2}$ cho thống kê:

$T^{2} = (\bar{\textbf{X}}_{1} - \bar{\textbf{X}}_{2} - (\mu_{1} - \mu_{2}))^{T}\left[\dfrac{1}{n_{1}}\textbf{S}_{1} + \dfrac{1}{n_{2}}\textbf{S}_{2}\right]^{-1}(\bar{\textbf{X}}_{1} - \bar{\textbf{X}}_{2} - (\mu_{1} - \mu_{2}))\quad \tag{6-27}$

In [17]:
factor_S = 1/25 * S1 + 1/25 * S2

In [18]:
factor_S_inv = np.linalg.inv(factor_S)

In [19]:
T2 = (mu_male - mu_female).T.dot(factor_S_inv).dot(mu_male - mu_female)

In [20]:
T2

96.37322129760007

và điểm tới hạn:
$\dfrac{vp}{v-p+1}F_{p,v-p+1}$

trong đó:
$v = \dfrac{p+p^{2}}{\sum_{i}\frac{1}{n_{i}}\left\{tr\left[\left(\frac{1}{n_{i}}\textbf{S}_{i}\left(\frac{1}{n_{1}}\textbf{S}_{1}+\frac{1}{n_{2}}\textbf{S}_{2}\right)^{-1}\right)^{2}\right] + \left(tr\left[\frac{1}{n_{i}}\textbf{S}_{i}\left(\frac{1}{n_{1}}\textbf{S}_{1}+\frac{1}{n_{2}}\textbf{S}_{2}\right)^{-1}\right]\right)^{2}\right\}}$

A = $\dfrac{1}{n1}\{tr\left[(\frac{1}{n1}S_{1}(\frac{1}{n1}S_{1} + \frac{1}{n2}S_{2})^{-1})^{2}\right] + (tr\left[\frac{1}{n1}S_{1}(\frac{1}{n1}S_{1} + \frac{1}{n2}S_{2})^{-1}\right])^{2}\}$

B = $\dfrac{1}{n2}\{tr\left[(\frac{1}{n2}S_{2}(\frac{1}{n}S_{1} + \frac{1}{n2}S_{2})^{-1})^{2}\right] + (tr\left[\frac{1}{n2}S_{2}(\frac{1}{n1}S_{1} + \frac{1}{n2}S_{2})^{-1}\right])^{2}\}$

In [21]:
A = 1/n1 * S1.dot(factor_S_inv)

In [22]:
first = np.trace(A.dot(A))

In [23]:
second = np.trace(A)**2

In [24]:
term1 = 1/n1 * (first + second)

In [25]:
term1

0.29038629997409465

In [26]:
B = 1/n2 * S2.dot(factor_S_inv)

In [27]:
first = np.trace(B.dot(B))

In [28]:
second = np.trace(B)**2

In [29]:
term2 = 1/n2 * (first + second)

In [30]:
denom = term1 + term2
print(denom)

0.4419619174583903


In [31]:
v = (p + p**2)/denom
print(v)

45.25276773848496


$\dfrac{vp}{v-p+1}F_{p,v-p+1}$ = $\dfrac{45.253\times 4}{45.253-4+1}F_{4,45.253-4+1}(0.05) \approx 11.11$

In [32]:
(v*4)/(v - 4 + 1)*f.ppf(q = 1-0.05, dfn = 4, dfd = v -4+1)

11.107851380093228

$\hat{\textbf{a}} \propto \left(\dfrac{1}{n_1}\textbf{S}_1 + \dfrac{1}{n_2}\textbf{S}_2\right)^{-1}(\bar{\textbf{x}}_1 - \bar{\textbf{x}}_2)$ 

In [33]:
a_hat = factor_S_inv.dot(mu_male - mu_female)

In [34]:
print(a_hat)

[1242.4872248315296 -79.699985037989 -77.85176004070843 9.88529705699483]


$c^{2} = \dfrac{(n_{1} + n_{2} - 2)p}{(n_{1} + n_{2} - p - 1)}F_{p, n_{1}+n_{2}-p-1}(\alpha)$

In [35]:
c2 = (n1+n2-2)*p/(n1+n2-p-1)*f.ppf(1-0.05, dfn=p, dfd=n1+n2-p-1)
print(c2)

11.002620519729316


Khoảng tin cậy đồng thời:

$(\bar{X}_{1i} - \bar{X}_{2i}) \pm c\sqrt{\left(\frac{1}{n_{1}} + \frac{1}{n_{2}}\right)s_{ii,pooled}}, i=1,2,...,p$

In [36]:
lower = [(mu_male[i] - mu_female[i]) - np.sqrt(c2)*np.sqrt((1/n1+1/n2) * S_pooled[i,i]) for i in range(p)]

In [37]:
upper = [(mu_male[i] - mu_female[i]) + np.sqrt(c2)*np.sqrt((1/n1+1/n2) * S_pooled[i,i]) for i in range(p)]

In [38]:
lower = np.array(lower).reshape(-1,1)
upper = np.array(upper).reshape(-1,1)
np.hstack((lower,upper))

array([[-2.52336063e-03,  1.69723361e-01],
       [-1.16348346e+00,  1.46508346e+00],
       [ 8.68742824e-01,  1.87605718e+00],
       [ 5.38734028e+00,  1.71438597e+01]])

Khoảng tin cậy Bonferroni:

In [39]:
lower = [(mu_male[i] - mu_female[i]) - t.ppf(1-(0.05/(2*p)), n1)*np.sqrt(S_pooled[i,i]/n1) for i in range(p)]

In [40]:
upper = [(mu_male[i] - mu_female[i]) + t.ppf(1-(0.05/(2*p)), n1)*np.sqrt(S_pooled[i,i]/n1) for i in range(p)]

In [41]:
lower = np.array(lower).reshape(-1,1)
upper = np.array(upper).reshape(-1,1)

In [42]:
np.hstack((lower,upper))

array([[ 0.03418468,  0.13301532],
       [-0.60330125,  0.90490125],
       [ 1.08341477,  1.66138523],
       [ 7.89280932, 14.63839068]])