In [None]:
import numpy as np
import scipy.stats as stats

# 샘플 데이터
x = np.array([10,20,30,40,50])
y = np.array([5,15,25,35,48])

In [None]:
corr_coeff, p_value = stats.pearsonr(x,y)

print(f'피어슨 상관계수 (r): {corr_coeff:.4f}')
print(f'p-value: {p_value:.4f}')

피어슨 상관계수 (r): 0.9984
p-value: 0.0001


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# 1. Iris 데이터 로드
df_iris = load_iris()

# 2. pandas DataFrame으로 변환
iris = pd.DataFrame(data=df_iris.data, columns = df_iris.feature_names)
iris.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'] # 칼럼명 변경시

# 3. target 추가
iris['species'] = df_iris.target

# 4. 타겟 라벨을 실제 이름으로 변환 (0: setosa, 1: versicolor, 2: virginica)
iris['species'] = iris['species'].map({0:'setosa', 1:'versicolor', 2:'virginica'})

In [None]:
# 시험 때 활용해야 하는 것 -> Formula API 활용

import statsmodels.api as sm
import statsmodels.formula.api as smf

model = smf.ols("Petal_Length ~ Petal_Width + Sepal_Length", data=iris).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           Petal_Length   R-squared:                       0.949
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     1354.
Date:                Sat, 07 Jun 2025   Prob (F-statistic):           2.01e-95
Time:                        03:40:05   Log-Likelihood:                -75.090
No. Observations:                 150   AIC:                             156.2
Df Residuals:                     147   BIC:                             165.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -1.5071      0.337     -4.473   

In [None]:
model = smf.glm('Petal_Length ~ Petal_Width + Sepal_Length', family = sm.families.Gaussian(), data=iris).fit()

In [None]:
model = smf.ols('Petal_Length ~ Petal_Width + Sepal_Length + C(species)', data=iris).fit()

In [None]:
# 방법 2 : 행렬 활용(시험 활용X)

import statsmodels.api as sm

X = iris[['Petal_Width', 'Sepal_Length']]
y = iris['Petal_Length']

X = sm.add_constant(X)

# 다중회귀 분석 모델 적합 (train 데이터 활용)
model = sm.OLS(y, X).fit()

# 회귀계수 출력
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           Petal_Length   R-squared:                       0.949
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     1354.
Date:                Sat, 07 Jun 2025   Prob (F-statistic):           2.01e-95
Time:                        04:03:13   Log-Likelihood:                -75.090
No. Observations:                 150   AIC:                             156.2
Df Residuals:                     147   BIC:                             165.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -1.5071      0.337     -4.473   

In [None]:
model = smf.glm("Petal_Length ~ Petal_Width + Sepal_Length", family = sm.families.Gaussian(), data=iris).fit()

In [None]:
model = smf.glm("Petal_Length ~ Petal_Width + Sepal_Length", family = sm.families.Gaussian(), data=iris).fit()

# 독립 변수 (Petal_Width, Sepal_Length) + 범주형 변수 species 추가
X = iris[['Petal_Width', 'Sepal_Length', 'species']]

# 범주형 변수 species를 더미 변수로 변환 (setosa를 기준으로 drop)
X = pd.get_dummies(X, columns = ["species"], drop_first = True)
X = X.astype(float)
y= iris['Petal_Length']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [None]:
# 회귀계수 추출

coefficients = model.params[1:]
print('회귀계수가 가장 큰 변수: ', coefficients.idxmax())

회귀계수가 가장 큰 변수:  Petal_Width


In [None]:
t_values = model.tvalues
print("t-values:\n", t_values)

p_values = model.pvalues
print("p-values:\n", p_values)

t-values:
 const           -4.472752
Petal_Width     23.205443
Sepal_Length     7.819907
dtype: float64
p-values:
 const           1.535178e-05
Petal_Width     5.257543e-51
Sepal_Length    9.414477e-13
dtype: float64


In [None]:
print('tvalue가 가장 큰 변수:', np.abs(t_values).idxmax())

tvalue가 가장 큰 변수: Petal_Width


In [None]:
# 신뢰구간
conf_intervals = model.conf_int()
print("Confidence intervals:\n", conf_intervals)

conf_intervals_90 = model.conf_int(alpha = 0.1)
print("90% Confidence intervals:\n", conf_intervals_90)

Confidence intervals:
                      0         1
const        -2.173050 -0.841227
Petal_Width   1.599230  1.896976
Sepal_Length  0.405218  0.679294
90% Confidence intervals:
                      0         1
const        -2.064903 -0.949373
Petal_Width   1.623408  1.872798
Sepal_Length  0.427473  0.657038


In [1]:
# 더미 변수 함정

import pandas as pd
import statsmodels.api as sm

# 샘플 데이터 생성
data = {
    'color' : ['red', 'blue', 'green', 'red', 'green', 'red', 'green', 'blue', 'green', 'red'],
    'size' : [1,2,3,1,3,5,9,2,9,10],
    'price' : [10,20,30,10,30,55,29,10,25,12]
}
df = pd.DataFrame(data)

In [2]:
# 범주형 변수 더미 코딩 (drop_first = True로 기준 범주 제거)
df_dummies = pd.get_dummies(df, columns=['color'], drop_first = True) # drop_first=True 하면 다중공선성 문제 해결
print(df_dummies)

   size  price  color_green  color_red
0     1     10        False       True
1     2     20        False      False
2     3     30         True      False
3     1     10        False       True
4     3     30         True      False
5     5     55        False       True
6     9     29         True      False
7     2     10        False      False
8     9     25         True      False
9    10     12        False       True


In [3]:
# 종속 변수와 독립 변수 설정
X = df_dummies[['size', 'color_green', 'color_red']]
y = df_dummies['price']

X = X.astype(float)
y = y.astype(float)

# 상수항 추가
X = sm.add_constant(X)

# 다중회귀모델 적합
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                 -0.280
Method:                 Least Squares   F-statistic:                    0.3430
Date:                Sun, 08 Jun 2025   Prob (F-statistic):              0.796
Time:                        04:11:48   Log-Likelihood:                -39.360
No. Observations:                  10   AIC:                             86.72
Df Residuals:                       6   BIC:                             87.93
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          14.3994     11.800      1.220      

  return hypotest_fun_in(*args, **kwds)


In [4]:
import statsmodels.formula.api as smf

# 회귀 분석 공식 설정
formula = 'price ~ size  + C(color)'

# 회귀 모델 적합
model = smf.ols(formula, data = df).fit()

# 모델 요약 출력
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                 -0.280
Method:                 Least Squares   F-statistic:                    0.3430
Date:                Sun, 08 Jun 2025   Prob (F-statistic):              0.796
Time:                        04:28:31   Log-Likelihood:                -39.360
No. Observations:                  10   AIC:                             86.72
Df Residuals:                       6   BIC:                             87.93
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            14.3994     11.80

  return hypotest_fun_in(*args, **kwds)


In [12]:
# 모델 평가를 위해 붓꽃 데이터 다시 불러오기

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# 1. Iris 데이터 로드
df_iris = load_iris()

# 2. pandas DataFrame으로 변환
iris = pd.DataFrame(data=df_iris.data, columns = df_iris.feature_names)
iris.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'] # 칼럼명 변경시

# 3. target 추가
iris['species'] = df_iris.target

# 4. 타겟 라벨을 실제 이름으로 변환 (0: setosa, 1: versicolor, 2: virginica)
iris['species'] = iris['species'].map({0:'setosa', 1:'versicolor', 2:'virginica'})

# 시험 때 활용해야 하는 것 -> Formula API 활용

import statsmodels.api as sm
import statsmodels.formula.api as smf

model = smf.ols("Petal_Length ~ Petal_Width + Sepal_Length", data=iris).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           Petal_Length   R-squared:                       0.949
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     1354.
Date:                Sun, 08 Jun 2025   Prob (F-statistic):           2.01e-95
Time:                        04:52:51   Log-Likelihood:                -75.090
No. Observations:                 150   AIC:                             156.2
Df Residuals:                     147   BIC:                             165.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -1.5071      0.337     -4.473   

In [13]:
import numpy as np
print('R-squared: ', np.round(model.rsquared, 2))

R-squared:  0.95


In [14]:
print('Adj. R-squared: ', np.round(model.rsquared_adj, 2))

Adj. R-squared:  0.95


In [15]:
# 회귀모델의 유의성 검정(Fvalue, F_pvalue)

print('F-statistic: ', np.round(model.fvalue, 4))
print('Prob (F-statistic): ', np.round(model.f_pvalue, 4))

F-statistic:  1354.3397
Prob (F-statistic):  0.0


In [17]:
print('AIC', np.round(model.aic, 2))
print('BIC', np.round(model.bic, 2))

AIC 156.18
BIC 165.21


In [18]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

model1 = ols('Petal_Length ~ Petal_Width', data=iris).fit() # mod1
model2 = ols('Petal_Length ~ Petal_Width + Sepal_Length + Sepal_Width', data=iris).fit()  # mod2

table = sm.stats.anova_lm(model1, model2)   # Reduced 모델이 첫번째, Full 모델이 두번째 !!중요!!
print(table)

   df_resid        ssr  df_diff    ss_diff          F        Pr(>F)
0     148.0  33.844753      0.0        NaN        NaN           NaN
1     146.0  14.852948      2.0  18.991805  93.341859  7.752746e-27


In [21]:
# 독립성 가정 체크
#1
dw_stat = model.summary().tables[2].data[0][3]
print(f'Durbin-Watson statistic: {dw_stat}')
#2
from statsmodels.stats.stattools import durbin_watson
dw_stat = durbin_watson(model.resid)
print(dw_stat)

Durbin-Watson statistic:    1.339
1.3391185441384643


In [22]:
# 정규성 가정 체크

# 잔차 계산
residuals = model.resid

# 엔더슨-달링 테스트
from scipy.stats import anderson
ad_stat, ad_critical_values, ad_significance_level = anderson(residuals, dist='norm')

print(f'Anderson-Darling Test Statistic: {ad_stat}')
print(f'Critical Values: {ad_critical_values}')
print(f'Significance Levels: {ad_significance_level}')

Anderson-Darling Test Statistic: 0.2799444785279661
Critical Values: [0.562 0.64  0.767 0.895 1.065]
Significance Levels: [15.  10.   5.   2.5  1. ]


In [23]:
# 샤피로-윌크 테스트
from scipy.stats import shapiro
sw_stat, sw_p_value = shapiro(residuals)

print(f'Shapiro-Wilk Test Statistic: {sw_stat}')
print(f'p-value: {sw_p_value}')

Shapiro-Wilk Test Statistic: 0.9932794545480763
p-value: 0.7114249301711132


In [24]:
# 등분산성 가정 체크

from statsmodels.stats.diagnostic import het_breuschpagan
bptest = het_breuschpagan(model.resid, model.model.exog)

print('BP-test statistics: ', bptest[0])
print('p-value:', bptest[1])

BP-test statistics:  6.20946286228456
p-value: 0.044836558645737955


In [25]:
# 다중공선성 체크

from statsmodels.stats.outliers_influence import variance_inflation_factor
X=iris[['Petal_Width', 'Sepal_Length']]

# VIF 계산
vif_data = pd.DataFrame()
vif_data['Variable'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)

       Variable       VIF
0   Petal_Width  5.150649
1  Sepal_Length  5.150649


In [27]:
# 예측 (기존에 적합했던 모델을 활용하여 새로운 데이터에 대한 예측 하)

from sklearn.metrics import mean_squared_error
import statsmodels.formula.api as smf
models = smf.ols('Petal_Length ~ Petal_Width + Sepal_Length + C(species)', data=iris).fit()

In [29]:
# 새로운 데이터 생성 (행 5개)
new_data = pd.DataFrame({
    'Petal_Width' : [0.2, 1.5, 1.3, 2.1, 1.8],
    'Sepal_Length' : [4.9, 5.5, 6.1 ,6.7, 7.2],
    'species' : ['setosa', 'versicolor', 'virginica', 'versicolor', 'virginica']
})

In [33]:
# 예측값 계산
y_pred = model.predict(new_data)

# 실제값과 비교할 y_true 생성 (예제 값 사용)
y_true = np.array([1.4, 4.7, 5.1, 5.8, 6.3])

# MSE 계산
mse_score = mean_squared_error(y_true, y_pred)

# 결과 출력
print("예측값:\n", y_pred)
print(f'MSE : {mse_score:.4f}')

예측값:
 0    1.499535
1    4.097422
2    4.073155
3    5.796990
4    5.543687
dtype: float64
MSE : 0.3999
