In [1]:
# Compare the results of three linear-regression APIs:
# from sklearn.linear_model import LinearRegression
# import statsmodels.api as sm (OLS)
# from statsmodels.formula.api import ols

In [33]:
import numpy as np
import pandas as pd

# Fix the global NumPy seed so the synthetic data set is reproducible.
np.random.seed(1234)
n_samples = 100

# Draw the three predictors one after another (draw order: x1, x2, x3).
x1, x2, x3 = (np.random.randn(n_samples) for _ in range(3))

# Arbitrary "true" coefficients: slopes a, b, c and intercept d.
a = 2.0
b = -1.5
c = 3.0
d = 5.0

# Gaussian noise (standard normal scaled by 0.5), drawn after the predictors.
noise = 0.5 * np.random.randn(n_samples)

# Dependent variable: linear combination of the predictors + intercept + noise.
y0 = a * x1 + b * x2 + c * x3 + d + noise

# Assemble predictors and response column-wise into one frame.
df = pd.DataFrame(
    np.column_stack([x1, x2, x3, y0]),
    columns=['X1', 'X2', 'X3', 'Y'],
)
df.head()

Unnamed: 0,X1,X2,X3,Y
0,0.471435,0.291205,-0.319561,4.810901
1,-1.190976,0.566534,-0.619993,0.263825
2,1.432707,0.503592,0.156998,7.472249
3,-0.312652,0.285296,-0.571455,3.551282
4,-0.720589,0.484288,1.057633,5.134221


In [34]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the response column.
feature_cols = ['X1', 'X2', 'X3']
X, y = df[feature_cols], df['Y']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [35]:
################################ 1. scikit-learn: LinearRegression ################################

In [36]:
from sklearn.linear_model import LinearRegression

# Train the model on the training split (fit() returns the fitted estimator).
# To fit without an intercept use LinearRegression(fit_intercept=False).
model = LinearRegression().fit(X_train, y_train)

# Show the estimated intercept and slopes.
print("Intercept (절편):", model.intercept_)
print("Coefficients (계수):", model.coef_)

Intercept (절편): 5.052336995858368
Coefficients (계수): [ 1.96729623 -1.57951403  2.95907097]


In [37]:
# Predict the response for the test features with the fitted model.
y_pred = model.predict(X_test)
y_pred

array([ 3.12664969,  7.06984042, -0.93769385,  6.69679378, -2.1026261 ,
        2.52734463,  5.65414199,  1.69225091,  9.82235049,  6.56760218,
        1.92804274,  5.23218495,  8.6534034 ,  5.00881546,  6.62089209,
       13.21558057,  4.4790476 , -0.02011646,  4.87736446, -0.01729664])

In [38]:
# Evaluate the predictions against the true test responses.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Apply each metric in turn (MSE, MAE, R^2); RMSE would be np.sqrt(mse).
metric_fns = (mean_squared_error, mean_absolute_error, r2_score)
mse, mae, r2 = (score(y_test, y_pred) for score in metric_fns)

mse, mae, r2

(0.1407066479342696, 0.2928034528632188, 0.989423116757431)

In [39]:
################################ 2. statsmodels: sm.OLS with add_constant ################################

In [40]:
import statsmodels.api as sm

# Include an intercept: add_constant returns the training features with a
# leading "const" column (1.0 in every row shown), so that the OLS fit on
# X_const estimates an intercept term.
X_const = sm.add_constant(X_train)
X_const.head()

  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,const,X1,X2,X3
9,1.0,-2.242685,-1.281108,0.787965
51,1.0,-1.44581,0.796595,-1.712274
29,1.0,1.058969,0.67863,0.247112
88,1.0,0.270836,-0.414505,-0.354509
75,1.0,2.007843,-0.223019,0.983513


In [41]:
# Train the model: ordinary least squares of y_train on the design matrix
# that already carries the constant column.
model = sm.OLS(endog=y_train, exog=X_const).fit()
model.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.978
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,1109.0
Date:,"Wed, 04 Jun 2025",Prob (F-statistic):,1.26e-62
Time:,21:43:01,Log-Likelihood:,-56.317
No. Observations:,80,AIC:,120.6
Df Residuals:,76,BIC:,130.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.0523,0.058,86.414,0.000,4.936,5.169
X1,1.9673,0.062,31.827,0.000,1.844,2.090
X2,-1.5795,0.059,-26.839,0.000,-1.697,-1.462
X3,2.9591,0.062,47.842,0.000,2.836,3.082

0,1,2,3
Omnibus:,1.493,Durbin-Watson:,2.281
Prob(Omnibus):,0.474,Jarque-Bera (JB):,1.478
Skew:,0.314,Prob(JB):,0.478
Kurtosis:,2.78,Cond. No.,1.42


In [42]:
# Estimated coefficients, indexed by term name (const, X1, X2, X3).
model.params

const    5.052337
X1       1.967296
X2      -1.579514
X3       2.959071
dtype: float64

In [43]:
# p-value of each coefficient (the summary's P>|t| column, at full precision).
model.pvalues

const    1.218428e-77
X1       1.095671e-45
X2       1.624641e-40
X3       1.712198e-58
dtype: float64

In [44]:
# R-squared of the fit on the training data (shown rounded as 0.978 in the summary).
model.rsquared

0.9776677023867212

In [45]:
model.conf_int(alpha=0.05) # 95% confidence interval for each coefficient

Unnamed: 0,0,1
const,4.93589,5.168784
X1,1.844187,2.090405
X2,-1.696727,-1.462301
X3,2.835883,3.082259


In [46]:
# Predict on the test set.
# The model was fitted on a design matrix with an explicit "const" column, so
# the test features need the same column. has_constant='add' forces it to be
# added: with the default ('skip'), add_constant returns the data unchanged
# whenever a column already looks constant (e.g. when predicting for a single
# row), which would leave the intercept column out.
X_const_test = sm.add_constant(X_test, has_constant='add')
y_pred = model.predict(X_const_test)
y_pred

40     3.126650
35     7.069840
81    -0.937694
61     6.696794
98    -2.102626
68     2.527345
85     5.654142
27     1.692251
39     9.822350
42     6.567602
33     1.928043
59     5.232185
63     8.653403
94     5.008815
56     6.620892
87    13.215581
96     4.479048
1     -0.020116
71     4.877364
82    -0.017297
dtype: float64

In [47]:
# Confidence intervals and prediction intervals for the test rows
# (alpha=0.05 -> 95%). In the resulting frame the mean_ci_* columns bound the
# predicted mean and the obs_ci_* columns bound a new observation.
pred = model.get_prediction(X_const_test)
pred.summary_frame(alpha=0.05)

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
40,3.12665,0.085149,2.957061,3.296239,2.112718,4.140582
35,7.06984,0.078373,6.913746,7.225934,6.058078,8.081603
81,-0.937694,0.248427,-1.43248,-0.442908,-2.053091,0.177703
61,6.696794,0.069562,6.558249,6.835338,5.68759,7.705998
98,-2.102626,0.152984,-2.40732,-1.797932,-3.147679,-1.057573
68,2.527345,0.098231,2.3317,2.722989,1.508731,3.545959
85,5.654142,0.105721,5.443581,5.864703,4.632558,6.675726
27,1.692251,0.132312,1.428729,1.955773,0.658451,2.726051
39,9.82235,0.176392,9.471035,10.173666,8.762765,10.881936
42,6.567602,0.077212,6.41382,6.721384,5.556194,7.579011


In [48]:
# Evaluate the OLS predictions against the true test responses.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Apply each metric in turn (MSE, MAE, R^2).
metric_fns = (mean_squared_error, mean_absolute_error, r2_score)
mse, mae, r2 = (score(y_test, y_pred) for score in metric_fns)

mse, mae, r2

(0.14070664793426885, 0.29280345286321896, 0.989423116757431)

In [49]:
# Report both information criteria of the fitted model, AIC first.
for label, criterion in (("AIC :", model.aic), ("BIC :", model.bic)):
    print(label, criterion)

AIC : 120.63454906181968
BIC : 130.16265560051522


In [50]:
# Small-sample corrected AIC:  AICc = AIC + 2k(k + 1) / (n - k - 1)
n = model.nobs  # number of observations used in the fit
# Number of estimated regression parameters. k_constant is 1 when the model
# has an intercept and 0 otherwise, so k remains correct if the model is refit
# without a constant (a hard-coded "+ 1" would over-count in that case).
k = model.df_model + model.k_constant
aic = model.aic
aicc = aic + (2 * k * (k + 1)) / (n - k - 1)
aicc

121.16788239515301

In [51]:
################################ 3. statsmodels: formula API (ols) ################################

In [52]:
# Put the training predictors and the response (as column "y") into a single
# frame for the formula interface. assign() returns a new frame, so X_train
# itself is not modified.
df_ols = X_train.assign(y=y_train)
df_ols

Unnamed: 0,X1,X2,X3,y
9,-2.242685,-1.281108,0.787965,3.924334
51,-1.445810,0.796595,-1.712274,-3.474049
29,1.058969,0.678630,0.247112,6.629826
88,0.270836,-0.414505,-0.354509,4.721972
75,2.007843,-0.223019,0.983513,11.438556
...,...,...,...,...
76,0.226963,2.123692,0.023505,3.235071
53,-0.100918,-0.056696,0.126781,5.034431
38,0.841675,-1.735349,0.399325,10.550047
83,0.152631,-0.014752,1.541030,9.783804


In [53]:
from statsmodels.formula.api import ols

# The formula below yields an "Intercept" term in the fit without any extra
# constant column. To exclude the intercept, append "+ 0" (or "- 1") to the
# formula: 'y ~ X1 + X2 + X3 + 0'.
formula = 'y ~ X1 + X2 + X3'
model = ols(formula, data=df_ols).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.978
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,1109.0
Date:,"Wed, 04 Jun 2025",Prob (F-statistic):,1.26e-62
Time:,21:43:01,Log-Likelihood:,-56.317
No. Observations:,80,AIC:,120.6
Df Residuals:,76,BIC:,130.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.0523,0.058,86.414,0.000,4.936,5.169
X1,1.9673,0.062,31.827,0.000,1.844,2.090
X2,-1.5795,0.059,-26.839,0.000,-1.697,-1.462
X3,2.9591,0.062,47.842,0.000,2.836,3.082

0,1,2,3
Omnibus:,1.493,Durbin-Watson:,2.281
Prob(Omnibus):,0.474,Jarque-Bera (JB):,1.478
Skew:,0.314,Prob(JB):,0.478
Kurtosis:,2.78,Cond. No.,1.42


In [54]:
# Estimated coefficients, indexed by term name (Intercept, X1, X2, X3).
model.params

Intercept    5.052337
X1           1.967296
X2          -1.579514
X3           2.959071
dtype: float64

In [55]:
# p-value of each coefficient (the summary's P>|t| column, at full precision).
model.pvalues

Intercept    1.218428e-77
X1           1.095671e-45
X2           1.624641e-40
X3           1.712198e-58
dtype: float64

In [59]:
# Predict on the test set.
# The formula-based model is given X_test directly; the add_constant step used
# with sm.OLS (kept below for comparison) is not applied here.
# X_const_test = sm.add_constant(X_test) # add the intercept column to the evaluation set
y_pred = model.predict(X_test)
y_pred

40     3.126650
35     7.069840
81    -0.937694
61     6.696794
98    -2.102626
68     2.527345
85     5.654142
27     1.692251
39     9.822350
42     6.567602
33     1.928043
59     5.232185
63     8.653403
94     5.008815
56     6.620892
87    13.215581
96     4.479048
1     -0.020116
71     4.877364
82    -0.017297
dtype: float64

In [57]:
# Evaluate the formula-model predictions against the true test responses.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Apply each metric in turn (MSE, MAE, R^2).
metric_fns = (mean_squared_error, mean_absolute_error, r2_score)
mse, mae, r2 = (score(y_test, y_pred) for score in metric_fns)

mse, mae, r2

(0.14070664793426885, 0.2928034528632189, 0.989423116757431)