# 2장 통계모형
## 1절 선형회귀모형
### scipy 를 활용한 단순 선형 회귀 분석

In [1]:
import pandas as pd
from sklearn.datasets import load_diabetes

# Load the diabetes dataset object that ships with scikit-learn.
diabetes = load_diabetes()

# Wrap the feature matrix in a DataFrame whose columns carry the dataset's feature names.
data = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
data.info()  # sanity check: prints the column / non-null count / dtype summary

# Response values taken from the same dataset object.
target = diabetes.target

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [3]:
# Fit a simple linear regression model: regress target (y) on the single predictor bmi (x).
from scipy.stats import linregress
model=linregress(x=data['bmi'],y=target)
# Printing the result shows slope, intercept, rvalue, pvalue, stderr and intercept_stderr.
print(model)

LinregressResult(slope=949.4352603840384, intercept=152.13348416289617, rvalue=0.5864501344746884, pvalue=3.466006445167547e-42, stderr=62.515122002852664, intercept_stderr=2.973541118790735)


slope는 회귀계수(기울기), intercept는 상수항(절편), rvalue는 상관계수(제곱하면 결정계수, 즉 모형의 설명력), pvalue는 기울기에 대한 통계적 유의성, stderr와 intercept_stderr는 각각 기울기와 상수항 추정치의 표준오차이다.


In [4]:
# Estimated regression coefficient (slope) for the independent variable bmi.
print(model.slope)

949.4352603840384


In [5]:
# Estimated coefficient for the constant term (intercept).
print(model.intercept)

152.13348416289617


In [6]:
# Between target and bmi,
# the fitted regression equation is target = 152.13 + 949.44 * bmi.

# Statistical significance (p-value) of beta1, the slope coefficient.
print(model.pvalue)

3.466006445167547e-42


In [7]:
# Correlation coefficient r between bmi and target.
# NOTE: rvalue is r, not the coefficient of determination; the model's explanatory
# power is R^2 = rvalue ** 2 (about 0.344 for the value printed here).
print(model.rvalue)

0.5864501344746884


- pvalue가 매우 작으므로 회귀계수가 통계적으로 유의미하다고 판단할 수 있다.

### Statsmodels를 활용한 다중 선형 회귀 분석
#### 회귀 모형 적합

In [9]:
import pandas as pd

# Read the tips data from the examples folder and echo it for inspection.
tips = pd.read_csv('../예제/tips.csv')
print(tips)

import statsmodels.api as sm

# Independent variables (total_bill, size) and the dependent variable (tip).
X = tips[['total_bill', 'size']]
y = tips['tip']

# add_constant adds a `const` column so the model estimates a constant term.
X = sm.add_constant(X)

# Multiple linear regression: build the OLS model, fit it, and print the summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]
                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.468
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:   

#### 반응변수의 기댓값 추정

In [10]:
# Q. Estimate the expected tip for the 5th observation.
# First inspect that observation's regressor row (position 4, zero-based): const, total_bill, size.
X.iloc[4]

const          1.00
total_bill    24.59
size           4.00
Name: 4, dtype: float64

In [11]:
# Estimate the expected tip for the 5th observation by passing its regressor row to the fitted model.
model.predict(X.iloc[4])

None    3.719157
dtype: float64

#### 반응 변수의 기댓값 예측

In [19]:
# Q. Using the regression model fitted above, predict the expected tip when total_bill is 24.59 and size is 4.
# The columns (const, total_bill, size) match the columns of the design matrix X used to fit the model.
new_data=pd.DataFrame({'const':[1],'total_bill':[24.59],'size':[4]})

# Obtain the prediction result for the expected value.
result=model.get_prediction(new_data)

# NOTE(review): `result` is never displayed; this line re-renders the fitted model's summary instead.
# Something like result.summary_frame() was probably intended here — confirm.
model.summary()

0,1,2,3
Dep. Variable:,tip,R-squared:,0.468
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,105.9
Date:,"Mon, 25 Nov 2024",Prob (F-statistic):,9.67e-34
Time:,17:38:36,Log-Likelihood:,-347.99
No. Observations:,244,AIC:,702.0
Df Residuals:,241,BIC:,712.5
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6689,0.194,3.455,0.001,0.288,1.050
total_bill,0.0927,0.009,10.172,0.000,0.075,0.111
size,0.1926,0.085,2.258,0.025,0.025,0.361

0,1,2,3
Omnibus:,24.753,Durbin-Watson:,2.1
Prob(Omnibus):,0.0,Jarque-Bera (JB):,46.169
Skew:,0.545,Prob(JB):,9.43e-11
Kurtosis:,4.831,Cond. No.,67.6
