# 단순 회귀분석

## 단순 회귀분석의 특징
    연속형 종속변수와 독립변수 간 선형관계 및 설명력을 확인하는 방법
    종속변수와 독립변수가 각각 하나인 경우의 단순 선형 회귀 모형
    설명력과 더불어 오차 평가 지표로 모델의 성능을 평가
    
## statsmodels - ols()
    선형회귀 분석을 위한 statsmodels의 함수
    ols() 함수 내에 종속변수와 독립변수를 선언
    ols() 함수의 fit() 메서드로 모델 적합
    변수명에 온점 등 특정 특수문자가 있는 경우 오류 발생
    모델 객체의 predict() 메서드로 예측

In [1]:
import pandas as pd
from statsmodels.formula.api import ols

In [2]:
df = pd.read_csv("iris.csv")
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [3]:
#model = ols(formular="Sepal.Length ~ Sepal.Width", data=df).fit()

df.columns = ["SL", "SW", "PL", "PW", "species"]
df.head(2)

Unnamed: 0,SL,SW,PL,PW,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [5]:
model = ols(formula="SL ~ SW", data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,SL,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,2.074
Date:,"Tue, 07 Feb 2023",Prob (F-statistic):,0.152
Time:,13:42:58,Log-Likelihood:,-183.0
No. Observations:,150,AIC:,370.0
Df Residuals:,148,BIC:,376.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5262,0.479,13.628,0.000,5.580,7.473
SW,-0.2234,0.155,-1.440,0.152,-0.530,0.083

0,1,2,3
Omnibus:,4.389,Durbin-Watson:,0.952
Prob(Omnibus):,0.111,Jarque-Bera (JB):,4.237
Skew:,0.36,Prob(JB):,0.12
Kurtosis:,2.6,Cond. No.,24.2


In [6]:
model = ols(formula="PL ~ PW", data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,PL,R-squared:,0.927
Model:,OLS,Adj. R-squared:,0.927
Method:,Least Squares,F-statistic:,1882.0
Date:,"Tue, 07 Feb 2023",Prob (F-statistic):,4.6800000000000005e-86
Time:,13:45:14,Log-Likelihood:,-101.18
No. Observations:,150,AIC:,206.4
Df Residuals:,148,BIC:,212.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0836,0.073,14.850,0.000,0.939,1.228
PW,2.2299,0.051,43.387,0.000,2.128,2.332

0,1,2,3
Omnibus:,2.438,Durbin-Watson:,1.43
Prob(Omnibus):,0.295,Jarque-Bera (JB):,1.966
Skew:,0.211,Prob(JB):,0.374
Kurtosis:,3.369,Cond. No.,3.7


y = 2.2299x + 1.0836

In [8]:
model.predict(df.iloc[:6,])

0    1.529546
1    1.529546
2    1.529546
3    1.529546
4    1.529546
5    1.975534
dtype: float64

In [11]:
df["pred"] = model.predict(df) #알아서 학습한 변수로 넣어줌
df.head()

Unnamed: 0,SL,SW,PL,PW,species,pred
0,5.1,3.5,1.4,0.2,setosa,1.529546
1,4.9,3.0,1.4,0.2,setosa,1.529546
2,4.7,3.2,1.3,0.2,setosa,1.529546
3,4.6,3.1,1.5,0.2,setosa,1.529546
4,5.0,3.6,1.4,0.2,setosa,1.529546


## sklearn - LinearRegression()
    선형회귀 분석을 위한 sklearn의 함수
    LinearRegression() 함수 내 fit_intercept로 절편 적합 여부 설정 가능
    LinearRegression() 함수의 fit() 메서드에 학습 데이터 할당 가능
    모델 객체의 coef_와 intercept_ 어트리뷰트로 각가 계수와 절편 확인 가능
    모델 객체의 predict() 메서드로 예측

In [12]:
from sklearn.linear_model import LinearRegression

In [14]:
#model = LinearRegression.fit(X=df["PL"], y=df["PW"])
df["PL"]

0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ... 
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: PL, Length: 150, dtype: float64

In [16]:
df[["PL"]]

Unnamed: 0,PL
0,1.4
1,1.4
2,1.3
3,1.5
4,1.4
...,...
145,5.2
146,5.0
147,5.2
148,5.4


In [17]:
df.iloc[0,]

SL              5.1
SW              3.5
PL              1.4
PW              0.2
species      setosa
pred       1.529546
Name: 0, dtype: object

In [20]:
df.iloc[[0],]

Unnamed: 0,SL,SW,PL,PW,species,pred
0,5.1,3.5,1.4,0.2,setosa,1.529546


In [26]:
model = LinearRegression().fit(X=df[["PW"]], y=df[["PL"]])
model

LinearRegression()

In [27]:
model.coef_

array([[2.2299405]])

In [28]:
model.intercept_

array([1.08355803])

In [29]:
#model.predict(df) #error
model.predict(df[["PL"]])

Feature names unseen at fit time:
- PL
Feature names seen at fit time, yet now missing:
- PW



array([[ 4.20547473],
       [ 4.20547473],
       [ 3.98248068],
       [ 4.42846878],
       [ 4.20547473],
       [ 4.87445687],
       [ 4.20547473],
       [ 4.42846878],
       [ 4.20547473],
       [ 4.42846878],
       [ 4.42846878],
       [ 4.65146283],
       [ 4.20547473],
       [ 3.53649258],
       [ 3.75948663],
       [ 4.42846878],
       [ 3.98248068],
       [ 4.20547473],
       [ 4.87445687],
       [ 4.42846878],
       [ 4.87445687],
       [ 4.42846878],
       [ 3.31349853],
       [ 4.87445687],
       [ 5.32044497],
       [ 4.65146283],
       [ 4.65146283],
       [ 4.42846878],
       [ 4.20547473],
       [ 4.65146283],
       [ 4.65146283],
       [ 4.42846878],
       [ 4.42846878],
       [ 4.20547473],
       [ 4.42846878],
       [ 3.75948663],
       [ 3.98248068],
       [ 4.20547473],
       [ 3.98248068],
       [ 4.42846878],
       [ 3.98248068],
       [ 3.98248068],
       [ 3.98248068],
       [ 4.65146283],
       [ 5.32044497],
       [ 4

In [36]:
model2 = LinearRegression().fit(X=df[["SW"]], y=df[["SL"]])
model2

LinearRegression()

In [37]:
model2.coef_

array([[-0.22336106]])

In [38]:
model2.intercept_

array([6.52622255])

## sklearn - mean_absolute_error()
    MAE(Mean Absolute Error) 연산을 위한 sklearn의 함수
    
## sklearn - mean_squared_error()
    MSE(Mean Squared Error) 연산을 위한 sklearn 함수
    해당 결과에 제곱근 연산을 하면 RMSE(Root Mean Squared Error) 계산 가능

In [39]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [40]:
mean_absolute_error(y_true=df["PL"], y_pred=df["PW"])

2.558666666666667

In [41]:
mean_absolute_error(y_true=df["PL"], y_pred=df["PW"])

7.645466666666667

In [42]:
mean_squared_error(y_true=df["PL"], y_pred=df["PW"]) ** 0.5

2.76504370067937

문제 01. 종속변수를 registered, 독립변수를 temp로 했을 때 결정계수는?
statsmodels 함수 활용, 학습 데이터 비율을 70%, seed를 123으로 설정

In [43]:
df = pd.read_csv("bike.csv")
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [44]:
from sklearn.model_selection import train_test_split

In [46]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)
df_train.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4046,2011-09-19 15:00:00,3,0,1,2,24.6,30.305,60,15.0013,44,143,187
9262,2012-09-09 07:00:00,3,0,0,1,22.14,25.76,73,11.0014,20,50,70


In [47]:
model = ols(formula = "registered ~ temp", data=df_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Wed, 08 Feb 2023",Prob (F-statistic):,1.92e-187
Time:,01:41:51,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


문제 02. 종속변수를 casual, 독립변수를 atemp로 했을 때 RMSE는?
statsmodels 함수 활용

In [48]:
df = pd.read_csv("bike.csv")
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)
model = ols(formula = "casual ~ atemp", data=df_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,casual,R-squared:,0.219
Model:,OLS,Adj. R-squared:,0.219
Method:,Least Squares,F-statistic:,2138.0
Date:,"Wed, 08 Feb 2023",Prob (F-statistic):,0.0
Time:,01:44:44,Log-Likelihood:,-39689.0
No. Observations:,7620,AIC:,79380.0
Df Residuals:,7618,BIC:,79400.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-29.2974,1.498,-19.554,0.000,-32.234,-26.360
atemp,2.7672,0.060,46.243,0.000,2.650,2.885

0,1,2,3
Omnibus:,4125.373,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34148.771
Skew:,2.494,Prob(JB):,0.0
Kurtosis:,12.092,Cond. No.,74.1


In [49]:
pred = model.predict(df_test)
pred[:4]

6495    31.499001
7050    12.626390
558     10.537120
5085    33.588271
dtype: float64

In [50]:
mean_squared_error(y_pred = pred, y_true=df_test["casual"]) ** 0.5

44.46237010271433

문제 03. 종속변수를 casual, 독립변수를 atemp로 했을 때 여름과 겨울의 RMSE 차이는?

In [52]:
df = pd.read_csv("bike.csv")
df_s2 = df.loc[df["season"]==2,]
df_s4 = df.loc[df["season"]==4,]

In [53]:
df_s2

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
1323,2011-04-01 00:00:00,2,0,1,3,10.66,12.880,100,11.0014,0,6,6
1324,2011-04-01 01:00:00,2,0,1,3,10.66,12.880,100,11.0014,0,4,4
1325,2011-04-01 02:00:00,2,0,1,3,10.66,12.880,93,12.9980,0,7,7
1326,2011-04-01 03:00:00,2,0,1,2,9.84,11.365,93,16.9979,0,4,4
1327,2011-04-01 04:00:00,2,0,1,2,9.84,11.365,93,16.9979,0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...
8146,2012-06-19 19:00:00,2,0,1,1,32.80,38.635,59,15.0013,82,432,514
8147,2012-06-19 20:00:00,2,0,1,1,32.80,37.880,55,16.9979,59,399,458
8148,2012-06-19 21:00:00,2,0,1,1,31.16,35.605,62,11.0014,37,239,276
8149,2012-06-19 22:00:00,2,0,1,1,29.52,34.850,79,6.0032,51,240,291


In [56]:
df_s2_train, df_s2_test = train_test_split(df_s2, train_size=0.7, random_state=123)
df_s4_train, df_s4_test = train_test_split(df_s4, train_size=0.7, random_state=123)

In [57]:
model_s2 = ols(formula = "casual ~ atemp", data=df_s2_train).fit()
model_s4 = ols(formula = "casual ~ atemp", data=df_s4_train).fit()
pred_s2 = model_s2.predict(df_s2_test)
pred_s4 = model_s4.predict(df_s4_test)

In [None]:
RMSE_s2 = mean_squared_error(y_pred=pred_s2, y_true=df_s2_test[])