# 다항식 회귀분석(Polynomial Regression)

### FB(x), TV(y), Newspaper(z)라 할때 다항식 xy를 삽입하여 xy, y, z 로 다중선형회귀분석

In [6]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 데이터 로드
data = np.loadtxt("/content/FB_TV_News_sales.csv", delimiter=',', skiprows=1)
X = data[:, 1:-1]  # X는 페이스북(FB), TV, 신문(Newspaper) 광고비
Y = data[:, -1]  # Y는 판매량

# 다항식 특성 구성: FB와 TV의 곱 (xy)
FB = X[:, 0]  # FB 광고비
TV = X[:, 1]  # TV 광고비
poly = FB * TV  # FB와 TV의 곱

# 원래 특성 배열에 다항 특성 추가
xpoly = np.column_stack((poly, X))

# 데이터 분할: 훈련 데이터와 테스트 데이터
x_train, x_test, y_train, y_test = train_test_split(xpoly, Y, test_size=0.2, random_state=5)

# 선형 회귀 모델 생성 및 훈련
lrmodel = LinearRegression()
lrmodel.fit(x_train, y_train)

# 계수 및 절편 추출
beta_0 = lrmodel.coef_[0]  # FB*TV의 계수
beta_1 = lrmodel.coef_[1]  # TV의 계수
beta_2 = lrmodel.coef_[2]  # 신문의 계수
beta_3 = lrmodel.intercept_  # 절편

print('beta_0: %f' % beta_0)
print('beta_1: %f' % beta_1)
print('beta_2: %f' % beta_2)
print('beta_3: %f' % beta_3)

# 훈련 데이터와 테스트 데이터에 대한 예측
predicted_y_train = lrmodel.predict(x_train)
predicted_y_test = lrmodel.predict(x_test)

# MSE 계산
mse_train = mean_squared_error(y_train, predicted_y_train)
mse_test = mean_squared_error(y_test, predicted_y_test)
print("MSE on train data: %f" % mse_train)
print("MSE on test data: %f" % mse_test)

# 결정계수 R^2 계산
r2_train = r2_score(y_train, predicted_y_train)
r2_test = r2_score(y_test, predicted_y_test)
print("train의 결정계수: %f" % r2_train)
print("test의 결정계수: %f" % r2_test)


beta_0: 0.001074
beta_1: 0.020130
beta_2: 0.029456
beta_3: 6.596782
MSE on train data: 0.924063
MSE on test data: 0.685722
train의 결정계수: 0.966346
test의 결정계수: 0.972695


### 다항식 x^3,x^2를 삽입하여 x^3,x^2, xy, y, z 로 다중선형회귀분석

In [9]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 데이터 로드
data = np.loadtxt("FB_TV_News_sales.csv", delimiter=',', skiprows=1)
X = data[:, 1:-1]  # FB, TV, 신문 광고비 순으로 가정
Y = data[:, -1]    # 판매량, 종속 변수

# 다항식 특성 계산
FB = X[:, 0]  # FB 광고비
TV = X[:, 1]  # TV 광고비
Newspaper = X[:, 2]  # 신문 광고비

# FB와 TV의 교호작용 항
xy = FB * TV

# FB의 제곱 및 세제곱
p1 = FB**3  # FB 세제곱
p2 = FB**2  # FB 제곱

# 새로운 특성 행렬 구성: x^3, x^2, xy, y, z
xpoly = np.column_stack((p1, p2, xy, TV, Newspaper))

# 데이터 분할: 훈련 세트와 테스트 세트
x_train, x_test, y_train, y_test = train_test_split(xpoly, Y, test_size=0.2, random_state=5)

# 선형 회귀 모델 생성 및 학습
lrmodel = LinearRegression()
lrmodel.fit(x_train, y_train)

# 회귀 계수 추출
beta_0 = lrmodel.coef_[0]  # x^3 계수
beta_1 = lrmodel.coef_[1]  # x^2 계수
beta_2 = lrmodel.coef_[2]  # xy 계수
beta_3 = lrmodel.coef_[3]  # TV 계수
beta_4 = lrmodel.coef_[4]  # 신문 계수
beta_5 = lrmodel.intercept_  # 절편

# 계수 출력
print('beta_0: %f' % beta_0)
print('beta_1: %f' % beta_1)
print('beta_2: %f' % beta_2)
print('beta_3: %f' % beta_3)
print('beta_4: %f' % beta_4)
print('beta_5: %f' % beta_5)

# 예측 및 성능 평가
predicted_y_train = lrmodel.predict(x_train)
mse_train = mean_squared_error(y_train, predicted_y_train)
print("MSE on train data: %f" % mse_train)

predicted_y_test = lrmodel.predict(x_test)
mse_test = mean_squared_error(y_test, predicted_y_test)
print("MSE on test data: %f" % mse_test)

r2_train = r2_score(y_train, predicted_y_train)
r2_test = r2_score(y_test, predicted_y_test)
print("Train 결정계수: %f" % r2_train)
print("Test 결정계수: %f" % r2_test)


beta_0: -0.000001
beta_1: 0.000231
beta_2: 0.001206
beta_3: 0.010364
beta_4: 0.005343
beta_5: 6.999073
MSE on train data: 0.881124
MSE on test data: 0.602364
Train 결정계수: 0.967910
Test 결정계수: 0.976014


### 다항식 x^5,x^4를 삽입하여 x^5,x^4, x^3,x^2, xy, y, z 로 다중선형회귀분석

In [8]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 데이터 로드
data = np.loadtxt("FB_TV_News_sales.csv", delimiter=',', skiprows=1)
X = data[:, 1:-1]  # FB, TV, Newspaper
Y = data[:, -1]    # Sales

# FB 광고비 (x)
FB = X[:, 0]
# 다항식 특성 계산
p1 = FB**5  # FB^5
p2 = FB**4  # FB^4
p3 = FB**3  # FB^3
p4 = FB**2  # FB^2
xy = FB * X[:, 1]  # FB * TV

# 기존 y, z 특성
Y_TV_Newspaper = X[:, 1:3]

# 새로운 특성 배열 구성
xpoly = np.column_stack((p1, p2, p3, p4, xy, Y_TV_Newspaper))

# 데이터 분할: 훈련 데이터와 테스트 데이터
x_train, x_test, y_train, y_test = train_test_split(xpoly, Y, test_size=0.2, random_state=5)

# 선형 회귀 모델 생성 및 학습
lrmodel = LinearRegression()
lrmodel.fit(x_train, y_train)

# 회귀 계수 추출
coefficients = lrmodel.coef_
beta_0, beta_1, beta_2, beta_3, beta_4, beta_5, beta_6 = coefficients
beta_7 = lrmodel.intercept_

# 결과 출력
print('beta_0: %f' % beta_0)
print('beta_1: %f' % beta_1)
print('beta_2: %f' % beta_2)
print('beta_3: %f' % beta_3)
print('beta_4: %f' % beta_4)
print('beta_5: %f' % beta_5)
print('beta_6: %f' % beta_6)
print('beta_7: %f' % beta_7)

# 예측 및 성능 평가
predicted_y_train = lrmodel.predict(x_train)
mse_train = mean_squared_error(y_train, predicted_y_train)
print("MSE on train data: %f" % mse_train)

predicted_y_test = lrmodel.predict(x_test)
mse_test = mean_squared_error(y_test, predicted_y_test)
print("MSE on test data: %f" % mse_test)

r2_train = r2_score(y_train, predicted_y_train)
r2_test = r2_score(y_test, predicted_y_test)
print("train의 결정계수: %f" % r2_train)
print("test의 결정계수: %f" % r2_test)


beta_0: -0.000000
beta_1: 0.000000
beta_2: -0.000016
beta_3: 0.001476
beta_4: 0.001078
beta_5: 0.034631
beta_6: 0.003401
beta_7: 5.199690
MSE on train data: 0.393816
MSE on test data: 0.262254
train의 결정계수: 0.985657
test의 결정계수: 0.989557


# 참고

In [10]:
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
print(X)


poly = PolynomialFeatures(2) #degree=2 , 2차식 (1,x1,x2,x1^2,x1*x2,x2^2)
poly=poly.fit_transform(X)
print(poly)


poly = PolynomialFeatures(degree=2,interaction_only=True)
#interaction_only: True면 2차항에서 상호작용항만 출력
poly=poly.fit_transform(X)
print(poly)

print(X) #X는 그대로

[[0 1]
 [2 3]
 [4 5]]
[[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]
[[ 1.  0.  1.  0.]
 [ 1.  2.  3.  6.]
 [ 1.  4.  5. 20.]]
[[0 1]
 [2 3]
 [4 5]]


In [11]:
poly=PolynomialFeatures(3) #1, x1, x2, x1^2, x1*x2, x2^2, x1^3, x1^2 * x2, x1 * x2^2, x2^3
poly=poly.fit_transform(X)
print(poly)

poly = PolynomialFeatures(degree=3,interaction_only=True)
#interaction_only: True면 2차항에서 상호작용항만 출력
poly=poly.fit_transform(X)
print(poly)

[[  1.   0.   1.   0.   0.   1.   0.   0.   0.   1.]
 [  1.   2.   3.   4.   6.   9.   8.  12.  18.  27.]
 [  1.   4.   5.  16.  20.  25.  64.  80. 100. 125.]]
[[ 1.  0.  1.  0.]
 [ 1.  2.  3.  6.]
 [ 1.  4.  5. 20.]]


## PolynomialFeatures활용하여 비교

In [12]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

data=np.loadtxt("FB_TV_News_sales.csv",delimiter=',',skiprows=1)
X=data[:,1:-1]
Y=data[:,-1]

#
poly = PolynomialFeatures(2) #poly = PolynomialFeatures(degree=2,interaction_only=True)와 비교
poly = poly.fit_transform(X)
x_train, x_test, y_train, y_test=train_test_split(poly,Y,test_size=0.2,random_state=5)
print('변인 개수:',len(x_train[0])) # 다항식 몇개인지 알아봄 10개

lrmodel=LinearRegression()
lrmodel.fit(x_train,y_train)

predicted_y_train=lrmodel.predict(x_train)
mse_train=mean_squared_error(y_train,predicted_y_train)
print("MSE on train data: %f"%mse_train)

predicted_y_test=lrmodel.predict(x_test)
mse_test=mean_squared_error(y_test,predicted_y_test)
print("MSE on train data: %f"%mse_test)


r2_train = r2_score(y_train, predicted_y_train)
r2_test = r2_score(y_test, predicted_y_test)
print("train의 결정계수:%f"%r2_train)
print("test의 결정계수:%f"%r2_test)

변인 개수: 10
MSE on train data: 0.388436
MSE on train data: 0.283414
train의 결정계수:0.985853
test의 결정계수:0.988714


In [13]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures  #활용

data=np.loadtxt("FB_TV_News_sales.csv",delimiter=',',skiprows=1)
X=data[:,1:-1]
Y=data[:,-1]

#다항식 구성
poly = PolynomialFeatures(degree=3)
poly = poly.fit_transform(X)

x_train, x_test, y_train, y_test=train_test_split(poly,Y,test_size=0.2,random_state=5)
print('변인 개수:',len(x_train[0])) # 다항식 몇개인지 알아봄 20개

lrmodel=LinearRegression()
lrmodel.fit(x_train,y_train)

#for i in range(20):
#    print(lrmodel.coef_[i])

#print('절편:',lrmodel.intercept_)


predicted_y_train=lrmodel.predict(x_train)
mse_train=mean_squared_error(y_train,predicted_y_train)
print("MSE on train data: %f"%mse_train)

predicted_y_test=lrmodel.predict(x_test)
mse_test=mean_squared_error(y_test,predicted_y_test)
print("MSE on train data: %f"%mse_test)


r2_train = r2_score(y_train, predicted_y_train)
r2_test = r2_score(y_test, predicted_y_test)
print("train의 결정계수:%f"%r2_train)
print("test의 결정계수:%f"%r2_test)

변인 개수: 20
MSE on train data: 0.220547
MSE on train data: 0.220674
train의 결정계수:0.991968
test의 결정계수:0.991213
