# [선형회귀분석 실습 4: 실전과제문항]

## 모듈 불러오기

In [None]:
# 데이터 전처리 패키지
import numpy as np
import pandas as pd

# 기계학습 모델 & 데이터 셋 & 성능 평가 패키지
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# 데이터 시각화 패키지
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# 경고 숨기기
import warnings
warnings.filterwarnings(action='ignore')

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both inputs are converted to NumPy arrays. The absolute error of every
    prediction is divided by the corresponding true value, the ratios are
    averaged, and the mean is scaled by 100.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

## 데이터 불러오기

### Toyota Corolla Data (Toyota 중고차 가격 예측)

#### 설명변수
- Id : Record_ID
- Model : Model Description
- Age_08_04 : Age in months as in August 2004
- Mfg_Month : Manufacturing month (1-12)
- Mfg_Year : Manufacturing Year
- KM : Accumulated Kilometers on odometer
- Fuel_Type : Fuel Type (Petrol, Diesel, CNG)
- HP : Horse Power
- Met_Color : Metallic Color (Yes=1, No=0)
- Color : Color (Blue, Red, Grey, Silver, Black, etc.)
- Automatic : Automatic (Yes=1, No=0)
- CC : Cylinder Volume in cubic centimeters
- Doors : Number of doors
- Cylinders : Number of cylinders
- Gears : Number of gear positions
- Quarterly_Tax : Quarterly road tax in EUROs
- Weight : Weight in Kilograms
- Mfr_Guarantee : Within Manufacturer's Guarantee period (Yes=1, No=0)
- BOVAG_Guarantee : BOVAG (Dutch dealer network) Guarantee (Yes=1, No=0)
- Guarantee_Period : Guarantee period in months
- ABS : Anti-Lock Brake System (Yes=1, No=0)
- Airbag_1 : Driver Airbag (Yes=1, No=0)
- Airbag_2 : Passenger Airbag (Yes=1, No=0)
- Airco : Airconditioning (Yes=1, No=0)
- Automatic_airco : Automatic Airconditioning (Yes=1, No=0)
- Boardcomputer : Boardcomputer (Yes=1, No=0)
- CD_Player : CD Player (Yes=1, No=0)
- Central_Lock : Central Lock (Yes=1, No=0)
- Powered_Windows : Powered Windows (Yes=1, No=0)
- Power_Steering : Power Steering (Yes=1, No=0)
- Radio : Radio (Yes=1, No=0)
- Mistlamps : Mistlamps (Yes=1, No=0)
- Sport_Model : Sport Model (Yes=1, No=0)
- Backseat_Divider : Backseat Divider (Yes=1, No=0)
- Metallic_Rim : Metallic Rim (Yes=1, No=0)
- Radio_cassette : Radio Cassette (Yes=1, No=0)
- Parking_Assistant : Parking assistance system (Yes=1, No=0)
- Tow_Bar : Tow Bar (Yes=1, No=0)

#### 반응변수
- Price : Offer Price in EUROs

In [None]:
# Read the Toyota Corolla CSV; the path is relative to the working directory.
data = pd.read_csv('data/ToyotaCorolla.csv')
# Show the first rows to check that the file loaded as expected.
data.head()

## 데이터 전처리하기

### 데이터 정보 확인

In [None]:
# Exercise blank: replace '''Answer''' with the name of the DataFrame method
# that reports the dataset information this section asks for.
data.'''Answer'''()

### Column별 기초 통계량 확인

In [None]:
# Exercise blank: replace '''Answer''' with the name of the DataFrame method
# that returns basic statistics for each column.
data.'''Answer'''()

### 예측에 필요하지 않은 변수 제거
예측에 필요하지 않은 변수명: Id, Model, Fuel_Type

In [None]:
# Exercise blank: replace '''Answer''' with the columns that are not needed for
# prediction (Id, Model, Fuel_Type) and the axis to drop them along.
data = data.drop('''Answer''')
# Preview the remaining columns.
data.head()

### Training 데이터 70% / Testing 데이터 30% 로 나누기

In [None]:
# Exercise blank: replace '''Answer''' with the dataset and the split size that
# give 70% training / 30% testing data.
# random_state is fixed so the split is reproducible between runs.
train_data, test_data = train_test_split('''Answer''', random_state=55)

## 모델링

### 학습 데이터를 이용하여 선형회귀모델 학습 (상수항 미포함)

In [None]:
# Exercise blank: replace '''Answer''' with the OLS arguments built from the
# training data. This section asks for a model without a constant term.
lm = sm.OLS('''Answer''')
# Exercise blank: replace '''Answer''' with the expression that fits `lm` and
# returns the fitted result.
lm_trained = '''Answer'''

### 모델 학습 결과 (Summary) 확인

In [None]:
# Exercise blank: replace '''Answer''' with the name of the method that shows
# the fitted model's result summary.
lm_trained.'''Answer'''()

## 예측성능 평가

### 학습 데이터셋에 대한 예측성능 평가
MSE, RMSE, MAE, MAPE, R2 계산하여 성능 확인

In [None]:
train_pred = lm_trained.'''Answer'''

print('Training MSE: {:.3f}'.format('''Answer''')
print('Training RMSE: {:.3f}'.format('''Answer''')
print('Training MAE: {:.3f}'.format('''Answer''')
print('Training MAPE: {:.3f}'.format('''Answer''')
print('Training R2: {:.3f}'.format('''Answer''')

### 테스트 데이터셋에 대한 예측성능 평가
MSE, RMSE, MAE, MAPE, R2 계산하여 성능 확인

In [None]:
test_pred = lm_trained.'''Answer'''

print('Testing MSE: {:.3f}'.format('''Answer''')
print('Testing RMSE: {:.3f}'.format('''Answer''')
print('Testing MAE: {:.3f}'.format('''Answer''')
print('Testing MAPE: {:.3f}'.format('''Answer''')
print('Testing R2: {:.3f}'.format('''Answer'''?)

## 변수 선택

### 선형회귀모델에서 유의하지 않은 변수가 있다면 제거

In [None]:
# Exercise blank: replace '''Answer''' with the p-value cutoff. Coefficients
# whose p-value exceeds it are collected as not significant.
not_significants = lm_trained.pvalues.index[lm_trained.pvalues > '''Answer''']
print(not_significants)

# Drop the same columns from both splits so train and test keep identical features.
train_data_new = train_data.drop(not_significants, axis=1)
test_data_new = test_data.drop(not_significants, axis=1)

## 직접 p-value를 보고 drop 함수를 통해 제거하기도 가능

### 변수 제거 후 모델 학습 결과 확인

In [None]:
# Exercise blank: replace '''Answer''' with the OLS arguments for the model
# after variable removal (presumably built from train_data_new -- confirm).
lm_new = sm.OLS('''Answer''')
# Exercise blank: replace '''Answer''' with the expression that fits `lm_new`.
lm_new_trained = '''Answer'''
# Display the refitted model's summary.
lm_new_trained.summary()

### 변수 제거 후 모델 예측성능 (학습/테스트) 확인

In [None]:
train_pred_new = lm_new_trained.'''Answer'''

print('Training MSE: {:.3f}'.format('''Answer''')
print('Training RMSE: {:.3f}'.format('''Answer''')
print('Training MAE: {:.3f}'.format('''Answer''')
print('Training MAPE: {:.3f}'.format('''Answer''')
print('Training R2: {:.3f}'.format('''Answer''')

In [None]:
test_pred_new = lm_new_trained.'''Answer'''

print('Training MSE: {:.3f}'.format('''Answer''')
print('Training RMSE: {:.3f}'.format('''Answer''')
print('Training MAE: {:.3f}'.format('''Answer''')
print('Training MAPE: {:.3f}'.format('''Answer''')
print('Training R2: {:.3f}'.format('''Answer''')

### 상관계수가 높은 설명변수들이 있다면 하나의 변수만 선택
변수별 상관계수 히트맵 출력

In [None]:
# Compute the correlation matrix once and reuse it for both the cell colours
# and the annotations instead of calling data.corr() twice.
corr_matrix = data.corr()

plt.figure(figsize=(20, 18))
# Exercise blank: replace '''Answer''' with the plotting function that draws
# the correlation heatmap.
'''Answer'''(corr_matrix, cmap=sns.color_palette("coolwarm", 10), annot=corr_matrix)

상관계수가 높은 두 변수 중 하나의 변수만 선택하여 설명변수에서 제거

In [None]:
# Exercise blank: replace '''Answer''' with the variable chosen for removal
# from the highly correlated pair and the axis to drop it along.
# Use the same arguments for both splits so they keep identical columns.
train_data_new2 = train_data.drop('''Answer''')
test_data_new2 = test_data.drop('''Answer''')

### 변수 제거 후 모델 학습 결과 확인

In [None]:
# Exercise blank: replace '''Answer''' with the model constructor. It receives
# the response (Price) first, then the remaining columns as explanatory variables.
lm_new2 = '''Answer'''(train_data_new2['Price'], train_data_new2.drop('Price', axis=1))
# Exercise blank: replace '''Answer''' with the expression that fits `lm_new2`.
lm_new2_trained = '''Answer'''
# Display the refitted model's summary.
lm_new2_trained.summary()

### 변수 제거 후 모델 예측성능 (학습/테스트) 확인

In [None]:
train_pred_new2 = lm_new2_trained.'''Answer'''

print('Training MSE: {:.3f}'.format(mean_squared_error(train_data_new2['Price'], train_pred_new2)))
print('Training RMSE: {:.3f}'.format(np.sqrt(mean_squared_error(train_data_new2['Price'], train_pred_new2))))
print('Training MAE: {:.3f}'.format(mean_absolute_error(train_data_new2['Price'], train_pred_new2)))
print('Training MAPE: {:.3f}'.format(mean_absolute_percentage_error(train_data_new2['Price'], train_pred_new2)))
print('Training R2: {:.3f}'.format(r2_score(train_data_new2['Price'], train_pred_new2)))

In [None]:
test_pred_new2 = lm_new2_trained.'''Answer'''

print('Testing MSE: {:.3f}'.format(mean_squared_error(test_data_new2['Price'], test_pred_new2)))
print('Testing RMSE: {:.3f}'.format(np.sqrt(mean_squared_error(test_data_new2['Price'], test_pred_new2))))
print('Testing MAE: {:.3f}'.format(mean_absolute_error(test_data_new2['Price'], test_pred_new2)))
print('Testing MAPE: {:.3f}'.format(mean_absolute_percentage_error(test_data_new2['Price'], test_pred_new2)))
print('Testing R2: {:.3f}'.format(r2_score(test_data_new2['Price'], test_pred_new2)))