### ML 예측모델 만들기 - MPG

In [1]:
import pandas as pd
import numpy as np
import pickle
from seaborn import load_dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

#### 1. MPG 데이터를 가져와서 전처리한다:

In [2]:
# Load the "mpg" example dataset through seaborn's dataset loader.
df = load_dataset(name="mpg")

In [3]:
# The `name` column is not needed as a feature, so remove it in place.
df.drop(columns=["name"], inplace=True)
# Preview the first rows of the remaining columns.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa


In [4]:
# Count the missing values in each column.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [5]:
# Drop, in place, every row that contains at least one missing value.
df.dropna(axis="index", inplace=True)

In [6]:
# Encode the nominal `origin` column as dummy (indicator) variables and
# replace the original column with them.
# drop_first=True omits the first category, which then acts as the baseline.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return bool columns, leaving non-numeric columns among the numeric features.
df = df.join(pd.get_dummies(df["origin"], drop_first=True, dtype=int)).drop(columns=["origin"])
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,japan,usa
0,18.0,8,307.0,130.0,3504,12.0,70,0,1
1,15.0,8,350.0,165.0,3693,11.5,70,0,1
2,18.0,8,318.0,150.0,3436,11.0,70,0,1
3,16.0,8,304.0,150.0,3433,12.0,70,0,1
4,17.0,8,302.0,140.0,3449,10.5,70,0,1


In [7]:
# Explanatory variables (X): every column except the target `mpg`.
df_X = df.drop(columns=["mpg"])
# Keep the feature names so the scaled array can be labelled again later.
column_names = df_X.columns

In [8]:
# Dependent (target) variable: the `mpg` column.
Y = df.loc[:, "mpg"]

In [9]:
# Min-max scale the explanatory variables.
# The scaler is fitted on the raw NumPy values of df_X and applied to them
# in the same call; the fitted scaler object is kept in `my_scaler`.
my_scaler = MinMaxScaler()
X_scaled = my_scaler.fit_transform(df_X.to_numpy())

In [10]:
# Wrap the scaled feature array in a DataFrame with the original column names.
# Reusing df_X's index keeps the rows label-aligned with Y: rows were removed
# by dropna earlier, so a fresh 0..n-1 index would not match Y's index labels.
df_X_scaled = pd.DataFrame(data=X_scaled, columns=column_names, index=df_X.index)
df_X_scaled.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,japan,usa
0,1.0,0.617571,0.456522,0.53615,0.238095,0.0,0.0,1.0
1,1.0,0.728682,0.646739,0.589736,0.208333,0.0,0.0,1.0
2,1.0,0.645995,0.565217,0.51687,0.178571,0.0,0.0,1.0
3,1.0,0.609819,0.565217,0.516019,0.238095,0.0,0.0,1.0
4,1.0,0.604651,0.51087,0.520556,0.14881,0.0,0.0,1.0


#### 2. ML(회귀) 모델 생성과 저장:

In [11]:
# Train a random forest regressor on the scaled features.
# A fixed random_state makes training reproducible, so re-running the notebook
# yields the same fitted model (and the same pickled artifact and score).
my_regressor = RandomForestRegressor(random_state=42)
my_regressor.fit(df_X_scaled, Y)

RandomForestRegressor()

In [12]:
# Coefficient of determination (R^2) of the model, evaluated on
# df_X_scaled and Y (in-sample), then reported with three decimals.
rsq = my_regressor.score(X=df_X_scaled, y=Y)
print(f"In-sample 결정계수는 {rsq:0.3f} 입니다.")

In-sample 결정계수는 0.983 입니다.


In [13]:
# Persist the my_regressor object to "my_regressor.pkl" with pickle.
with open("my_regressor.pkl", mode="wb") as model_file:
    pickle.dump(my_regressor, model_file)

In [14]:
# Persist the my_scaler object to "my_scaler.pkl" with pickle.
with open("my_scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(my_scaler, scaler_file)