In [1]:
# 주피터 노트북 환경설정
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

from IPython.core.display import display, HTML
# display(HTML("<style>.container { font-weight: bold !important; font-family:'Malgun Gothic' !important;}</style>"))
# display(HTML("<style>.container { font-weight: bold !important;}</style>"))
# display(HTML("<style>.container { width: 98% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")
%matplotlib inline

# 관련 라이브러리 임포트 
import matplotlib.font_manager as fm

#  한글글꼴로 변경
# plt.rcParams['font.family'] = '한글글꼴명'
plt.rcParams['font.size'] = 11.0
# plt.rcParams['font.family'] = 'batang'
plt.rcParams['font.family'] = 'Malgun Gothic'

# 그래프에서 마이너스 폰트 깨지는 문제에 대한 대처
matplotlib.rcParams['axes.unicode_minus'] = False

# 그래프 기본 크기 설정 
plt.rcParams['figure.figsize'] = [10, 6]

# Pipeline 

- https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
- 데이터 사전 처리 및 분류의 모든 단계를 포함하는 단일 객체를 만들때 사용한다. 
- train과 test 데이터 손실을 피할 수 있다.
- 교차 검증 및 기타 모델 선택 유형을 쉽게 만든다.
- from sklearn.pipeline import Pipeline

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.pipeline import Pipeline

In [4]:
print(dir(Pipeline))

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_fit_params', '_check_n_features', '_estimator_type', '_final_estimator', '_fit', '_get_param_names', '_get_params', '_get_tags', '_inverse_transform', '_iter', '_log_message', '_more_tags', '_pairwise', '_replace_estimator', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_params', '_sk_visual_block_', '_transform', '_validate_data', '_validate_names', '_validate_steps', 'classes_', 'decision_function', 'fit', 'fit_predict', 'fit_transform', 'get_params', 'inverse_transform', 'n_features_in_'

### 스케일 + 모델화 

In [5]:
#데이터셋 준비
from sklearn.datasets import load_iris

iris_data = load_iris()
print(iris_data.keys())

X = iris_data.data
y = iris_data.target
feature_names = iris_data.feature_names
target_names = iris_data.target_names

print(feature_names)
print(target_names)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [7]:
#파이프라인 없이 스케일링 -> 모델 만들어서 학습 -> 성능평가
#1) 스케일링
scale = StandardScaler()
scale.fit(X)
X_scale = scale.transform(X)
print(X_scale[:3])

#2) 학습데이터, 테스트데이터 분리, 모델 생성 및 학습
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, random_state=111, stratify=y)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

#3) 평가 및 예측
print(model.score(X_train, y_train) , model.score(X_test, y_test))
print()
print(classification_report(y_test, model.predict(X_test)))

[[-0.90068117  1.01900435 -1.34022653 -1.3154443 ]
 [-1.14301691 -0.13197948 -1.34022653 -1.3154443 ]
 [-1.38535265  0.32841405 -1.39706395 -1.3154443 ]]
1.0 0.9473684210526315

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.92      0.92      0.92        12
           2       0.92      0.92      0.92        13

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38



In [None]:
X[:3]

In [9]:
#파이프라인 이용

#학습데이터와 테스트데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111, stratify=y)

#파이프라인 구축(스케일링 -> 모델 생성)
#파이프라인객체변수 = Pipeline([ (객체변수1, 객체생성메서드1),(객체변수2, 객체생성메서드2), ...  ])
pipe = Pipeline([ ('scale', StandardScaler()), 
                  ('model_dt', DecisionTreeClassifier(max_depth=3)) ])
print(pipe)

#파이프라인 이용 학습 
pipe.fit(X_train, y_train)

#성능 평가
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

#예측
print(pipe.predict(X_test))

#파이프라인의 파라미터 정보
print(pipe.get_params())

#F1 score
print(f1_score(y_test, pipe.predict(X_test), average='macro'))


Pipeline(steps=[('scale', StandardScaler()),
                ('model_dt', DecisionTreeClassifier(max_depth=3))])
0.9821428571428571
0.9736842105263158
[2 0 2 1 0 2 0 0 1 1 2 2 2 1 1 0 0 0 0 2 2 1 1 2 1 2 0 1 1 0 0 0 2 1 2 0 2
 2]
{'memory': None, 'steps': [('scale', StandardScaler()), ('model_dt', DecisionTreeClassifier(max_depth=3))], 'verbose': False, 'scale': StandardScaler(), 'model_dt': DecisionTreeClassifier(max_depth=3), 'scale__copy': True, 'scale__with_mean': True, 'scale__with_std': True, 'model_dt__ccp_alpha': 0.0, 'model_dt__class_weight': None, 'model_dt__criterion': 'gini', 'model_dt__max_depth': 3, 'model_dt__max_features': None, 'model_dt__max_leaf_nodes': None, 'model_dt__min_impurity_decrease': 0.0, 'model_dt__min_impurity_split': None, 'model_dt__min_samples_leaf': 1, 'model_dt__min_samples_split': 2, 'model_dt__min_weight_fraction_leaf': 0.0, 'model_dt__random_state': None, 'model_dt__splitter': 'best'}
0.9731615673644659


### 스케일 + Polynominal + 모델화 

In [10]:
#농어의 길이와 몸무게

perch_length = np.array(
    [8.4, 13.7, 15.0, 16.2, 17.4, 18.0, 18.7, 19.0, 19.6, 20.0, 
     21.0, 21.0, 21.0, 21.3, 22.0, 22.0, 22.0, 22.0, 22.0, 22.5, 
     22.5, 22.7, 23.0, 23.5, 24.0, 24.0, 24.6, 25.0, 25.6, 26.5, 
     27.3, 27.5, 27.5, 27.5, 28.0, 28.7, 30.0, 32.8, 34.5, 35.0, 
     36.5, 36.0, 37.0, 37.0, 39.0, 39.0, 39.0, 40.0, 40.0, 40.0, 
     40.0, 42.0, 43.0, 43.0, 43.5, 44.0]
     ).reshape(-1, 1)

perch_weight = np.array(
    [5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 
     110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 
     130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 
     197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 
     514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 
     820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 
     1000.0, 1000.0]
     ).reshape(-1, 1)

In [None]:
#파이프라인 없이 모델 구축 후 평가
#스케일링 -> 데이터 분리 -> 다항식 이용 -> 모델 만들어서 학습 -> 성능평가

In [11]:
#스케일링
scale = StandardScaler()
scale.fit(perch_length)
X_scale = scale.transform(perch_length)
y_scale = scale.transform(perch_weight)

#데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X_scale, y, random_state=111)
X_train, X_test, y_train, y_test = train_test_split(X_scale, y_scale, random_state=111)

#다항식 이용
poly = PolynomialFeatures(degree=3, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

#모델 생성
model_lr = LinearRegression()
model_lr.fit(X_train_poly, y_train)
print()

#평가
print(model_lr.score(X_test_poly, y_test))
print()
print(model_lr.predict(X_test_poly))



0.9369469361716163

[[ 13.96546946]
 [ 10.96993168]
 [ 10.96993168]
 [ 94.08254146]
 [ 10.96993168]
 [ 64.8561176 ]
 [ 39.47073516]
 [ 14.93081859]
 [ 33.7036993 ]
 [ 16.40618757]
 [ 28.73506637]
 [ 28.73506637]
 [ 18.27519618]
 [112.77435527]]


In [13]:
#파이프라인 이용

#데이터 분리
X_train, X_test, y_train, y_test = train_test_split(perch_length, perch_weight, random_state=111)

#파이프라인 구축
pipe2 = Pipeline([ ('scale', StandardScaler()), 
                      ('poly', PolynomialFeatures(degree=3, include_bias=False)),
                      ('model_lr', LinearRegression()) ])

#파이프라인의 파라미터 정보
print(pipe2.get_params())

#파이프라인 이용 학습
pipe2.fit(X_train, y_train)

#성능 평가
print(pipe2.score(X_test, y_test))

#예측
print(pipe2.predict(X_test))


{'memory': None, 'steps': [('scale', StandardScaler()), ('poly', PolynomialFeatures(degree=3, include_bias=False)), ('model_lr', LinearRegression())], 'verbose': False, 'scale': StandardScaler(), 'poly': PolynomialFeatures(degree=3, include_bias=False), 'model_lr': LinearRegression(), 'scale__copy': True, 'scale__with_mean': True, 'scale__with_std': True, 'poly__degree': 3, 'poly__include_bias': False, 'poly__interaction_only': False, 'poly__order': 'C', 'model_lr__copy_X': True, 'model_lr__fit_intercept': True, 'model_lr__n_jobs': None, 'model_lr__normalize': False, 'model_lr__positive': False}
0.9369469361716162
[[ 152.75469172]
 [ 125.97232365]
 [ 125.97232365]
 [ 869.06176731]
 [ 125.97232365]
 [ 607.75548538]
 [ 380.79101196]
 [ 161.38564135]
 [ 329.22935988]
 [ 174.57655325]
 [ 284.8060324 ]
 [ 284.8060324 ]
 [ 191.28690043]
 [1036.18068663]]


# 퀴즈 

보스턴 주택 가격데이타를 파이프라인 구축 후 테스트하여라
    
    PolynomialFeatures + 스케일링 + 선형회귀

In [14]:
from sklearn.datasets import load_boston

boston = load_boston()
boston_df = pd.DataFrame(boston.data , columns = boston.feature_names)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score

In [17]:
# y 데이타 
boston_df['PRICE'] = boston.target
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [22]:
y_target = boston_df['PRICE']
X_data = boston_df.drop(['PRICE'], axis=1, inplace=False)

X_train , X_test , y_train , y_test = train_test_split( X_data , y_target , \
                                                       test_size=0.3, random_state=156)

In [23]:
# 학습용데이타 X 에 PolynomialFeatures 적용 
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

In [24]:
# 선형 모델 생성 후 학습 
model_lr = LinearRegression()
model_lr.fit(X_train_poly, y_train)

print(model_lr.score(X_train_poly, y_train))
print(model_lr.score(X_test_poly, y_test))

1.0
-1116.5979172270693


In [25]:
#파이프라인 구축
pipe3 = Pipeline([ ('scale', StandardScaler()), 
                      ('poly', PolynomialFeatures(degree=3, include_bias=False)),
                      ('model_lr', LinearRegression()) ])

In [26]:
#파이프라인의 파라미터 정보
print(pipe3.get_params())

{'memory': None, 'steps': [('scale', StandardScaler()), ('poly', PolynomialFeatures(degree=3, include_bias=False)), ('model_lr', LinearRegression())], 'verbose': False, 'scale': StandardScaler(), 'poly': PolynomialFeatures(degree=3, include_bias=False), 'model_lr': LinearRegression(), 'scale__copy': True, 'scale__with_mean': True, 'scale__with_std': True, 'poly__degree': 3, 'poly__include_bias': False, 'poly__interaction_only': False, 'poly__order': 'C', 'model_lr__copy_X': True, 'model_lr__fit_intercept': True, 'model_lr__n_jobs': None, 'model_lr__normalize': False, 'model_lr__positive': False}


In [29]:
#파이프라인 이용 학습
pipe3.fit(X_train, y_train)

#성능 평가
print(pipe3.score(X_test, y_test))
print(pipe3.score(X_train, y_train))

#예측
print(pipe3.predict(X_test))

-464.1830001411799
1.0
[ 3.68537574e+01  1.55173593e+01  3.56286071e+01  2.17320161e+01
  1.20703435e+01  2.34008464e+01 -6.25374031e+00  3.05512892e+01
  3.15282856e+00  3.52483266e+01  2.92103439e+01 -5.17231495e+01
  3.00492888e+01  3.18250412e+01  6.36554599e+01  2.24747144e+01
  2.23553370e+03  6.99985515e+00  4.22498334e+01  2.65444115e+01
  2.53914810e+01  6.02779668e+01  2.81217789e+01  1.76345904e+01
  3.31033461e+01  2.98406660e+01 -7.50738149e+00  4.49112641e+01
  1.67832365e+01  3.73362636e+01  3.06008544e+01  1.71183004e+01
  1.61025875e+01  2.34543028e+01  2.36033504e+01  9.89474553e+00
  1.41936045e+01 -2.66089850e+00  6.76040486e+00  2.55453880e+01
  3.70107806e+01 -4.82199232e+01 -5.11882049e-01 -8.11212667e+01
  2.05075784e+01  1.50893691e+01  2.44024387e+01  1.18797176e+01
  3.32083282e+01  8.89477767e+00  3.97092333e+01  6.54499378e+00
  8.29095651e+01  1.91340061e+01  2.37952695e+01  2.70550342e+01
  1.28105253e+01  2.91456642e+01  1.39681743e+01  9.98082591e+00
  