# 2. Cars93 데이터로 SVM과 인공신경망 모델을 이용해 회귀분석 후 R2와 RMSE를 구하고 비교하세요 > Price 예측

In [33]:
import pandas as pd

# Load the Cars93 dataset (path is relative to the current working directory).
df = pd.read_csv('data/Cars93.csv')

# 1. Preprocessing
# 1-1) Missing values
# Exploration used to choose the imputation strategy (kept for reference):
#   print(df.isnull().sum())                     # author's note: Rear.seat.room
#   print(df['Rear.seat.room'].value_counts())
# Rear.seat.room is not categorical, but it is imputed with its most frequent
# value (the first index of value_counts()).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which pandas documents as not updating the parent frame under
# Copy-on-Write.
rear_seat_mode = df['Rear.seat.room'].value_counts().index[0]
df['Rear.seat.room'] = df['Rear.seat.room'].fillna(rear_seat_mode)

# 1-2) Drop unneeded columns and one-hot encode the categorical ones
# Exploration (value_counts() per column) led to these decisions:
#   Manufacturer, Model, Make                         -> dropped
#   Type, AirBags, DriveTrain, Cylinders,
#   Man.trans.avail, Origin                           -> categorical, encoded
categorical_columns = ['Type', 'AirBags', 'DriveTrain',
                       'Cylinders', 'Man.trans.avail', 'Origin']

df = df.drop(columns=['Manufacturer', 'Model', 'Make'])
# `columns=` must be passed by keyword: the second positional parameter of
# pd.get_dummies is `prefix`, so a positional list only renames the dummy
# columns and leaves the choice of encoded columns to dtype inference.
df = pd.get_dummies(df, columns=categorical_columns)

# Alternative considered by the author (encode every variable, drop nothing):
#   df = pd.get_dummies(df, columns=['Manufacturer', 'Model', 'Type', 'AirBags',
#                                    'DriveTrain', 'Cylinders', 'Man.trans.avail',
#                                    'Rear.seat.room', 'Origin', 'Make'])
# 3. Train/test split and feature scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Price is the regression target; every remaining column is a feature.
y = df['Price']
X = df.drop(columns=['Price'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2022
)

# Report the shape of each split, one per line.
for _part in (X_train, y_train, X_test, y_test):
    print(_part.shape)

# Alternative tried by the author: scaler = MinMaxScaler()
#   (author's note: "SVR : 2.X, MLP : 5.X" -- presumably the resulting RMSE)
# Author's note for StandardScaler: "SVR : 3.X MLP : 3.X".
scaler = StandardScaler()
# Fit the scaler on the training features only, then apply it to both splits.
scaler.fit(X_train)
X_train_Scaled = scaler.transform(X_train)
X_test_Scaled = scaler.transform(X_test)

(65, 39)
(65,)
(28, 39)
(28,)


In [34]:
# 1. SVR
import warnings
# NOTE: this silences every warning for the rest of the session, not only the
# ones raised in this cell.
warnings.filterwarnings('ignore')

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

svr = SVR()
param_grid = {'C': [1, 3, 10, 30, 100], 'gamma': [0.03, 0.1, 0.3, 1, 3]}

# Grid search on the raw (unscaled) features
model_not_scaled = GridSearchCV(svr, param_grid, cv=5)
model_not_scaled.fit(X_train, y_train)
y_pred_not_scaled = model_not_scaled.predict(X_test)

# Grid search on the scaled features; cv is spelled out so both searches
# visibly use the same 5-fold setup.
model_scaled = GridSearchCV(svr, param_grid, cv=5)
model_scaled.fit(X_train_Scaled, y_train)
y_pred_scaled = model_scaled.predict(X_test_Scaled)

# Best parameter combination of the scaled search, followed by the full grid
# ranked by mean cross-validation score (best first).
print('SVR_best_estimator_ : ', model_scaled.best_estimator_)
best_estimator_result = pd.DataFrame(model_scaled.cv_results_['params'])
best_estimator_result['mean_test_score'] = model_scaled.cv_results_['mean_test_score']
best_estimator_result = best_estimator_result.sort_values(['mean_test_score'], ascending=False)
print(best_estimator_result)

# The labels say SVR (the regressor actually fitted), not SVC (the classifier).
# No `scoring` is passed to GridSearchCV, so score() falls back to the
# estimator's own score method (R^2 for scikit-learn regressors).
print('---------------------')
print('SVR_not_scaled_train_score : ', model_not_scaled.score(X_train, y_train))
print('SVR_scaled_train_score : ', model_scaled.score(X_train_Scaled, y_train))
print('---------------------')
print('SVR_not_scaled_test_score : ', model_not_scaled.score(X_test, y_test))
print('SVR_scaled_test_score : ', model_scaled.score(X_test_Scaled, y_test))
print('---------------------')
# RMSE is the square root of the mean squared error on the test split.
print('SVR_not_scaled_RMSE : ', np.sqrt(mean_squared_error(y_test, y_pred_not_scaled)))
print('SVR_scaled_RMSE : ', np.sqrt(mean_squared_error(y_test, y_pred_scaled)))

SVR_best_estimator_ :  SVR(C=100, gamma=0.03)
      C  gamma  mean_test_score
20  100   0.03         0.681497
15   30   0.03         0.675304
10   10   0.03         0.600031
5     3   0.03         0.444111
21  100   0.10         0.312213
16   30   0.10         0.308492
0     1   0.03         0.286130
11   10   0.10         0.277126
6     3   0.10         0.172505
1     1   0.10         0.062701
12   10   0.30         0.036193
17   30   0.30         0.015691
22  100   0.30         0.007782
7     3   0.30        -0.004992
2     1   0.30        -0.047464
13   10   1.00        -0.061664
8     3   1.00        -0.063264
9     3   3.00        -0.067987
14   10   3.00        -0.070665
3     1   1.00        -0.073806
4     1   3.00        -0.076134
18   30   1.00        -0.094373
19   30   3.00        -0.104211
23  100   1.00        -0.105874
24  100   3.00        -0.115854
---------------------
SVC_not_scaled_train_score :  0.31593967332921713
SVC_scaled_train_score :  0.9998864244809355
-----

In [35]:
# 2. MLP
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# The author notes that random_state must be set for the MLP.
mlp = MLPRegressor(random_state=42)
# Earlier grid tried by the author: [40, 50, 64, 72].
# Author's rule of thumb: if the best estimator sits on an end value of the
# grid, the search range needs to be adjusted.
param_grid = {'hidden_layer_sizes': [300, 320, 350, 400]}

# Grid search on the raw (unscaled) features
mlp_not_scaled = GridSearchCV(mlp, param_grid, cv=5).fit(X_train, y_train)
y_pred_not_scaled = mlp_not_scaled.predict(X_test)

# Grid search on the scaled features
mlp_scaled = GridSearchCV(mlp, param_grid, cv=5).fit(X_train_Scaled, y_train)
y_pred_scaled = mlp_scaled.predict(X_test_Scaled)

# Collect every metric first so the reporting section below is print-only.
train_score_not_scaled = mlp_not_scaled.score(X_train, y_train)
train_score_scaled = mlp_scaled.score(X_train_Scaled, y_train)
test_score_not_scaled = mlp_not_scaled.score(X_test, y_test)
test_score_scaled = mlp_scaled.score(X_test_Scaled, y_test)
rmse_not_scaled = np.sqrt(mean_squared_error(y_test, y_pred_not_scaled))
rmse_scaled = np.sqrt(mean_squared_error(y_test, y_pred_scaled))

divider = '---------------------'
print('MLP_best_estimator_ : ', mlp_scaled.best_estimator_)
print(divider)
print('MLP_not_scaled_train_score : ', train_score_not_scaled)
print('MLP_scaled_train_score : ', train_score_scaled)
print(divider)
print('MLP_not_scaled_test_score : ', test_score_not_scaled)
print('MLP_scaled_test_score : ', test_score_scaled)
print(divider)
print('MLP_not_scaled_RMSE : ', rmse_not_scaled)
print('MLP_scaled_RMSE : ', rmse_scaled)

MLP_best_estimator_ :  MLPRegressor(hidden_layer_sizes=400, random_state=42)
---------------------
MLP_not_scaled_train_score :  -44331941253.26358
MLP_scaled_train_score :  0.9564692357295236
---------------------
MLP_not_scaled_test_score :  -32827932719.19108
MLP_scaled_test_score :  0.8575240309914612
---------------------
MLP_not_scaled_RMSE :  1831685.8154684517
MLP_scaled_RMSE :  3.815926707465746
