In [0]:
import numpy as np
import pandas as pd
import os

In [0]:
#XGboost
from xgboost import XGBRegressor

#SVM
from sklearn.svm import SVR
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, cross_validate

#metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

import time

#Decision Tree 
import matplotlib.pyplot as pl
from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import validation_curve

In [0]:
os.chdir('C:/Users/JAEMIN/Desktop/DeepLearning/TEAM_PROJECT/data')

In [0]:
pca2d = pd.read_csv('pca_2D.csv')
pca3d = pd.read_csv('pca_3D.csv')
FA = pd.read_csv('Selected_var(FA_corr).csv', encoding = 'CP949')

In [0]:
X, y = pca2d.iloc[:,:2], pca2d.iloc[:,2]

X_train, X_test = x.iloc[:int(len(x)*0.7)], x.iloc[int(len(x)*0.7):]
y_train, y_test = y.iloc[:int(len(y)*0.7)], y.iloc[int(len(y)*0.7):]

In [0]:
def svr_model_lr(X_train, y_train, X_test, y_test):
    tic=time.time()
    #best parameter extraction
    gsc = GridSearchCV(
        estimator=SVR(kernel='linear'),
        param_grid={
            'C': [0.1, 1, 100, 1000],
            'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]},
        cv=10, scoring='r2', verbose=1, n_jobs=-1)

    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    
    #best model
    best_svr = SVR(kernel='rbf', C=best_params["C"], epsilon=best_params["epsilon"], gamma='auto',
                   coef0=0.1, shrinking=True,
                   tol=0.001, cache_size=200, verbose=False, max_iter=10000)
    
    #Fitting based on best model
    best_svr.fit(X_train, y_train)
    y_pred = best_svr.predict(X_test)
    
    #socres 
    global R_svr_li, MAE_svr_li, RMSE_svr_li
    R_svr_li = round(best_svr.score(X_test,y_test),3)
    MAE_svr_li = round(mean_absolute_error(y_test,y_pred),3)
    RMSE_svr_li = round(np.sqrt(mean_squared_error(y_test,y_pred)),3)
    toc=time.time()    
    return ('Rsquare:',best_svr.score(X_test,y_test),
            'MAE:',mean_absolute_error(y_test,y_pred),
            'RMSE:',np.sqrt(mean_squared_error(y_test,y_pred)),
            'Time:',toc-tic)

In [0]:
svr_model_lr(X_train, y_train, X_test, y_test)

Fitting 10 folds for each of 44 candidates, totalling 440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 440 out of 440 | elapsed:   24.2s finished


('Rsquare:',
 -1.2415687621506226,
 'MAE:',
 0.13828133356942934,
 'RMSE:',
 0.1693432428490558,
 'Time:',
 24.33316469192505)

In [0]:
def svr_model_rbf(X_train, y_train, X_test, y_test):
    gsc = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid={
            'C': [0.1, 1, 100, 1000],
            'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
            'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]},
        cv=10, scoring='r2', verbose=0, n_jobs=-1)

    grid_result = gsc.fit(X_train, y_train)
    best_params = grid_result.best_params_
    best_svr_rbf = SVR(kernel='rbf', C=best_params["C"], epsilon=best_params["epsilon"], gamma=best_params["gamma"],
                   coef0=0.1, shrinking=True,
                   tol=0.001, cache_size=200, verbose=False, max_iter=10000)
    
    #Fitting based on best model
    best_svr_rbf.fit(X_train, y_train)
    y_pred = best_svr_rbf.predict(X_test)
    
    #socres 
    global R_svr_rbf, MAE_svr_rbf, RMSE_svr_rbf
    R_svr_rbf = round(best_svr_rbf.score(X_test,y_test),3)
    MAE_svr_rbf = round(mean_absolute_error(y_test,y_pred),3)
    RMSE_svr_rbf = round(np.sqrt(mean_squared_error(y_test,y_pred)),3)
        
    return ('Rsquare:',best_svr_rbf.score(X_test,y_test),
            'MAE:',mean_absolute_error(y_test,y_pred),
            'RMSE:',np.sqrt(mean_squared_error(y_test,y_pred)))

In [0]:
svr_model_rbf(X_train, y_train, X_test, y_test)

('Rsquare:',
 -1.3644556121006297,
 'MAE:',
 0.13680645187312737,
 'RMSE:',
 0.17392316176169376)

In [0]:
def xgb_reg(X_train,y_train,X_test,y_test):
  #최적의 파라미터 추출
    xgb_param_grid={
        'n_estimator' : list(range(1000,2000,100)),
        'learning_rate' : list(np.arange(0.0260,0.031, 0.0001)),
        'max_depth' : list(range(1,4,1))}
    xgb = GridSearchCV(XGBRegressor(),
                        param_grid=xgb_param_grid,
                        n_jobs=-1,
                        verbose=1,
                        scoring='r2')
    xgb_result = xgb.fit(X_train,y_train)
    best_params = xgb_result.best_params_
    
    #최적의 모델
    best_xgb = XGBRegressor(learning_rate=best_params["learning_rate"], 
                           max_depth=best_params["max_depth"], 
                           n_estimators=best_params["n_estimator"],
                           verbose=1, 
                           max_iter=10000)
    #최적의 모델로 데이터 핏팅
    best_xgb.fit(X_train, y_train)
    y_pred = best_xgb.predict(X_test)

    #점수 저장 
    global R_xgb, MAE_xgb, RMSE_xgb
    R_xgb = round(best_xgb.score(X_test,y_test),3)
    MAE_xgb = round(mean_absolute_error(y_test,y_pred),3)
    RMSE_xgb = round(np.sqrt(mean_squared_error(y_test,y_pred)),3)
    
    return (best_xgb.score(X_test,y_test),
            mean_absolute_error(y_test,y_pred),
            np.sqrt(mean_squared_error(y_test,y_pred)))

In [0]:
xgb_reg(X_train,y_train,X_test,y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1530 candidates, totalling 4590 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1343 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 3843 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 4590 out of 4590 | elapsed:   11.6s finished


(-1.6940057382361675, 0.15076597690024401, 0.18564835159100754)

In [0]:
print('====SVR kernel linear====')
print('MAE:', MAE_svr_li)
print('RMSE:', RMSE_svr_li)
print('R-squared', R_svr_li)

print('====SVR kernel rbf====')
print('MAE:', MAE_svr_rbf)
print('RMSE:', RMSE_svr_rbf)
print('R-squared', R_svr_rbf)

print('====XgBoost====')
print('MAE:', MAE_xgb)
print('RMSE:', RMSE_xgb)
print('R-squared', R_xgb)

====SVR kernel linear====
MAE: 0.138
RMSE: 0.169
R-squared -1.242
====SVR kernel rbf====
MAE: 0.137
RMSE: 0.174
R-squared -1.364
====XgBoost====
MAE: 0.151
RMSE: 0.186
R-squared -1.694


In [0]:
X, y = pca3d.iloc[:,:3], pca3d.iloc[:,3]

X_train, X_test = x.iloc[:int(len(x)*0.7)], x.iloc[int(len(x)*0.7):]
y_train, y_test = y.iloc[:int(len(y)*0.7)], y.iloc[int(len(y)*0.7):]

In [0]:
svr_model_lr(X_train, y_train, X_test, y_test)

Fitting 10 folds for each of 44 candidates, totalling 440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 440 out of 440 | elapsed:   18.2s finished


('Rsquare:',
 -1.2415687621506226,
 'MAE:',
 0.13828133356942934,
 'RMSE:',
 0.1693432428490558,
 'Time:',
 18.284090995788574)

In [0]:
svr_model_rbf(X_train, y_train, X_test, y_test)

('Rsquare:',
 -1.3644556121006297,
 'MAE:',
 0.13680645187312737,
 'RMSE:',
 0.17392316176169376)

In [0]:
xgb_reg(X_train,y_train,X_test,y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1530 candidates, totalling 4590 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1360 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 3360 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 4590 out of 4590 | elapsed:   12.8s finished


(-1.6940057382361675, 0.15076597690024401, 0.18564835159100754)

In [0]:
print('====SVR kernel linear====')
print('MAE:', MAE_svr_li)
print('RMSE:', RMSE_svr_li)
print('R-squared', R_svr_li)

print('====SVR kernel rbf====')
print('MAE:', MAE_svr_rbf)
print('RMSE:', RMSE_svr_rbf)
print('R-squared', R_svr_rbf)

print('====XgBoost====')
print('MAE:', MAE_xgb)
print('RMSE:', RMSE_xgb)
print('R-squared', R_xgb)

====SVR kernel linear====
MAE: 0.138
RMSE: 0.169
R-squared -1.242
====SVR kernel rbf====
MAE: 0.137
RMSE: 0.174
R-squared -1.364
====XgBoost====
MAE: 0.151
RMSE: 0.186
R-squared -1.694


In [0]:
X, y = FA.iloc[:,:5], FA.iloc[:,5]

X_train, X_test = x.iloc[:int(len(x)*0.7)], x.iloc[int(len(x)*0.7):]
y_train, y_test = y.iloc[:int(len(y)*0.7)], y.iloc[int(len(y)*0.7):]

In [0]:
svr_model_lr(X_train, y_train, X_test, y_test)

Fitting 10 folds for each of 44 candidates, totalling 440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 440 out of 440 | elapsed:   36.3s finished


('Rsquare:',
 -14.848715869938005,
 'MAE:',
 0.35683753024997766,
 'RMSE:',
 0.45028645060663064,
 'Time:',
 36.54955840110779)

In [0]:
svr_model_rbf(X_train, y_train, X_test, y_test)

('Rsquare:',
 -1.3644556121006297,
 'MAE:',
 0.13680645187312737,
 'RMSE:',
 0.17392316176169376)

In [0]:
xgb_reg(X_train,y_train,X_test,y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1530 candidates, totalling 4590 fits


[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1024 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 2524 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 4590 out of 4590 | elapsed:   13.0s finished


(-1.6940057382361675, 0.15076597690024401, 0.18564835159100754)

In [0]:
print('====SVR kernel linear====')
print('MAE:', MAE_svr_li)
print('RMSE:', RMSE_svr_li)
print('R-squared', R_svr_li)

print('====SVR kernel rbf====')
print('MAE:', MAE_svr_rbf)
print('RMSE:', RMSE_svr_rbf)
print('R-squared', R_svr_rbf)

print('====XgBoost====')
print('MAE:', MAE_xgb)
print('RMSE:', RMSE_xgb)
print('R-squared', R_xgb)

====SVR kernel linear====
MAE: 0.357
RMSE: 0.45
R-squared -14.849
====SVR kernel rbf====
MAE: 0.137
RMSE: 0.174
R-squared -1.364
====XgBoost====
MAE: 0.151
RMSE: 0.186
R-squared -1.694
