In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from scipy.stats import randint
from scipy.stats import uniform
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
def model_scaler(data, col, scaler = None):

  '''
  Split a dataframe into features / target and optionally scale the features.

  data   : dataframe holding both the feature columns and the target column
  col    : name of the target column (e.g. 'P_PRICE'); it is dropped from the
           features and returned separately
  scaler : one of 'standard', 'robust', 'minmax', 'log', or None
           None leaves the features untouched.

  Returns a (features, target) tuple. For the sklearn scalers `features` is
  whatever `fit_transform` returns; for 'log' and None it is the result of
  `np.log1p(features)` / the original feature frame respectively.

  Raises ValueError when `scaler` is not one of the supported names.
  '''

  features = data.drop(col, axis=1)
  target = data[col]

  # No scaling requested: hand the raw features back instead of implicitly
  # returning None, which would break tuple-unpacking in the caller.
  if scaler is None:
    return features, target

  if scaler == 'log':
    # log(1 + x), applied element-wise.
    return np.log1p(features), target

  # The scaler classes are looked up only in the branch that needs them.
  if scaler == 'standard':
    transformer = StandardScaler()
  elif scaler == 'robust':
    transformer = RobustScaler()
  elif scaler == 'minmax':
    transformer = MinMaxScaler()
  else:
    raise ValueError(
        "Unknown scaler {!r}; expected 'standard', 'robust', 'minmax', 'log' or None".format(scaler))

  return transformer.fit_transform(features), target

################################################################################################################################################

def _report_random_search(estimator, params, features, target, random_state = 0):

  '''
  Run a randomized hyper-parameter search and print the best result.

  estimator    : unfitted regressor to tune
  params       : dict of parameter name -> list of values or scipy distribution
  features     : feature matrix passed to `fit`
  target       : target values passed to `fit`
  random_state : seed for the parameter sampling, so reruns draw the same candidates

  The search uses 5-fold cross-validation, 10 sampled candidates and the
  'neg_mean_squared_error' scorer. Nothing is returned; the best parameters
  and the square root of the best (sign-flipped) score are printed.
  '''

  search = RandomizedSearchCV(estimator, param_distributions = params, cv = 5,
                              scoring = 'neg_mean_squared_error', n_iter = 10,
                              n_jobs = -1, random_state = random_state)
  search.fit(features, target)

  print('Best Params:', search.best_params_)
  # best_score_ is a negated MSE, so flip the sign before taking the root.
  print('Best Score:', np.sqrt(-1 * search.best_score_))


def model_train(data, col, scaler, model = None):

  '''
  Scale the data, then evaluate or tune one regression model and print its RMSE.

  data   : dataframe holding the features and the target column
  col    : name of the target column (e.g. 'P_PRICE')
  scaler : scaler name, forwarded unchanged to model_scaler
  model  : linear, ridge, lasso, elastic, decisiontree,
           randomforest, ada, gradient, xgb, lgbm
           (the legacy spelling 'gradinet' is still accepted for 'gradient')

  'linear' is scored directly with 5-fold cross-validation; every other model
  goes through a randomized hyper-parameter search. Results are printed and
  the function returns None.

  Raises ValueError when `model` is not one of the supported names.
  '''

  # All scoring is cross-validated on the full data, so no hold-out split is
  # made here (the previous split used test_size=0 and its outputs were unused).
  features, target = model_scaler(data, col, scaler)

  if model == 'linear':
    neg_mse_scores = cross_val_score(LinearRegression(), features, target,
                                     scoring = 'neg_mean_squared_error', cv = 5)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)

    print('RMSE : {:.4f}'.format(avg_rmse))
    return

  # Continuous hyper-parameters use scipy's uniform(loc, scale), which samples
  # from [loc, loc + scale]; randint only yields integers.
  if model == 'ridge':
    estimator = Ridge(random_state=0)
    # 'normalize' is not searched: it no longer exists on recent scikit-learn
    # linear models and scaling is already handled by model_scaler.
    params = {
              'alpha': randint(0, 10),
              'fit_intercept': [True, False],
              }

  elif model == 'lasso':
    estimator = Lasso(random_state=0)
    params = {
              # starts at 1: scikit-learn advises against alpha=0 for Lasso
              'alpha': randint(1, 10),
              'fit_intercept': [True, False],
              }

  elif model == 'elastic':
    estimator = ElasticNet(random_state=0)
    params = {
              'alpha': randint(1, 10),
              'l1_ratio': uniform(0, 1),
              'fit_intercept': [True, False],
              }

  elif model == 'decisiontree':
    estimator = DecisionTreeRegressor(random_state=0)
    # A single tree has no n_estimators, and min_samples_split must be >= 2.
    params = {
              'max_depth': randint(10, 100),
              'min_samples_leaf': randint(1, 10),
              'min_samples_split': randint(2, 10),
              'max_leaf_nodes': randint(2, 10)
              }

  elif model == 'randomforest':
    estimator = RandomForestRegressor(random_state=0)
    params = {
              'max_depth': randint(30, 100),
              'n_estimators': randint(100, 1000),
              'min_samples_leaf': randint(1, 10),
              'min_samples_split': randint(2, 10),
              'max_leaf_nodes': randint(2, 10)
              }

  elif model == 'ada':
    estimator = AdaBoostRegressor(random_state=0)
    params = {'n_estimators' : randint(30, 1000),
              'learning_rate' : uniform(0.01, 0.99)}

  elif model in ('gradient', 'gradinet'):
    estimator = GradientBoostingRegressor(random_state=0)
    params = {'n_estimators' : randint(30, 1000),
              'learning_rate' : uniform(0.01, 0.99),
              'subsample' : uniform(0.01, 0.99),
              'min_samples_split' : randint(2, 10),
              'max_depth' : randint(3, 10),
              }

  elif model == 'xgb':
    estimator = XGBRegressor(random_state=0)
    params = {'n_estimators' : randint(30, 1000),
              'learning_rate' : uniform(0.01, 0.99),
              'max_depth' : randint(1, 10),
              'min_child_weight' : randint(2, 10),
              }

  elif model == 'lgbm':
    # num_leaves / min_child_samples are LightGBM parameters, so the estimator
    # must be LGBMRegressor (it was previously XGBRegressor by mistake).
    estimator = LGBMRegressor(random_state=0)
    params = {'n_estimators' : randint(30, 1000),
              'learning_rate' : uniform(0.01, 0.99),
              'max_depth' : randint(-1, 10),
              'min_child_weight' : uniform(0.001, 0.999),
              'num_leaves': randint(3, 100),
              'min_child_samples': randint(1, 100)
              }

  else:
    raise ValueError(
        "Unknown model {!r}; expected one of: linear, ridge, lasso, elastic, "
        "decisiontree, randomforest, ada, gradient, xgb, lgbm".format(model))

  _report_random_search(estimator, params, features, target)