In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from optuna import Trial
import optuna
from optuna.samplers import TPESampler

### 1. 무지성 인코딩

In [2]:

'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 전체 컬럼 데이터 타입 확인 후 통째로 범주형 인코딩 처리
    - param: df
    - return: encoded columns dataframe
'''
def one_hot(df):
    # 컬럼의 이름 리스트로 뽑아오기
    columns_name_list = list(df.columns)

    # 컬럼마다 for문 반복
    for col_name in columns_name_list:
        # 만약 컬럼의 값 타입이 범주형이면
        if df[col_name].dtype == object or df[col_name].dtype == str:
            # print(col_name)

            # # 컬럼의 유니크한 값을 리스트로 만들어둠
            col_items = df[col_name].unique().tolist()
            # print(col_items)

            onehot = OneHotEncoder(sparse=False)
            # print(col_name)
            onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1))
            onehot_encoded_label = onehot.categories_[0]
            onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)
            # print(onehot_encoded_df)
            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, onehot_encoded_df], axis=1)
        else:
            if col_name == '거래금액':
                continue
            scaled_label = [col_name]
            x = df[col_name].values.reshape(-1, 1) #returns a numpy array
            min_max_scaler = preprocessing.MinMaxScaler()
            x_scaled = min_max_scaler.fit_transform(x)
            scaled_df = pd.DataFrame(x_scaled, columns=scaled_label)

            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, scaled_df], axis=1)
    
    return df

'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 새로운 데이터 컬럼 추가 시 별도 인코딩 결과만 출력할 때 사용 ㄱㄱ
    - param: df, col_name
    - return: encoded columns dataframe or series
'''
def encode_column(df, col_name):

    # column data type이 object 또는 str 즉 범주형일 경우
    # onehot 인코딩 수행된 데이터프레임 return
    if df[col_name].dtype == object or df[col_name].dtype == str:
        onehot = OneHotEncoder(sparse=False)

        onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1))
        onehot_encoded_label = onehot.categories_[0]
        onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)

        return onehot_encoded_df

    # column data type이 나머지 타입일 경우
    # 해당 컬럼의 series return
    else:
        return df[col_name]

In [7]:
# 1-1 test one_hot function
sample_df = pd.read_csv('C:/Projects/python-workspace/Machine Learning/Machine learning project/team4 eda prj/Team4_EDA_ML_Prj/dataset/sample.csv', encoding='utf-16')
one_hot_df = one_hot(sample_df)
one_hot_df.head()
# print(len(one_hot_df.columns))

Unnamed: 0.1,거래금액,Unnamed: 0,전용면적,계약년월,층,건축년도,강남구,강동구,강북구,강서구,...,황학동,회기동,회현동1가,회현동2가,효제동,효창동,후암동,휘경동,흑석동,흥인동
0,130000,0.0,0.181669,0.0,0.097222,0.433333,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,117000,4e-06,0.181669,0.0,0.069444,0.433333,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,130000,8e-06,0.181669,0.0,0.055556,0.433333,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,139500,1.2e-05,0.181669,0.009479,0.069444,0.433333,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,107500,1.7e-05,0.116767,0.014218,0.111111,0.433333,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2. 모델링

In [8]:
'''
    - 카테고리: 모델링
    - 개요: 머신러닝 모델링 수행 및 점수 도출
        - 교차 검증 방법으로 TimeSeriesSplit 수행
    - param: 

        1. model_tuple => ex. ('LR', LinearRegression())
        2. X_train, y_train, X_test, y_test

    - return: rmse
'''
def execute_modeling(model_tuple, X_train, y_train, X_test, y_test):

    name = model_tuple[0]
    model = model_tuple[1]

    # 각 모델에 대하여 실질적 학습 수행
    clf = model.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # 각 모델의 rmse 점수 도출 
    rmse = mean_squared_error(y_test, pred) ** 0.5
    print(f'{name} rmse: {rmse}')

    # TimeSeries Cross validation 
    tscv = TimeSeriesSplit(n_splits=15)

    # 각 모델에 대하여 교차 검증한 결과 점수 확인
    # scoring parameter option 어캐 줘야 함?
    cv_results = cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error')
    cv_results = np.sqrt(-cv_results)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

    return rmse

In [11]:
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

# 2. test execute_modeling function
preprocessed_train_df = one_hot_df.drop('Unnamed: 0', axis=1)
preprocessed_train_df = preprocessed_train_df.drop_duplicates()

X_train = preprocessed_train_df[preprocessed_train_df['계약년월'] < 0.6].drop(['거래금액'], axis=1) 
y_train = preprocessed_train_df[preprocessed_train_df['계약년월'] < 0.6]['거래금액'] 
X_test = preprocessed_train_df[preprocessed_train_df['계약년월'] >= 0.6].drop(['거래금액'], axis=1)
y_test = preprocessed_train_df[preprocessed_train_df['계약년월'] >= 0.6]['거래금액']

model_list = [
                ('LR', LinearRegression()), 
                ('RF', RandomForestRegressor(n_estimators = 10)),
                ('model_xgb', xgb.XGBRegressor(n_estimators=500, max_depth=9, min_child_weight=5, gamma=0.1, n_jobs=-1)),
                ('model_lgb', lgb.LGBMRegressor(n_estimators=500, max_depth=9, min_child_weight=5, n_jobs=-1))
            ]

for model_tuple in model_list:
    # execute_modeling(model_tuple, X_train, y_train, X_test, y_test)
    pass

### 3. 파라미터 튜닝

In [12]:
# 3. test get_best_param function

# linear regression model object
def linear_object(trial:Trial, X_train, y_train, X_test, y_test):
    params = {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False])
    }
    
    test_model = LinearRegression(**params)
    test_model_score = execute_modeling(('LR', test_model), X_train, y_train, X_test, y_test)

    return test_model_score

# xgbr regression model object
def xgbr_object(trial:Trial, X_train, y_train, X_test, y_test):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 1000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 100),
        'gamma':trial.suggest_int('gamma', 1, 3),
        'learning_rate': 0.01,
        'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.5, 1, 0.1),
        'nthread' : -1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 1.0),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0] ),
        'random_state': 42
    }
    
    test_model = xgb.XGBRegressor(**params)
    test_model_score = execute_modeling(('XGBR', test_model), X_train, y_train, X_test, y_test)

    return test_model_score

# light gbm regression model object
def lgb_object(trial:Trial, X_train, y_train, X_test, y_test):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 1000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 100)
    }
    
    test_model = lgb.LGBMRegressor(**params)
    test_model_score = execute_modeling(('LGBM', test_model), X_train, y_train, X_test, y_test)

    return test_model_score

# 하이퍼 파라미터 결과 도출
# 위에서 분할한  X_train, y_train, X_test, y_test 파라미터 삽입
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: lgb_object(trial, X_train, y_train, X_test, y_test), n_trials=3)

best_score = study.best_value
best_param_dict = study.best_trial.params

print(best_score, best_param_dict)

[32m[I 2021-12-28 11:47:40,028][0m A new study created in memory with name: no-name-ae7d1bcb-25ba-46e6-b64c-f95b04f145b9[0m


LGBM rmse: 16823.53038115679


[32m[I 2021-12-28 11:48:09,687][0m Trial 0 finished with value: 16823.53038115679 and parameters: {'n_estimators': 741, 'max_depth': 14, 'min_child_weight': 62}. Best is trial 0 with value: 16823.53038115679.[0m


LGBM: 23491.332917 (15555.415931)
LGBM rmse: 16657.87889421775


[32m[I 2021-12-28 11:48:38,456][0m Trial 1 finished with value: 16657.87889421775 and parameters: {'n_estimators': 804, 'max_depth': 11, 'min_child_weight': 38}. Best is trial 1 with value: 16657.87889421775.[0m


LGBM: 23167.745078 (15413.918432)
LGBM rmse: 17311.278753577306


[32m[I 2021-12-28 11:49:03,448][0m Trial 2 finished with value: 17311.278753577306 and parameters: {'n_estimators': 598, 'max_depth': 16, 'min_child_weight': 75}. Best is trial 1 with value: 16657.87889421775.[0m


LGBM: 23691.161756 (15549.838022)
16657.87889421775 {'n_estimators': 804, 'max_depth': 11, 'min_child_weight': 38}


In [14]:
# 하이퍼파라미터별 중요도를 확인할 수 있는 그래프
optuna.visualization.plot_param_importances(study)

In [15]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)