In [1]:
import numpy as np
import pandas as pd

# Ground-truth coefficients of the linear relationship: Y = a*X1 + b*X2 + c*X3 + d
a, b, c, d = 2.0, -1.5, 3.0, 5.0

n_samples = 100

# Seed NumPy's global generator so every draw below is repeatable.
np.random.seed(1234)

# Three independent standard-normal features, drawn in the order x1, x2, x3.
x1, x2, x3 = (np.random.randn(n_samples) for _ in range(3))

# Gaussian noise scaled to a standard deviation of 0.5 (drawn after the features).
noise = 0.5 * np.random.randn(n_samples)

# Dependent variable: linear combination of the features plus intercept, then noise.
linear_part = a * x1 + b * x2 + c * x3 + d
y0 = linear_part + noise

# Collect features and target into a single frame and preview the first rows.
df = pd.DataFrame(dict(X1=x1, X2=x2, X3=x3, Y=y0))
df.head()

Unnamed: 0,X1,X2,X3,Y
0,0.471435,0.291205,-0.319561,4.810901
1,-1.190976,0.566534,-0.619993,0.263825
2,1.432707,0.503592,0.156998,7.472249
3,-0.312652,0.285296,-0.571455,3.551282
4,-0.720589,0.484288,1.057633,5.134221


In [2]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the target column.
feature_cols = ['X1', 'X2', 'X3']
X = df[feature_cols]
y = df['Y']

# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [4]:
from lightgbm import LGBMRegressor # Gradient Boosting Decision Tree(GBDT)

# Train the model
# Constructor settings: fixed seed, and verbose=-1 to keep LightGBM's training log quiet.
lgbm_params = {"random_state": 1234, "verbose": -1}
model = LGBMRegressor(**lgbm_params)
model.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=None, num_leaves=31, objective=None,
              random_state=1234, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, verbose=-1)

In [5]:
# Prediction: run the fitted model on the held-out test features.
y_pred = model.predict(X_test)
y_pred  # bare last expression so the notebook displays the predicted values

array([ 2.49009192,  7.5551495 ,  1.29616788,  7.5551495 , -0.62564883,
        0.43013808,  3.45194308,  3.11295739,  7.42028387,  7.6039263 ,
        2.55443135,  3.33787344,  9.99131754,  5.24695534,  6.03126547,
       11.294911  ,  4.51650793, -0.62564883,  6.03126547,  3.33787344])

In [6]:
# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Library-computed error metrics on the test split.
mse = mean_squared_error(y_test, y_pred)  # rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Hand-computed R^2 = 1 - SS_res / SS_tot, as a cross-check against r2_score.
ss_res = np.sum((y_test - y_pred) ** 2)          # residual sum of squares
ss_tot = np.sum((y_test - y_test.mean()) ** 2)   # total sum of squares around the test mean
r2_manual = 1 - ss_res / ss_tot

# Display all four values together as the cell output.
mse, mae, r2, r2_manual

(2.3703965186429965, 1.321889433512746, 0.8218178914475234, 0.8218178914475234)