In [108]:
# LinearRegression
# KNeighborsRegressor
# RandomForestRegressor
# AdaBoostRegressor
# XGBRegressor
# XGBRegressor + 변수 선택
# PCA + XGBRegressor

In [109]:
import pandas as pd
import numpy as np

np.random.seed(1234)

n = 100

data = {
    'age': np.random.randint(18, 70, size=n),
    'income': np.random.normal(50000, 15000, size=n).astype(int),
    'experience_years': np.random.randint(0, 30, size=n),
    'hours_per_week': np.random.randint(20, 60, size=n),
}

# 범주형 변수
data['education_level'] = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n)
data['married'] = np.random.choice(['Yes', 'No'], size=n)
data['city'] = np.random.choice(['Seoul', 'Busan', 'Incheon'], size=n)

data['target'] = (
    data['age'] * 50 +
    data['income'] * 0.1 +
    data['experience_years'] * 100 +
    data['hours_per_week'] * 30 +
    np.random.normal(0, 1000, size=n)
)

df = pd.DataFrame(data)

df.head()

Unnamed: 0,age,income,experience_years,hours_per_week,education_level,married,city,target
0,65,28970,11,39,Bachelor,No,Seoul,9686.473462
1,37,48486,1,32,PhD,No,Seoul,8797.782328
2,56,41776,29,32,Bachelor,No,Seoul,11793.854644
3,30,47830,9,53,Bachelor,Yes,Seoul,10221.918488
4,42,55310,20,41,PhD,Yes,Incheon,11099.470343


In [110]:
# 표준화
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
col_s = ['age', 'income', 'experience_years', 'hours_per_week']
col_new = [item + '_s' for item in col_s]

scaled = ss.fit_transform(df[col_s])
df_s = pd.DataFrame(scaled, columns=col_new)

df_new = pd.concat([df, df_s], axis=1)
df_new

Unnamed: 0,age,income,experience_years,hours_per_week,education_level,married,city,target,age_s,income_s,experience_years_s,hours_per_week_s
0,65,28970,11,39,Bachelor,No,Seoul,9686.473462,1.268445,-1.477412,-0.503009,-0.041133
1,37,48486,1,32,PhD,No,Seoul,8797.782328,-0.548254,-0.174824,-1.745005,-0.628746
2,56,41776,29,32,Bachelor,No,Seoul,11793.854644,0.684506,-0.622680,1.732585,-0.628746
3,30,47830,9,53,Bachelor,Yes,Seoul,10221.918488,-1.002428,-0.218609,-0.751408,1.134094
4,42,55310,20,41,PhD,Yes,Incheon,11099.470343,-0.223843,0.280641,0.614788,0.126757
...,...,...,...,...,...,...,...,...,...,...,...,...
95,43,64762,16,57,PhD,Yes,Seoul,11693.707824,-0.158961,0.911511,0.117990,1.469873
96,61,48174,5,21,Bachelor,No,Incheon,9694.661632,1.008916,-0.195648,-1.248206,-1.552139
97,32,85486,22,57,PhD,No,Busan,15209.836756,-0.872664,2.294726,0.863188,1.469873
98,41,57442,23,30,Master,No,Seoul,11621.668018,-0.288725,0.422941,0.987387,-0.796636


In [111]:
# 더미변수 생성
df_dum = pd.get_dummies(df_new[['city', 'married', 'education_level']], drop_first=True)
df_dum

Unnamed: 0,city_Incheon,city_Seoul,married_Yes,education_level_High School,education_level_Master,education_level_PhD
0,0,1,0,0,0,0
1,0,1,0,0,0,1
2,0,1,0,0,0,0
3,0,1,1,0,0,0
4,1,0,1,0,0,1
...,...,...,...,...,...,...
95,0,1,1,0,0,1
96,1,0,0,0,0,0
97,0,0,0,0,0,1
98,0,1,0,0,1,0


In [112]:
df_dum_col = df_dum.columns.tolist()
df_dum_col

['city_Incheon',
 'city_Seoul',
 'married_Yes',
 'education_level_High School',
 'education_level_Master',
 'education_level_PhD']

In [113]:
df_final = pd.concat([df_new, df_dum], axis=1)
df_final.head()

Unnamed: 0,age,income,experience_years,hours_per_week,education_level,married,city,target,age_s,income_s,experience_years_s,hours_per_week_s,city_Incheon,city_Seoul,married_Yes,education_level_High School,education_level_Master,education_level_PhD
0,65,28970,11,39,Bachelor,No,Seoul,9686.473462,1.268445,-1.477412,-0.503009,-0.041133,0,1,0,0,0,0
1,37,48486,1,32,PhD,No,Seoul,8797.782328,-0.548254,-0.174824,-1.745005,-0.628746,0,1,0,0,0,1
2,56,41776,29,32,Bachelor,No,Seoul,11793.854644,0.684506,-0.62268,1.732585,-0.628746,0,1,0,0,0,0
3,30,47830,9,53,Bachelor,Yes,Seoul,10221.918488,-1.002428,-0.218609,-0.751408,1.134094,0,1,1,0,0,0
4,42,55310,20,41,PhD,Yes,Incheon,11099.470343,-0.223843,0.280641,0.614788,0.126757,1,0,1,0,0,1


In [114]:
col = col_new + df_dum_col + ['target']
df_final = df_final[col]
df_final.head()

Unnamed: 0,age_s,income_s,experience_years_s,hours_per_week_s,city_Incheon,city_Seoul,married_Yes,education_level_High School,education_level_Master,education_level_PhD,target
0,1.268445,-1.477412,-0.503009,-0.041133,0,1,0,0,0,0,9686.473462
1,-0.548254,-0.174824,-1.745005,-0.628746,0,1,0,0,0,1,8797.782328
2,0.684506,-0.62268,1.732585,-0.628746,0,1,0,0,0,0,11793.854644
3,-1.002428,-0.218609,-0.751408,1.134094,0,1,1,0,0,0,10221.918488
4,-0.223843,0.280641,0.614788,0.126757,1,0,1,0,0,1,11099.470343


In [115]:
from sklearn.model_selection import train_test_split

X = df_final[col_new + df_dum_col]
y = df_final['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [116]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [117]:
# 선형회귀
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, mae, r2

(1156984.84463057, 878.1698568478444, 0.7084108873479193)

In [118]:
# KNN
from sklearn.neighbors import KNeighborsRegressor

for k in [3, 5, 7]:
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("k :", k, "," , mse, mae, r2)

k : 3 , 1958800.5834864087 1231.1264284394545 0.5063332707840653
k : 5 , 2345580.116140212 1271.6856437967108 0.4088551566653619
k : 7 , 2152980.6509625823 1181.2340278544877 0.45739503807266113


In [119]:
# 랜덤포레스트
from sklearn.ensemble import RandomForestRegressor

for n in [50, 100, 150]:
    for md in [10, 15, 20]:
        model = RandomForestRegressor(random_state=1234, n_estimators=n, max_depth=md)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
                
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print("n =", n, "md =", md, "," , mse, mae, r2)

n = 50 md = 10 , 1533377.0998761454 987.0454387300048 0.6135506268825223
n = 50 md = 15 , 1527560.328534412 981.5727658771557 0.6150165987160406
n = 50 md = 20 , 1527560.328534412 981.5727658771557 0.6150165987160406
n = 100 md = 10 , 1462349.2427327628 969.4052697817594 0.6314514230201163
n = 100 md = 15 , 1483255.6559579237 974.0542453366731 0.626182484097231
n = 100 md = 20 , 1483255.6559579237 974.0542453366731 0.626182484097231
n = 150 md = 10 , 1554646.7533288717 1009.3468278180915 0.6081901423390292
n = 150 md = 15 , 1568110.5204312312 1012.3150967877107 0.6047969363514567
n = 150 md = 20 , 1568110.5204312312 1012.3150967877107 0.6047969363514567


In [120]:
# AdaBoost
from sklearn.ensemble import AdaBoostRegressor

for k in [50, 100, 150]:
    model = AdaBoostRegressor(random_state=1234, n_estimators=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("k :", k, "," , mse, mae, r2)

k : 50 , 1563730.296362983 953.2951942007392 0.6059008623494551
k : 100 , 1670237.6795233556 1010.523505534076 0.5790583384471263
k : 150 , 1607909.0438295114 985.8018406806783 0.5947667132448843


In [121]:
# XGBoost
from xgboost import XGBRegressor

for n in [50, 100, 150]:
    for md in [10, 15, 20]:
        model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
                
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print("n =", n, "md =", md, "," , mse, mae, r2)

n = 50 md = 10 , 1413903.4368270303 1029.1162729878506 0.6436609775543228
n = 50 md = 15 , 1426295.4224365293 1032.124939980038 0.6405378873041434
n = 50 md = 20 , 1426295.4224365293 1032.124939980038 0.6405378873041434
n = 100 md = 10 , 1392025.2274883012 979.8203989644128 0.6491748333987643
n = 100 md = 15 , 1417051.2145807503 987.4471079487878 0.642867658846396
n = 100 md = 20 , 1417051.2145807503 987.4471079487878 0.642867658846396
n = 150 md = 10 , 1396057.2042293584 977.4641245503502 0.6481586744355636
n = 150 md = 15 , 1421687.2763571425 985.3952768941002 0.6416992553484141
n = 150 md = 20 , 1421687.2763571425 985.3952768941002 0.6416992553484141


In [122]:
# XGBoost + 변수선택
n, md = 150, 10
model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
model.fit(X_train, y_train)
model.feature_importances_

col_imp = pd.Series(col_new + df_dum_col)
val_imp = pd.Series(model.feature_importances_)
df_xgb = pd.concat([col_imp, val_imp], axis=1)
df_xgb

Unnamed: 0,0,1
0,age_s,0.37808
1,income_s,0.277595
2,experience_years_s,0.142968
3,hours_per_week_s,0.110572
4,city_Incheon,0.028516
5,city_Seoul,0.009117
6,married_Yes,0.009311
7,education_level_High School,0.016101
8,education_level_Master,0.010281
9,education_level_PhD,0.017459


In [123]:
list_imp = list(df_xgb.sort_values(1, ascending=False)[0])
list_imp

['age_s',
 'income_s',
 'experience_years_s',
 'hours_per_week_s',
 'city_Incheon',
 'education_level_PhD',
 'education_level_High School',
 'education_level_Master',
 'married_Yes',
 'city_Seoul']

In [124]:
for i in range(1, len(list_imp)):
    list_tmp = list_imp[:i]
    model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
    model.fit(X_train[list_tmp], y_train)
    y_pred = model.predict(X_test[list_tmp])
                
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("i =", i, "," , mse, mae, r2)

i = 1 , 2680955.8685216373 1194.0738106641013 0.3243320805889859
i = 2 , 2038134.2030133375 1163.0335645961227 0.48633921483018516
i = 3 , 1030912.4550909828 904.1745493550377 0.7401842821044545
i = 4 , 1450670.7511081025 1017.2465708394132 0.6343946949443364
i = 5 , 1371314.7307190453 1014.0778325309071 0.6543943964756336
i = 6 , 1573128.7130639087 1059.8968510855946 0.6035322263220466
i = 7 , 1361696.0305274338 958.6746225972253 0.6568185494511498
i = 8 , 1489455.0164441592 991.7644663472256 0.6246200902321938
i = 9 , 1523095.997063112 1023.1591632561907 0.6161417218828154


In [125]:
# PCA + XGBoost
from sklearn.decomposition import PCA

pca = PCA(random_state=1234)
pca.fit(X)
pca.explained_variance_ratio_

array([0.2350969 , 0.20911702, 0.190349  , 0.14037272, 0.06246552,
       0.05131706, 0.0455274 , 0.03489931, 0.01979637, 0.01105871])

In [126]:
col_pca = ['val_' + str(i) for i in range(0, len(pca.explained_variance_ratio_))]
X_pca = pd.DataFrame(pca.fit_transform(X), columns=col_pca)
X_pca.head()

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=1234)

In [127]:
for i in range(1, len(col_pca)):
    list_tmp = col_pca[:i]
    model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
    model.fit(X_train[list_tmp], y_train)
    y_pred = model.predict(X_test[list_tmp])
                
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("i =", i, "," , mse, mae, r2)

i = 1 , 10148230.964932393 2667.7548301776674 -1.5576079719504556
i = 2 , 3619867.178246195 1545.429898546141 0.08770295192567612
i = 3 , 3378704.0766432583 1438.9003503744327 0.14848208410460928
i = 4 , 1085214.6892657108 782.798143251906 0.72649876119939
i = 5 , 1441311.8704812778 959.2594262893283 0.636753366892424
i = 6 , 1548481.5040801854 1005.7973657424533 0.6097439393192113
i = 7 , 1629369.621783701 1021.1586694533908 0.5893581109527246
i = 8 , 1356870.2503352421 916.9557641799532 0.6580347667340516
i = 9 , 1551399.3486233107 968.0736560484947 0.6090085695301124
