In [32]:
# LinearRegression
# KNeighborsRegressor
# RandomForestRegressor
# AdaBoostRegressor
# XGBRegressor
# XGBRegressor + 변수 선택
# PCA + XGBRegressor

In [67]:
import pandas as pd
import numpy as np

np.random.seed(1234)

n = 100

data = {
    'age': np.random.randint(18, 70, size=n),
    'income': np.random.normal(50000, 15000, size=n).astype(int),
    'experience_years': np.random.randint(0, 30, size=n),
    'hours_per_week': np.random.randint(20, 60, size=n),
}

# 범주형 변수
data['education_level'] = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n)
data['married'] = np.random.choice(['Yes', 'No'], size=n)
data['city'] = np.random.choice(['Seoul', 'Busan', 'Incheon'], size=n)

data['target'] = (
    data['age'] * 50 +
    data['income'] * 0.1 +
    data['experience_years'] * 100 +
    data['hours_per_week'] * 30 +
    np.random.normal(0, 1000, size=n)
)

df = pd.DataFrame(data)

df.head()

Unnamed: 0,age,income,experience_years,hours_per_week,education_level,married,city,target
0,65,28970,11,39,Bachelor,No,Seoul,9686.473462
1,37,48486,1,32,PhD,No,Seoul,8797.782328
2,56,41776,29,32,Bachelor,No,Seoul,11793.854644
3,30,47830,9,53,Bachelor,Yes,Seoul,10221.918488
4,42,55310,20,41,PhD,Yes,Incheon,11099.470343


In [68]:
# 표준화
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
col_s = ['age', 'income', 'experience_years', 'hours_per_week']
col_new = [item + '_s' for item in col_s]

scaled = ss.fit_transform(df[col_s])
df_s = pd.DataFrame(scaled, columns=col_new)

df_new = pd.concat([df, df_s], axis=1)
df_new

Unnamed: 0,age,income,experience_years,hours_per_week,education_level,married,city,target,age_s,income_s,experience_years_s,hours_per_week_s
0,65,28970,11,39,Bachelor,No,Seoul,9686.473462,1.268445,-1.477412,-0.503009,-0.041133
1,37,48486,1,32,PhD,No,Seoul,8797.782328,-0.548254,-0.174824,-1.745005,-0.628746
2,56,41776,29,32,Bachelor,No,Seoul,11793.854644,0.684506,-0.622680,1.732585,-0.628746
3,30,47830,9,53,Bachelor,Yes,Seoul,10221.918488,-1.002428,-0.218609,-0.751408,1.134094
4,42,55310,20,41,PhD,Yes,Incheon,11099.470343,-0.223843,0.280641,0.614788,0.126757
...,...,...,...,...,...,...,...,...,...,...,...,...
95,43,64762,16,57,PhD,Yes,Seoul,11693.707824,-0.158961,0.911511,0.117990,1.469873
96,61,48174,5,21,Bachelor,No,Incheon,9694.661632,1.008916,-0.195648,-1.248206,-1.552139
97,32,85486,22,57,PhD,No,Busan,15209.836756,-0.872664,2.294726,0.863188,1.469873
98,41,57442,23,30,Master,No,Seoul,11621.668018,-0.288725,0.422941,0.987387,-0.796636


In [69]:
# 더미변수 생성
df_dum = pd.get_dummies(df_new[['city', 'married', 'education_level']])
df_dum

Unnamed: 0,city_Busan,city_Incheon,city_Seoul,married_No,married_Yes,education_level_Bachelor,education_level_High School,education_level_Master,education_level_PhD
0,0,0,1,1,0,1,0,0,0
1,0,0,1,1,0,0,0,0,1
2,0,0,1,1,0,1,0,0,0
3,0,0,1,0,1,1,0,0,0
4,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
95,0,0,1,0,1,0,0,0,1
96,0,1,0,1,0,1,0,0,0
97,1,0,0,1,0,0,0,0,1
98,0,0,1,1,0,0,0,1,0


In [70]:
df_dum_col = df_dum.columns.tolist()
df_dum_col

['city_Busan',
 'city_Incheon',
 'city_Seoul',
 'married_No',
 'married_Yes',
 'education_level_Bachelor',
 'education_level_High School',
 'education_level_Master',
 'education_level_PhD']

In [71]:
df_final = pd.concat([df_new, df_dum], axis=1)
df_final.head()

Unnamed: 0,age,income,experience_years,hours_per_week,education_level,married,city,target,age_s,income_s,...,hours_per_week_s,city_Busan,city_Incheon,city_Seoul,married_No,married_Yes,education_level_Bachelor,education_level_High School,education_level_Master,education_level_PhD
0,65,28970,11,39,Bachelor,No,Seoul,9686.473462,1.268445,-1.477412,...,-0.041133,0,0,1,1,0,1,0,0,0
1,37,48486,1,32,PhD,No,Seoul,8797.782328,-0.548254,-0.174824,...,-0.628746,0,0,1,1,0,0,0,0,1
2,56,41776,29,32,Bachelor,No,Seoul,11793.854644,0.684506,-0.62268,...,-0.628746,0,0,1,1,0,1,0,0,0
3,30,47830,9,53,Bachelor,Yes,Seoul,10221.918488,-1.002428,-0.218609,...,1.134094,0,0,1,0,1,1,0,0,0
4,42,55310,20,41,PhD,Yes,Incheon,11099.470343,-0.223843,0.280641,...,0.126757,0,1,0,0,1,0,0,0,1


In [72]:
col = col_new + df_dum_col + ['target']
df_final = df_final[col]
df_final.head()

Unnamed: 0,age_s,income_s,experience_years_s,hours_per_week_s,city_Busan,city_Incheon,city_Seoul,married_No,married_Yes,education_level_Bachelor,education_level_High School,education_level_Master,education_level_PhD,target
0,1.268445,-1.477412,-0.503009,-0.041133,0,0,1,1,0,1,0,0,0,9686.473462
1,-0.548254,-0.174824,-1.745005,-0.628746,0,0,1,1,0,0,0,0,1,8797.782328
2,0.684506,-0.62268,1.732585,-0.628746,0,0,1,1,0,1,0,0,0,11793.854644
3,-1.002428,-0.218609,-0.751408,1.134094,0,0,1,0,1,1,0,0,0,10221.918488
4,-0.223843,0.280641,0.614788,0.126757,0,1,0,0,1,0,0,0,1,11099.470343


In [73]:
from sklearn.model_selection import train_test_split

X = df_final[col_new + df_dum_col]
y = df_final['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [74]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [75]:
# 선형회귀
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, mae, r2

(1162831.7939682403, 889.2585099331758, 0.7069373099047875)

In [76]:
# KNN
from sklearn.neighbors import KNeighborsRegressor

for k in [3, 5, 7]:
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("k :", k, "," , mse, mae, r2)

k : 3 , 2419880.996606203 1365.5018723441358 0.39012947680456445
k : 5 , 2723110.7640391826 1346.7235648450287 0.3137079927844243
k : 7 , 2370513.3076973977 1205.3765276181348 0.40257136890834466


In [77]:
# 랜덤포레스트
from sklearn.ensemble import RandomForestRegressor

for n in [50, 100, 150]:
    for md in [10, 15, 20]:
        model = RandomForestRegressor(random_state=1234, n_estimators=n, max_depth=md)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
                
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print("n =", n, "md =", md, "," , mse, mae, r2)

n = 50 md = 10 , 1566925.2283973855 1001.9963965434279 0.6050956595836465
n = 50 md = 15 , 1563158.7026322205 1002.3953583087692 0.6060449182629997
n = 50 md = 20 , 1563158.7026322205 1002.3953583087692 0.6060449182629997
n = 100 md = 10 , 1535635.1963153076 989.8383432310302 0.6129815301134212
n = 100 md = 15 , 1510809.4935591333 984.4982877960847 0.6192382280046962
n = 100 md = 20 , 1510809.4935591333 984.4982877960847 0.6192382280046962
n = 150 md = 10 , 1604128.1007198014 1018.2365871279392 0.595719605455586
n = 150 md = 15 , 1573756.5800841693 1013.0052910893606 0.6033739881323669
n = 150 md = 20 , 1573756.5800841693 1013.0052910893606 0.6033739881323669


In [82]:
# AdaBoost
from sklearn.ensemble import AdaBoostRegressor

for k in [50, 100, 150]:
    model = AdaBoostRegressor(random_state=1234, n_estimators=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("k :", k, "," , mse, mae, r2)

k : 50 , 1569072.431907204 981.3765292203343 0.6045545106057518
k : 100 , 1549299.1797258193 986.1139033251441 0.6095378646095433
k : 150 , 1523124.0104844326 958.8305587487855 0.6161346618001331


In [83]:
# XGBoost
from xgboost import XGBRegressor

for n in [50, 100, 150]:
    for md in [10, 15, 20]:
        model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
                
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print("n =", n, "md =", md, "," , mse, mae, r2)

n = 50 md = 10 , 1409884.2637070674 1037.833094276913 0.6446739096848377
n = 50 md = 15 , 1421262.8933300893 1045.1530649800382 0.6418062104834423
n = 50 md = 20 , 1421262.8933300893 1045.1530649800382 0.6418062104834423
n = 100 md = 10 , 1238194.6845915406 975.4674448628506 0.6879439769274998
n = 100 md = 15 , 1322996.2155775437 999.3964487691006 0.6665718705541945
n = 100 md = 20 , 1322996.2155775437 999.3964487691006 0.6665718705541945
n = 150 md = 10 , 1228046.8949012875 970.8071665425377 0.6905014736871964
n = 150 md = 15 , 1323183.9874844071 997.4258188862881 0.6665245473382013
n = 150 md = 20 , 1323184.9375105144 997.4265024800382 0.6665243079078658


In [87]:
# XGBoost + 변수선택
n, md = 150, 10
model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
model.fit(X_train, y_train)
model.feature_importances_

col_imp = pd.Series(col_new + df_dum_col)
val_imp = pd.Series(model.feature_importances_)
df_xgb = pd.concat([col_imp, val_imp], axis=1)
df_xgb

Unnamed: 0,0,1
0,age_s,0.355037
1,income_s,0.257836
2,experience_years_s,0.146455
3,hours_per_week_s,0.114925
4,city_Busan,0.009515
5,city_Incheon,0.012873
6,city_Seoul,0.011007
7,married_No,0.022201
8,married_Yes,0.0
9,education_level_Bachelor,0.026493


In [88]:
list_imp = list(df_xgb.sort_values(1, ascending=False)[0])
list_imp

['age_s',
 'income_s',
 'experience_years_s',
 'hours_per_week_s',
 'education_level_Bachelor',
 'education_level_PhD',
 'married_No',
 'city_Incheon',
 'education_level_High School',
 'city_Seoul',
 'city_Busan',
 'education_level_Master',
 'married_Yes']

In [90]:
for i in range(1, len(list_imp)):
    list_tmp = list_imp[:i]
    model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
    model.fit(X_train[list_tmp], y_train)
    y_pred = model.predict(X_test[list_tmp])
                
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("i =", i, "," , mse, mae, r2)

i = 1 , 2680955.8685216373 1194.0738106641013 0.3243320805889859
i = 2 , 2038134.2030133375 1163.0335645961227 0.48633921483018516
i = 3 , 1030912.4550909828 904.1745493550377 0.7401842821044545
i = 4 , 1450670.7511081025 1017.2465708394132 0.6343946949443364
i = 5 , 1531667.0672500322 1009.9537978506502 0.6139815978657366
i = 6 , 1627254.42770969 1037.1739638662752 0.589891192752361
i = 7 , 1592353.4381482063 1025.3381972647126 0.5986871148633046
i = 8 , 1425031.8162409973 983.6641982412751 0.6408563476634184
i = 9 , 1525460.6323403688 989.1128798819002 0.615545774662382
i = 10 , 1459213.9863944878 994.1902475972253 0.6322415860182378
i = 11 , 1311449.1121963914 982.8530649800381 0.6694820293555288
i = 12 , 1228914.2524047387 971.1183970112877 0.6902828779070034


In [93]:
# PCA + XGBoost
from sklearn.decomposition import PCA

pca = PCA(random_state=1234)
pca.fit(X)
pca.explained_variance_ratio_

array([2.09072199e-01, 1.87608531e-01, 1.69540287e-01, 1.25948092e-01,
       8.46088677e-02, 5.71032968e-02, 5.44307219e-02, 4.42493331e-02,
       3.84146265e-02, 2.90240445e-02, 3.46272273e-33, 1.71091782e-33,
       2.89498462e-34])

In [106]:
col_pca = ['val_' + str(i) for i in range(0, len(pca.explained_variance_ratio_))]
X_pca = pd.DataFrame(pca.fit_transform(X), columns=col_pca)
X_pca.head()

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=1234)

In [107]:
for i in range(1, len(col_pca)):
    list_tmp = col_pca[:i]
    model = XGBRegressor(random_state=1234, n_estimators=n, max_depth=md)
    model.fit(X_train[list_tmp], y_train)
    y_pred = model.predict(X_test[list_tmp])
                
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("i =", i, "," , mse, mae, r2)

i = 1 , 9065238.0480516 2548.836934669855 -1.2846666753489506
i = 2 , 4585075.309884719 1652.7797317337404 -0.15555363344378592
i = 3 , 3479684.268327252 1321.0093473896213 0.12303255066843488
i = 4 , 1054390.2784570232 741.0411968983727 0.7342672835248474
i = 5 , 1184514.2371282098 807.2500702826986 0.7014727920327635
i = 6 , 1181216.2326862665 814.7417138735557 0.7023039716227225
i = 7 , 1203156.0811035396 835.6586507275376 0.6967745811891282
i = 8 , 1323266.6248092332 900.2387769319514 0.6665037206658988
i = 9 , 1574255.6850387866 975.0589171986791 0.6032482012030804
i = 10 , 1508752.4172106914 979.3717639696599 0.6197566626841853
i = 11 , 1438004.6791526615 958.8391655080129 0.6375868618075539
i = 12 , 1552832.4912213294 979.1159047307622 0.608647381757948
