In [None]:
import pandas as pd
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_test.csv')

train_X = train.drop(['grade'], axis=1)
train_y = train['grade']

test_X = test.drop(['grade'], axis=1)
test_y = test['grade']

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV

num_columns = train_X.select_dtypes('number').columns.to_list()
cat_columns = train_X.select_dtypes('object').columns.to_list()

cat_preprocess = make_pipeline(
    OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
num_preprocess = make_pipeline(
    SimpleImputer(strategy = 'mean'),
    StandardScaler()
)

preprocess = ColumnTransformer(
    [("num", num_preprocess, num_columns),
    ("cat", cat_preprocess, cat_columns)]
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

full_pipe = Pipeline(
    [
        ('preprocess', preprocess),
        ('regressor', KNeighborsRegressor())
    ]
)

In [None]:
KNeighborsRegressor().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [None]:
knn_params = {'regressor__n_neighbors':np.arange(5,10,1)}

In [None]:
knn_search = GridSearchCV(estimator = full_pipe,
                          param_grid = knn_params,
                          cv = 3,
                          scoring = 'neg_mean_squared_error')
knn_search.fit(train_X, train_y)

In [None]:
pd.DataFrame(knn_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.016691,0.001267,0.013184,0.00298,5,{'regressor__n_neighbors': 5},-9.297209,-8.924235,-10.583059,-9.601501,0.710572,4
1,0.013069,0.001521,0.009685,0.000348,6,{'regressor__n_neighbors': 6},-9.269703,-8.851634,-10.933987,-9.685108,0.899433,5
2,0.012537,0.000346,0.010848,0.001238,7,{'regressor__n_neighbors': 7},-9.094922,-8.545738,-10.694118,-9.444926,0.911322,3
3,0.014738,0.001526,0.01109,0.000113,8,{'regressor__n_neighbors': 8},-9.122093,-8.554963,-10.609926,-9.428994,0.866549,2
4,0.013407,0.000461,0.010413,4.6e-05,9,{'regressor__n_neighbors': 9},-9.00201,-8.557008,-10.621932,-9.39365,0.887323,1


In [None]:
print('Best 파라미터 조합: ', knn_search.best_params_)
print('교차검증 MSE: ', -knn_search.best_score_)

Best 파라미터 조합:  {'regressor__n_neighbors': np.int64(9)}
교차검증 MSE:  9.39364982857915


In [None]:
from sklearn.metrics import mean_squared_error
knn_pred = knn_search.predict(test_X)
print('테스트 MSE: ', mean_squared_error(test_y, knn_pred))

테스트 MSE:  9.736363636363635
['__annotations__', '__builtins__', '__call__', '__class__', '__closure__', '__code__', '__defaults__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__get__', '__getattribute__', '__getstate__', '__globals__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__kwdefaults__', '__le__', '__lt__', '__module__', '__name__', '__ne__', '__new__', '__qualname__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__wrapped__', '_skl_parameter_constraints']


In [None]:
from sklearn.tree import DecisionTreeRegressor
full_pipe = Pipeline(
    [
        ('preprocess', preprocess),
        ('regressor', DecisionTreeRegressor())
    ]
)

In [None]:
DecisionTreeRegressor().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [None]:
import numpy as np
decisiontree_param = {'regressor__ccp_alpha':np.arange(0.01,0.3, 0.05)}
decisiontree_search = GridSearchCV(estimator = full_pipe,
                                   param_grid=decisiontree_param,
                                   cv=5,
                                   scoring = 'neg_mean_squared_error')
decisiontree_search.fit(train_X, train_y)

In [None]:
print('Best 파라미터 조합: ', decisiontree_search.best_params_)
print('교차검증 MSE: ', -decisiontree_search.best_score_)

Best 파라미터 조합:  {'regressor__ccp_alpha': np.float64(0.26)}
교차검증 MSE:  9.403541096157653


In [None]:
from sklearn.metrics import mean_squared_error
dt_pred = decisiontree_search.predict(test_X)
print('테스트 MSE: ', mean_squared_error(test_y, dt_pred))

테스트 MSE:  10.23195890566565


In [None]:
# 의사결정나무에 Bagging 적용
from sklearn.ensemble import BaggingRegressor

full_pipe = Pipeline(
    [
        ('preprocess', preprocess),
        ('regressor', BaggingRegressor())
    ]
)

NameError: name 'Pipeline' is not defined

In [None]:
BaggingRegressor().get_params()

{'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
Bagging_param = {'regressor__n_estimators':np.arange(10,100,20), 'regressor__random_state' : [0]}
Bagging_search = GridSearchCV(estimator = full_pipe,
                              param_grid = Bagging_param,
                              cv = 5,
                              scoring = 'neg_mean_squared_error')
Bagging_search.fit(train_X, train_y)

In [None]:
print('Best 파라미터 조합: ', Bagging_search.best_params_)
print('교차검증 MSE Score: ', -Bagging_search.best_score_)

Best 파라미터 조합:  {'regressor__n_estimators': np.int64(30), 'regressor__random_state': 0}
교차검증 MSE Score:  9.581004443482522


In [None]:
# bagging 테스트 데이터를 이용해서 모형 성능 평가하기

from sklearn.metrics import mean_squared_error
bag_pred = Bagging_search.predict(test_X)
print('테스트 MSE: ', mean_squared_error(test_y, bag_pred))

테스트 MSE:  9.626060816498317


In [None]:
from sklearn.ensemble import RandomForestRegressor
full_pipe = Pipeline(
    [
        ('preprocess', preprocess),
        ('regressor', RandomForestRegressor())
    ]
)

In [None]:
RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
import numpy as np
RandomForest_param = {'regressor__n_estimators':np.arange(100,500,100),
                      'regressor__max_features':['sqrt'], 'regressor__random_state' :[0]}
RandomForest_search = GridSearchCV(estimator = full_pipe,
                                   param_grid = RandomForest_param,
                                   cv=5,
                                   scoring = 'neg_mean_squared_error')
RandomForest_search.fit(train_X, train_y)

In [None]:
print('Best 파라미터 조합: ', RandomForest_search.best_params_)
print('교차검증 RMSE score: ', -RandomForest_search.best_score_)

Best 파라미터 조합:  {'regressor__max_features': 'sqrt', 'regressor__n_estimators': np.int64(400), 'regressor__random_state': 0}
교차검증 RMSE score:  9.418447711599324


In [None]:
from sklearn.metrics import mean_squared_error
rf_pred = RandomForest_search.predict(test_X)
print('테스트 MSE: ', mean_squared_error(test_y, rf_pred))

테스트 MSE:  10.007126272089838


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
full_pipe = Pipeline(
    [
        ('preprocess', preprocess),
        ('regressor', GradientBoostingRegressor())
    ]
)

In [None]:
GradientBoostingRegressor().get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
GradientBoosting_param = {'regressor__learning_rate':np.arange(0.1, 0.3, 0.05),
                          'regressor__random_state':[0]}
GradientBoosting_search = GridSearchCV(estimator = full_pipe,
                                       param_grid = GradientBoosting_param,
                                       cv = 5,
                                       scoring = 'neg_mean_squared_error')
GradientBoosting_search.fit(train_X, train_y)

In [None]:
print('Best 파라미터 조합: ', GradientBoosting_search.best_params_)
print('교차검증 RMSE score: ', -GradientBoosting_search.best_score_)

Best 파라미터 조합:  {'regressor__learning_rate': np.float64(0.1), 'regressor__random_state': 0}
교차검증 RMSE score:  10.788240022704802


In [None]:
from sklearn.metrics import mean_squared_error
gd_pred = GradientBoosting_search.predict(test_X)
print('테스트 MSE: ', mean_squared_error(test_y, gd_pred))

테스트 MSE:  10.547041848465328


In [None]:
from sklearn.svm import SVR
full_pipe = Pipeline(
    [
        ('preprocess', preprocess),
        ('regressor', SVR())
    ]
)

In [None]:
SVR().get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [None]:
SVR_param = {'regressor__C': np.arange(1,100,20)}
SVR_search = GridSearchCV(estimator = full_pipe,
                          param_grid = SVR_param,
                          cv = 5,
                          scoring = 'neg_mean_squared_error')
SVR_search.fit(train_X, train_y)

In [None]:
print('Best 파라미터 조합: ', SVR_search.best_params_)
print('교차검증 RMSE score: ', -SVR_search.best_score_)

Best 파라미터 조합:  {'regressor__C': np.int64(1)}
교차검증 RMSE score:  8.905507590431395


In [None]:
from sklearn.metrics import mean_squared_error
svr_pred = SVR_search.predict(test_X)
print('테스트 MSE: ', mean_squared_error(test_y, svr_pred))

테스트 MSE:  10.141966042523617


In [1]:
# 모범 답안 작성 예시
import pandas as pd
import numpy as np
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_test.csv')

In [2]:
# 데이터 탐색
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   school    256 non-null    object 
 1   sex       256 non-null    object 
 2   paid      256 non-null    object 
 3   famrel    256 non-null    int64  
 4   freetime  256 non-null    int64  
 5   goout     252 non-null    float64
 6   Dalc      256 non-null    int64  
 7   Walc      256 non-null    int64  
 8   health    256 non-null    int64  
 9   absences  256 non-null    int64  
 10  grade     256 non-null    int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 22.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   school    110 non-null    object 
 1   sex       110 non-null    object 
 2   paid      110 non-null    object 
 3   famrel    110 n

In [3]:
# 데이터 분할
train_X = train.drop(['grade'], axis=1)
train_y = train['grade']

test_X = test.drop(['grade'], axis=1)
test_y = test['grade']

from sklearn.model_selection import train_test_split
train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size = 0.3, random_state = 1)
print(train_X.shape, train_y.shape, valid_X.shape, valid_y.shape)

(179, 10) (179,) (77, 10) (77,)


In [5]:
# 데이터 전처리
cat_columns = train_X.select_dtypes('object').columns.to_list()
num_columns = train_X.select_dtypes('number').columns.to_list()

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

onehotencoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
imputer = SimpleImputer(strategy = 'mean')

In [8]:
train_X_numeric_imputed = imputer.fit_transform(train_X[num_columns])
valid_X_numeric_imputed = imputer.transform(valid_X[num_columns])
test_X_numeric_imputed = imputer.transform(test_X[num_columns])

train_X_categorical_encoded = onehotencoder.fit_transform(train_X[cat_columns])
valid_X_categorical_encoded = onehotencoder.transform(valid_X[cat_columns])
test_X_categorical_encoded = onehotencoder.transform(test_X[cat_columns])

train_X_preprocessed = np.concatenate([train_X_numeric_imputed, train_X_categorical_encoded], axis=1)
valid_X_preprocessed = np.concatenate([valid_X_numeric_imputed, valid_X_categorical_encoded], axis=1)
test_X_preprocessed = np.concatenate([test_X_numeric_imputed, test_X_categorical_encoded], axis=1)

In [9]:
# 모델 적합

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 1)
rf.fit(train_X_preprocessed, train_y)

In [19]:
from sklearn.metrics import mean_squared_error
pred_val = rf.predict(valid_X_preprocessed)
print('valid RMSE: ', np.sqrt(mean_squared_error(valid_y, pred_val)))

valid RMSE:  3.2503522286653195


In [20]:
# 테스트 데이터로 예측

test_pred = rf.predict(test_X_preprocessed)
test_pred = pd.DataFrame(test_pred, columns = ['pred'])
test_pred.to_csv('result.csv', index = False)

In [24]:
# 교차검증(하이퍼파라미터 튜닝) #GridSearchCV

train_X_full = np.concatenate([train_X_preprocessed, valid_X_preprocessed], axis=0)
train_y_full = np.concatenate([train_y, valid_y], axis=0)

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [10,20,30],
              'min_samples_split': [2,5,10]}

rf = RandomForestRegressor(random_state=1)
rf_search = GridSearchCV(estimator = rf,
                         param_grid = param_grid,
                         cv = 3,
                         scoring = 'neg_root_mean_squared_error')

rf_search.fit(train_X_full, train_y_full)

print('교차검증 RMSE-score: ', -rf_search.best_score_)

교차검증 RMSE-score:  2.9513729164658664


In [29]:
test_pred2 = rf_search.predict(test_X_preprocessed)
test_pred2 = pd.DataFrame(test_pred2, columns = ['pred'])

test_pred2.to_csv('result.csv', index=False)