In [96]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning 제거

In [97]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [98]:
df=pd.read_csv('datasets/train.csv')

In [99]:
df

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.989040,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.808110,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74106,14549287,4.605170,Apartment,Private room,{},1,1.0,Real Bed,flexible,False,...,40.709025,-73.939405,one room bushwick,Williamsburg,0,,https://a0.muscache.com/im/pictures/55162426/6...,11206.0,1.0,1.0
74107,13281809,5.043425,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,2.0,Real Bed,moderate,True,...,33.871549,-118.396053,Spacious Hermosa 2 BR on PCH,Hermosa Beach,16,93.0,https://a0.muscache.com/im/pictures/2b86560b-a...,90254,2.0,4.0
74108,18688039,5.220356,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",5,1.0,Real Bed,moderate,True,...,40.706749,-73.942377,Modern 2 Bedroom Apartment in Williamsburg,Williamsburg,43,94.0,https://a0.muscache.com/im/pictures/7fbe448c-5...,11206.0,2.0,2.0
74109,17045948,5.273000,Apartment,Entire home/apt,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",2,1.0,Real Bed,strict,True,...,40.738535,-74.000157,Designer's Apartment in HEART of NYC,West Village,0,,https://a0.muscache.com/im/pictures/b3971b63-0...,10011,0.0,2.0


### amenities 개수 추가하기

In [100]:
df['amenities']=list(map(lambda x:len(x.split(',')), df['amenities']))

In [101]:
columns=['log_price', 'property_type', 'room_type', 'amenities', 'accommodates',
       'bathrooms', 'bed_type', 'cancellation_policy', 'city',
       'latitude', 'longitude', 'neighbourhood', 'bedrooms', 'beds']

In [102]:
X=df[columns].copy()

In [103]:
X.shape

(74111, 14)

# total_train, total_test split

In [147]:
total_train, total_test = train_test_split(X, test_size=0.2, random_state=42)

In [148]:
total_train.shape, total_test.shape

((59288, 14), (14823, 14))

In [106]:
train, test = train_test_split(total_train, test_size=0.2, random_state=42)

In [107]:
train.shape, test.shape

((47430, 14), (11858, 14))

# 전처리 과정

1. 누락값 넣기:
    - neighbourhood...KNeighborsCLF
    - beds...means
    - bedrooms...means
    - bathrooms...means
</br></br>
2. 특성 추가하기:
    - GroupBy([city, Neighbourhoods]).mean()['log_price']
    - GroupBy([city, Neighbourhoods]).count()['id']
    - GroupBy([property_type]).mean()['log_price']
</br></br>    
3. 수치형 특성...?
    - StandardScaler
    </br></br>
4. 범주형 특성...?
    - OneHotEncoding

# null값 처리

### neighbourhood
- KNeighborsClassifier 사용

In [108]:
class NeighbourhoodImputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        X=X[X['neighbourhood'].notnull()].copy()
        X=X[['longitude', 'latitude', 'neighbourhood']].copy()
        feature=X.drop(columns='neighbourhood')

        self.label_enc=LabelEncoder()
        self.label_enc.fit(X['neighbourhood'])
        target=self.label_enc.transform(X['neighbourhood'])
        
        self.kne_clf=KNeighborsClassifier(n_neighbors=5)
        self.kne_clf.fit(feature,target)
        
        return self
    
    def transform(self, X, y=None):
        K = X[['longitude', 'latitude']].copy()
        K['neighbourhood']=self.kne_clf.predict(K).copy()
        K = K.drop(columns=['longitude', 'latitude']).copy()
        K['neighbourhood']=self.label_enc.inverse_transform(K['neighbourhood'])
        X['neighbourhood']=K['neighbourhood']
        X.drop(columns=['longitude', 'latitude'], inplace=True)
        return X


### property_type, room_type....침대의 개수(평균개수)

In [109]:
class BedsImputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        X['room_type']=df['room_type'].copy()
        self.avg_beds_by_room_type=X.groupby('room_type').mean()['beds']
        X.drop('room_type', inplace=True, axis=1)
        return self        
    
    def transform(self, X, y=None):
        X=X.copy()
        X['room_type']=df['room_type'].copy()
        X_null_beds = X[X['beds'].isnull()].index
        for idx in X_null_beds:
            X.at[idx, 'beds'] = self.avg_beds_by_room_type[X.at[idx, 'room_type']]
        return X
    
    def get_feature_names_out(self, feature_names_in):
        return feature_names_in

### property_type, room_type....침실의 개수(평균개수)

In [110]:
class BedroomsImputer(BaseEstimator, TransformerMixin):        
    
    def fit(self, X, y=None):
        X['room_type']=df['room_type'].copy()
        self.avg_bedrooms_by_room_type=X.groupby('room_type').mean()['bedrooms']
        X.drop('room_type', inplace=True, axis=1)
        return self
    
    def transform(self, X, y=None):
        X=X.copy()
        X['room_type']=df['room_type'].copy()
        X_null_bedrooms = X[X['bedrooms'].isnull()].index
        for idx in X_null_bedrooms:
            X.at[idx, 'bedrooms'] = self.avg_bedrooms_by_room_type[X.at[idx, 'room_type']]
        return X
    
    def get_feature_names_out(self, feature_names_in):
        return feature_names_in

### property_type, room_type....화장실의 개수(평균개수)

In [111]:
class BathroomsImputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        X['room_type']=df['room_type'].copy()
        self.avg_bathrooms_by_room_type=X.groupby('room_type').mean()['bathrooms']
        X.drop('room_type', inplace=True, axis=1)
        return self
    
    def transform(self, X, y=None):
        X=X.copy()
        X['room_type']=df.loc[:,'room_type']
        X_null_bathrooms = X[X['bathrooms'].isnull()].index
        for idx in X_null_bathrooms:
            X.at[idx, 'bathrooms'] = self.avg_bathrooms_by_room_type[X.at[idx, 'room_type']]
        return X
    
    def get_feature_names_out(self, feature_names_in):
        return feature_names_in

In [112]:
impute_pipeline=Pipeline([
    ('impute_neighbourhood', NeighbourhoodImputer()),
    ('impute_beds', BedsImputer()),
    ('impute_bedrooms', BedroomsImputer()),
    ('impute_bathrooms', BathroomsImputer())    
])

# 특성 추가

In [113]:
tc=train.copy()

In [114]:
ttc=test.copy()

### GroupBy(City, Neighbourhood) 가격 평균

In [115]:
class MeanPriceGrouped(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.mean_price_grouped = pd.DataFrame(X.groupby(['city', 'neighbourhood']).mean()['log_price'])
        return self
    
    def transform(self, X, y=None):
        mpg=self.mean_price_grouped
        mpg_=mpg.reset_index()
        total_mpg=pd.merge(left=mpg_, right=X, how='left',
                          on=['city', 'neighbourhood'], sort=False)
        total_mpg.rename(columns={'log_price_x':'grouped_mean_price', 'log_price_y':'log_price'}, 
                      inplace=True)
        X=total_mpg.copy()
        
        return X

### GroupBy(City, Neighbourhood) Count

In [116]:
class CountGrouped(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.cnt_grouped = pd.DataFrame(X.groupby(['city', 'neighbourhood']).count()['log_price'])
        return self
    
    def transform(self, X, y=None):
        cg=self.cnt_grouped
        cg_=cg.reset_index()
        total_cg=pd.merge(left=cg_, right=X, how='left',
                          on=['city', 'neighbourhood'], sort=False)
        
        total_cg.rename(columns={'log_price_x':'grouped_count','log_price_y':'log_price'}, inplace=True)
        X=total_cg.copy()
        X.drop(columns=['city','neighbourhood'], inplace=True)
        return X

### property_type...groupby해서 평균가격

In [117]:
class MeanPriceGroupedPro(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.mean_price_grouped = pd.DataFrame(X.groupby('property_type').mean()['log_price'])
        return self
    
    def transform(self, X, y=None):
        mpg=self.mean_price_grouped
        mpg_=mpg.reset_index()
        total_mpg=pd.merge(left=mpg_, right=X, how='left',
                          on=['property_type'], sort=False)
        total_mpg.rename(columns={'log_price_x':'property_mean_price', 'log_price_y':'log_price'}, 
                      inplace=True)
        X=total_mpg.copy()
        X.drop(columns='property_type', inplace=True)
        return X

In [118]:
add_attrib_pipeline=Pipeline(steps=[
    ('mpg', MeanPriceGrouped()),
    ('cg', CountGrouped()),
    ('mpgp', MeanPriceGroupedPro())
])

In [119]:
num_transformer=Pipeline(steps=[
    ('impute', impute_pipeline),
    ('add_attrib', add_attrib_pipeline),
])

In [120]:
num_prepared = num_transformer.fit_transform(train)

In [121]:
ohe=OneHotEncoder(sparse=False)

cat_cat=num_prepared[['bed_type', 'room_type', 'cancellation_policy']]
ohe.fit(cat_cat)
cat_prepared=pd.DataFrame(data=ohe.transform(cat_cat), columns=ohe.get_feature_names_out())

In [122]:
train_label=num_prepared['log_price'].copy()

In [123]:
num_prepared.drop(columns=['log_price', 'bed_type', 'room_type', 'cancellation_policy'], inplace=True)

std_scaler=StandardScaler()
std_scaler.fit(num_prepared)
num_prepared=pd.DataFrame(data=std_scaler.transform(num_prepared), columns=std_scaler.get_feature_names_out())

In [124]:
train_prepared=pd.concat([num_prepared, cat_prepared], axis=1)

In [125]:
train_prepared.shape

(47430, 21)

In [126]:
train_label.shape

(47430,)

In [127]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [128]:
X_train, X_test, y_train, y_test = train_test_split(train_prepared, train_label, test_size=0.2, random_state=42)

In [129]:
X_train.shape, X_test.shape

((37944, 21), (9486, 21))

In [130]:
y_train.shape, y_test.shape

((37944,), (9486,))

## LinearReg, RandomForestReg, SGDReg

In [57]:
scores = {}

In [58]:
# LinearRegression
lin_reg = LinearRegression()
scores['lin_scores'] = cross_val_score(lin_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)

# RandomForestRegressor
rf_reg = RandomForestRegressor(random_state=42)
scores['rf_scores'] = cross_val_score(rf_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)

# SGDRegressor
sgd_reg = SGDRegressor(penalty=None, eta0=0.001, random_state=42)
scores['sgd_scores'] = cross_val_score(sgd_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)

In [59]:
min_score = 1
best_model = 'best_model'

for k, v in scores.items():
    print('-{0:<14}/'.format(k), end=' ')
    score = np.sqrt(-v)
    print('/평균:', score.mean(), '/표준편차:', score.std())
    if min_score >= score.mean():
        min_score = score.mean()
        best_model = k
        
print('best_model:', best_model)

-lin_scores    / /평균: 0.4302918328975496 /표준편차: 0.006667393888454507
-rf_scores     / /평균: 0.43449473815057627 /표준편차: 0.006077491379839128
-sgd_scores    / /평균: 0.430345568147002 /표준편차: 0.0067070824045228775
best_model: lin_scores


In [60]:
from sklearn.model_selection import GridSearchCV

In [61]:
param_grid = {'n_estimators' : [10, 50, 100, 150, 200], 'max_features' : [4, 6, 8, 10]}

grid_search = GridSearchCV(rf_reg, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
%time grid_search.fit(X_train, y_train)

Wall time: 16min 47s


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_features': [4, 6, 8, 10],
                         'n_estimators': [10, 50, 100, 150, 200]},
             scoring='neg_mean_squared_error')

In [62]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 200}

In [63]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(np.sqrt(-mean_score), params)

0.44436536245005087 {'max_features': 4, 'n_estimators': 10}
0.4310976259780144 {'max_features': 4, 'n_estimators': 50}
0.4294229315513291 {'max_features': 4, 'n_estimators': 100}
0.42871278447361033 {'max_features': 4, 'n_estimators': 150}
0.42848948700211514 {'max_features': 4, 'n_estimators': 200}
0.4450825633116809 {'max_features': 6, 'n_estimators': 10}
0.4311961489191606 {'max_features': 6, 'n_estimators': 50}
0.4291794427321841 {'max_features': 6, 'n_estimators': 100}
0.4285490005943287 {'max_features': 6, 'n_estimators': 150}
0.4284740242700577 {'max_features': 6, 'n_estimators': 200}
0.4455651127072156 {'max_features': 8, 'n_estimators': 10}
0.43209848227583747 {'max_features': 8, 'n_estimators': 50}
0.4301188784349158 {'max_features': 8, 'n_estimators': 100}
0.4294511311686339 {'max_features': 8, 'n_estimators': 150}
0.4292965525230079 {'max_features': 8, 'n_estimators': 200}
0.44703315058597753 {'max_features': 10, 'n_estimators': 10}
0.43278159675444244 {'max_features': 10, 

In [64]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {'n_estimators':randint(low=200, high=600), 'max_features':randint(low=1, high=8)}

random_search = RandomizedSearchCV(rf_reg, param_distribs, n_iter=10, scoring='neg_mean_squared_error', cv=5, n_jobs=-1) 
# n_iter : 반복 횟수

%time random_search.fit(X_train, y_train)

Wall time: 29min 51s


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   n_jobs=-1,
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E5D3C4A130>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E5D3BDE970>},
                   scoring='neg_mean_squared_error')

In [65]:
random_search.best_params_

{'max_features': 5, 'n_estimators': 589}

In [66]:
cv_results = random_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(np.sqrt(-mean_score), params)

0.42789986598395563 {'max_features': 5, 'n_estimators': 589}
0.42801002139074357 {'max_features': 4, 'n_estimators': 376}
0.4308609446171341 {'max_features': 1, 'n_estimators': 217}
0.428111597390346 {'max_features': 6, 'n_estimators': 360}
0.427979936478499 {'max_features': 6, 'n_estimators': 465}
0.4287196495049835 {'max_features': 7, 'n_estimators': 269}
0.4297198282223856 {'max_features': 2, 'n_estimators': 209}
0.4293499387872352 {'max_features': 2, 'n_estimators': 400}
0.42928031806453715 {'max_features': 2, 'n_estimators': 476}
0.4304583500124867 {'max_features': 1, 'n_estimators': 338}


In [67]:
best_model = random_search.best_estimator_

## 

### ElasticNet

In [36]:
from sklearn.linear_model import ElasticNet

elastic=ElasticNet(random_state=42)
elastic_score=cross_val_score(elastic, X_train, y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
elastic_rmse=np.sqrt(-elastic_score.mean())

elastic_rmse

0.7164166623766047

### KNeighborsRegressor

In [37]:
from sklearn.neighbors import KNeighborsRegressor

kne_reg=KNeighborsRegressor()
kne_score=cross_val_score(kne_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
kne_rmse=np.sqrt(-kne_score.mean())

kne_rmse

0.4530340084920557

### RandomForestRegressor

In [38]:
from sklearn.ensemble import RandomForestRegressor

rf_reg=RandomForestRegressor(random_state=42)
rf_score=cross_val_score(rf_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
rf_rmse=np.sqrt(-rf_score.mean())
rf_rmse

0.43777216852129486

### SVR

In [39]:
from sklearn.svm import SVR

svr=SVR()
svr_score=cross_val_score(svr, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
svr_rmse=np.sqrt(-svr_score.mean())
svr_rmse

0.4202154664676149

### Voting

In [40]:
from sklearn.ensemble import VotingRegressor

voting_reg=VotingRegressor(estimators=[
    ('rf_reg', rf_reg),
    ('svr', svr),
    ('kne_reg', kne_reg)
])
voting_score=cross_val_score(voting_reg, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
voting_rmse=np.sqrt(-voting_score.mean())
voting_rmse

0.41982131943439754

# 모델 훈련하기

In [41]:
from sklearn.model_selection import GridSearchCV

In [42]:
from sklearn.ensemble import RandomForestRegressor

rf_reg=RandomForestRegressor(random_state=42)
rf_param={
    'n_estimators':[i for i in range(100,901,50)],
    'max_features':['auto', 'sqrt', 'log2'],
    'bootstrap':[True, False]
}
gs_rf=GridSearchCV(rf_reg, rf_param, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

%time gs_rf.fit(X_train, y_train)
rf_best=gs_rf.best_estimator_

gs_rf.best_score_

Wall time: 1h 26min 9s


-0.18346262959462425

In [175]:
np.sqrt(-gs_rf.best_score_)

0.42832537818184935

In [171]:
gs_rf.best_params_

{'bootstrap': True, 'max_features': 'sqrt', 'n_estimators': 600}

In [43]:
best_model=gs_rf.best_estimator_

# 특성 중요도 

In [44]:
feature_importances=best_model.feature_importances_

In [50]:
total_attribs=['property_mean_price', 'grouped_count', 'grouped_mean_price',
       'amenities', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type_Airbed', 'bed_type_Couch', 'bed_type_Futon',
       'bed_type_Pull-out Sofa', 'bed_type_Real Bed',
       'room_type_Entire home/apt', 'room_type_Private room',
       'room_type_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60']

In [51]:
sorted(zip(feature_importances, total_attribs), reverse=True)

[(0.20054366013107336, 'grouped_mean_price'),
 (0.14450363819311654, 'room_type_Entire home/apt'),
 (0.0995425379390661, 'bedrooms'),
 (0.09896619958024447, 'amenities'),
 (0.09754159935405605, 'accommodates'),
 (0.09297224672862503, 'room_type_Private room'),
 (0.07893106392984879, 'grouped_count'),
 (0.06443017488856179, 'bathrooms'),
 (0.0482756490483459, 'beds'),
 (0.030765606930260224, 'property_mean_price'),
 (0.01980206305237815, 'room_type_Shared room'),
 (0.006337318103647558, 'cancellation_policy_flexible'),
 (0.006007120921970835, 'cancellation_policy_strict'),
 (0.004688166816049921, 'cancellation_policy_moderate'),
 (0.002485626698468989, 'bed_type_Real Bed'),
 (0.001116765221828492, 'bed_type_Airbed'),
 (0.0009807014220092575, 'bed_type_Futon'),
 (0.0008918193238768178, 'bed_type_Couch'),
 (0.0007986329493098841, 'bed_type_Pull-out Sofa'),
 (0.0002484712375199116, 'cancellation_policy_super_strict_30'),
 (0.00017093752974198118, 'cancellation_policy_super_strict_60')]

# 최종 예측?

In [52]:
from sklearn.metrics import mean_squared_error

final_pred=best_model.predict(X_test)
final_rmse=mean_squared_error(y_test, final_pred, squared=False)
final_rmse

0.4332075707895804

# total_test로 예측하기

### total_test 전처리

In [149]:
num_prepared_test = num_transformer.transform(total_test)

In [163]:
ohe=OneHotEncoder(sparse=False)

cat_cat_test=num_prepared_test2[['bed_type', 'room_type', 'cancellation_policy']]
ohe.fit(cat_cat_test)
cat_prepared=pd.DataFrame(data=ohe.transform(cat_cat_test), columns=ohe.get_feature_names_out())

In [164]:
train_label_test=num_prepared_test2['log_price'].copy()

In [165]:
num_prepared_test2.drop(columns=['log_price', 'bed_type', 'room_type', 'cancellation_policy'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_prepared_test2.drop(columns=['log_price', 'bed_type', 'room_type', 'cancellation_policy'], inplace=True)


In [166]:
num_prepared=pd.DataFrame(data=std_scaler.transform(num_prepared_test2), columns=std_scaler.get_feature_names_out())

In [169]:
train_prepared_test=pd.concat([num_prepared, cat_prepared], axis=1)

### 예측

In [170]:
final_pred=best_model.predict(train_prepared_test)
final_rmse=mean_squared_error(train_label_test, final_pred, squared=False)
final_rmse

0.4272172238330357

# 신뢰구간

In [174]:
from scipy.stats import t

squared_errors = (final_pred - train_label_test)**2

confidence=0.95

n = len(squared_errors)

dof=n-1

mse=np.mean(squared_errors)

sample_std=np.std(squared_errors, ddof=1)

std_err=sample_std/n**0.5

mse_ci=t.interval(confidence, dof, mse, std_err)

rmse_ci = np.sqrt(mse_ci)

rmse_ci

array([0.41847432, 0.43578475])