In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
df9 = pd.read_csv("df_9.csv")
test_ids = df9.ID.iloc[7000:].values # save IDs for later output
df9.drop(["Unnamed: 0", "ID", 'price', 'host_has_profile_pic_t','host_identity_verified_t'], axis=1, inplace=True)
y_train = df9['log_price'].iloc[:7000].values
#y_test = np.zeros(3000)
X_train = df9.drop(['log_price'], axis=1).iloc[:7000].values
X_test = df9.drop(['log_price'], axis=1).iloc[7000:].values

#### PCA & Random Forest

In [9]:
pipe_pca_rf = make_pipeline(StandardScaler(), 
                            PCA(n_components=0.9, svd_solver='full'),
                            RandomForestRegressor(random_state=42))

pipe_pca_rf.fit(X_train, y_train)

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(pipe_pca_rf, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

pipe_pca_rf.fit(X_train, y_train)


y_pred = pipe_pca_rf.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.44153732506373433


In [7]:
# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("PCARandomForestPredictions.csv", index=False, header=True)

#### PCA & Extra Trees

In [8]:
pipe_pca_xtra = make_pipeline(StandardScaler(), 
                            PCA(n_components=0.95, svd_solver='full'),
                            ExtraTreesRegressor(random_state=42))

pipe_pca_xtra.fit(X_train, y_train)

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(pipe_pca_xtra, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

pipe_pca_xtra.fit(X_train, y_train)


y_pred = pipe_pca_xtra.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.4342546687122259


In [None]:
# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("PCAExtraTreesPredictions.csv", index=False, header=True)

#### XGBoost Regressor

In [4]:
pipe_xgb = make_pipeline(StandardScaler(),
                         XGBRegressor(random_state=42))

pipe_xgb.fit(X_train, y_train)

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(pipe_xgb, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

pipe_xgb.fit(X_train, y_train)


y_pred = pipe_xgb.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.4054048597083078


In [5]:
# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)

In [7]:
pipe_xgb.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('standardscaler', StandardScaler()),
                ('xgbregressor',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_

In [None]:
pipe_xgb.