In [1]:
import pandas as pd
import pickle

In [3]:
pickle_path = "../../sandbox/veronika_junkova/data_for_model.pck"

# Load the pickle file
with open(pickle_path, 'rb') as f:
    data = pickle.load(f)

print(data.columns)
data.drop("reviews_per_month", axis=1, inplace=True)
data.drop("min_rating", axis=1, inplace=True)
data.drop("bedrooms", axis=1, inplace=True)


Index(['host_id', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'price', 'minimum_nights',
       'maximum_nights', 'has_availability', 'instant_bookable',
       'reviews_per_month', 'amenities_count', 'count_verifications',
       'seasonal_availability', 'season', 'min_rating', 'max_rating',
       'distance_from_city_centre'],
      dtype='object')


In [4]:
# categorical columns

categorical_columns = ['host_response_time', 'neighbourhood_cleansed', 'room_type', 'season']

print(categorical_columns)

['host_response_time', 'neighbourhood_cleansed', 'room_type', 'season']


In [5]:
# Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer


dummy_df = pd.get_dummies(data, columns=categorical_columns)

X = dummy_df.drop('price', axis=1)  # Features
y = dummy_df['price']  # Target variable

imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

y_pred_linear = linear_regressor.predict(X_test)

r2_linear = r2_score(y_test, y_pred_linear)
print("R2: {:.2f}".format(r2_linear))


R2: 0.02


In [7]:
# Catboost

from catboost import CatBoostRegressor

X = data.drop('price', axis=1) 
y = data['price']

catboost_regressor = CatBoostRegressor(iterations=1000,
                                       learning_rate=0.1,
                                       depth=7,
                                       loss_function='RMSE',
                                       verbose=0)
catboost_regressor.fit(X_train, y_train)

y_pred_catboost = catboost_regressor.predict(X_test)

# Calculate R-squared score
r2_catboost = r2_score(y_test, y_pred_catboost)
print("R2 (CatBoost): {:.2f}".format(r2_catboost))

R2 (CatBoost): 0.67


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

dummy_df = pd.get_dummies(data, columns=categorical_columns)

# Split features (X) and target variable (y)
X = dummy_df.drop('price', axis=1)  # Features
y = dummy_df['price']  # Target variable

# Impute missing values in X
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

hyper_grid = {'n_estimators': [100, 200, 300, 500],
               'max_features': [4,8,9],
               'min_samples_split': [5,10, 20]}

#reinstantiate RandomForestRegressor regressor with empty parameter set
forest_model_cv = RandomForestRegressor()

# Instantiate the GridSearchCV with forest_model_cv  as estimator
forest = GridSearchCV(estimator = forest_model_cv, param_grid = hyper_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

forest.fit(X_train, y_train.values.ravel()) #values.ravel() flattened array expected by RandomForestRegressor

print(forest.best_params_)

forest_model_opt= forest.best_estimator_
y_pred_forest = forest_model_opt.predict(X_test)


r2_forest = r2_score(y_test, y_pred_forest)
print("R2: {:.2f}".format(r2_forest))

Fitting 3 folds for each of 36 candidates, totalling 108 fits
{'max_features': 9, 'min_samples_split': 5, 'n_estimators': 200}
R2: 0.45
