In [35]:
import pandas as pd
import pickle

In [87]:
pickle_path = "../../sandbox/filip_rott/listing_clean_price.pck"

# Load the pickle file
with open(pickle_path, 'rb') as f:
    data = pickle.load(f)

print(data.columns)

del_cols = ["last_scraped", "description", "latitude", "longitude", "host_has_profile_pic", "calendar_last_scraped", "bathrooms"]
data = data.drop(del_cols, axis=1)

Index(['last_scraped', 'description', 'host_since', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_listings_count', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'room_type',
       'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds',
       'price', 'minimum_nights', 'maximum_nights', 'has_availability',
       'calendar_last_scraped', 'instant_bookable', 'reviews_per_month',
       'amenities_count', 'count_verifications', 'seasonal_availability',
       'season', 'min_rating', 'max_rating', 'distance_from_city_centre'],
      dtype='object')


In [88]:
# encoding

categorical_columns = data.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)

Index(['host_response_time', 'host_is_superhost', 'neighbourhood_cleansed',
       'room_type', 'has_availability', 'season'],
      dtype='object')


In [89]:
# Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

# Dummy encode categorical columns

dummy_df = pd.get_dummies(data, columns=categorical_columns)

# Split features (X) and target variable (y)
X = dummy_df.drop('price', axis=1)  # Features
y = dummy_df['price']  # Target variable

# Impute missing values in X
imputer = SimpleImputer(strategy='most_frequent')  # You can use other strategies like 'median' or 'most_frequent'
X_imputed = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Initialize and fit linear regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# Predict on the testing set
y_pred_linear = linear_regressor.predict(X_test)

# Calculate R-squared score
r2_linear = r2_score(y_test, y_pred_linear)
print("R2: {:.2f}".format(r2_linear))


R2: 0.03


In [90]:
# Catboost

from catboost import CatBoostRegressor

# Split features (X) and target variable (y)
X = data.drop('price', axis=1)  # Features
y = data['price']  # Target variable

# Initialize and fit CatBoost regression model
catboost_regressor = CatBoostRegressor(iterations=1000,  # Adjust parameters as needed
                                       learning_rate=0.1,
                                       depth=7,
                                       loss_function='RMSE',
                                       verbose=0)
catboost_regressor.fit(X_train, y_train)

# Predict on the testing set
y_pred_catboost = catboost_regressor.predict(X_test)

# Calculate R-squared score
r2_catboost = r2_score(y_test, y_pred_catboost)
print("R2 (CatBoost): {:.2f}".format(r2_catboost))

R2 (CatBoost): 0.44
