In [33]:
import pandas as pd
import pickle

In [34]:
# Location of the cleaned listings dataset (pickled object), relative to this notebook.
pickle_path = "../../sandbox/lucie_pinterova/listing_clean_price.pck"

# Deserialize the dataset from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
with open(pickle_path, mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

# Show the dtype of every column as the cell output.
data.dtypes

host_since                   int32
host_response_time        category
host_response_rate         float64
host_acceptance_rate       float64
host_is_superhost             bool
host_listings_count          int64
host_has_profile_pic          bool
host_identity_verified        bool
neighbourhood_cleansed    category
latitude                   float64
longitude                  float64
room_type                 category
accommodates                 int64
bathrooms_text             float64
beds                       float64
price                      float64
minimum_nights               int64
maximum_nights               int64
has_availability              bool
instant_bookable              bool
reviews_per_month          float64
count_verifications          int64
seasonal_availability      float64
season                    category
min_rating                 float64
max_rating                 float64
dtype: object

In [35]:
# Encoding: identify the columns that need to be encoded.

# Keep only the object- or category-typed columns of `data`, then record
# their names for the encoding step.
categorical_frame = data.select_dtypes(include=['object', 'category'])
categorical_columns = categorical_frame.columns
print(categorical_columns)

Index(['host_response_time', 'neighbourhood_cleansed', 'room_type', 'season'], dtype='object')


In [39]:
# Linear Regression
#
# Baseline model: one-hot encode the categorical columns, hold out 20% of the
# rows, impute missing feature values, fit an ordinary least-squares
# regression on `price`, and report R^2 on the held-out rows.

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

# Dummy (one-hot) encode the categorical columns.
dummy_df = pd.get_dummies(data, columns=categorical_columns)

# Split features (X) and target variable (y)
X = dummy_df.drop('price', axis=1)  # Features
y = dummy_df['price']  # Target variable

# Split into training and testing sets BEFORE imputing. Fitting the imputer on
# the full feature matrix would let statistics computed from the test rows
# leak into the training data and bias the evaluation. The split depends only
# on the number of rows and `random_state`, so the same rows are held out as
# when splitting an already-imputed array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute missing values in X: learn the fill values from the training rows
# only, then apply those same values to the test rows.
imputer = SimpleImputer(strategy='most_frequent')  # Other strategies: 'mean' or 'median'
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Initialize and fit linear regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# Predict on the testing set
y_pred_linear = linear_regressor.predict(X_test)

# Calculate R-squared score on the held-out rows
r2_linear = r2_score(y_test, y_pred_linear)
print("R2: {:.2f}".format(r2_linear))


R2: 0.04


In [None]:
# 