### Step 1 – Load Data

In [21]:
import os

print("Current working directory:", os.getcwd())

Current working directory: /Users/babak/Github/ml-projects/01_airbnb_price_regression/notebooks


In [22]:
# Manually set the working directory to your notebook location
target_dir = "/Users/babak/Github/ml-projects/01_airbnb_price_regression/notebooks"
os.chdir(target_dir)

print("Now working in:", os.getcwd())

Now working in: /Users/babak/Github/ml-projects/01_airbnb_price_regression/notebooks


In [23]:
import pandas as pd

df = pd.read_csv('../data/processed/featured_listings.csv')
print(f"Loaded data with shape: {df.shape}")
df.head()

Loaded data with shape: (5596, 23)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,price_log,days_since_last_review,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,13188,Garden level studio in ideal loc.,51466,Emma,,Riley Park,49.24773,-123.10509,Entire home/apt,136.0,...,1.89,1,109,56,Municipal registration number: 25-156058<br />...,4.919981,66.0,False,False,False
1,13358,Downtown Designer suite,52116,Lynn,,Downtown,49.281174,-123.125931,Entire home/apt,225.0,...,3.11,1,135,50,Municipal registration number: 25-157257,5.420535,66.0,False,False,False
2,16254,Close to PNE/Hastings Park Garden level suite,63238,Jason,,Hastings-Sunrise,49.27721,-123.04086,Entire home/apt,680.0,...,0.09,1,249,0,,6.523562,1075.0,False,False,False
3,16611,"1 block to skytrain station, shops,restaurant,...",58512,Q,,Grandview-Woodland,49.26339,-123.07145,Entire home/apt,,...,0.03,5,89,0,,,2700.0,False,False,False
4,17765,"Central location, nice little apartment",68672,James,,Mount Pleasant,49.26132,-123.10845,Entire home/apt,,...,1.32,1,0,0,Municipal registration number: 21-156705,,1377.0,False,False,False


### Step 2 – Define Target + Features

In [45]:
# STEP 2: Clean and Prepare Features

# Drop rows where price is missing
df = df.dropna(subset=['price'])

# Drop unnecessary or ID columns
drop_cols = ['id', 'name', 'host_id', 'host_name', 'last_review', 'license']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# Separate target and features
y = df['price']
X = df.drop(columns=['price'])

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Ensure index alignment
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Print confirmation
print("Prepared features and target.")
print("X shape:", X.shape)
print("Remaining dtypes:\n", X.dtypes.value_counts())


Prepared features and target.
X shape: (4584, 39)
Remaining dtypes:
 bool       28
float64     6
int64       5
Name: count, dtype: int64


### Step 3 – Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Explicitly set the target column name
target = 'price'

# Create feature and target sets
X = df.drop(columns=[target])
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Reset indices to ensure alignment
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Print confirmation
print(f"Training rows: {X_train.shape[0]} | Test rows: {X_test.shape[0]}")


Training rows: 3667 | Test rows: 917


### Step 4 – Baseline Model

In [43]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Predict the mean of y_train
mean_price = y_train.mean()
y_pred_baseline = pd.Series([mean_price] * len(y_test), index=y_test.index)

# Check for missing values in y_test or predictions
print("Missing values in y_test:", y_test.isnull().sum())
print("Missing values in y_pred_baseline:", y_pred_baseline.isnull().sum())

# Apply mask safely
mask = ~y_test.isnull()
mae = mean_absolute_error(y_test[mask], y_pred_baseline[mask])
rmse = np.sqrt(mean_squared_error(y_test[mask], y_pred_baseline[mask]))

# Show metrics
print(f"Baseline Mean Prediction: {mean_price:.2f}")
print(f"Baseline MAE: {mae:.2f}")
print(f"Baseline RMSE: {rmse:.2f}")


Missing values in y_test: 0
Missing values in y_pred_baseline: 0
Baseline Mean Prediction: 227.33
Baseline MAE: 147.56
Baseline RMSE: 710.55


### Step 5 – Linear Regression

In [44]:
from sklearn.linear_model import LinearRegression

# Train
lr_model = LinearRegression()

print("Non-numeric columns in X_train:")
print(X_train.select_dtypes(include='object').columns)

lr_model.fit(X_train, y_train)

# Predict
y_pred_lr = lr_model.predict(X_test)

# Evaluate
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print(f"🔹 Linear Regression MAE: {mae_lr:.2f}")
print(f"🔹 Linear Regression RMSE: {rmse_lr:.2f}")

Non-numeric columns in X_train:
Index(['neighbourhood', 'room_type'], dtype='object')


ValueError: could not convert string to float: 'Downtown'