In [8]:
import os

# Show the directory the kernel is currently running in.
current_dir = os.getcwd()
print("Current working directory:", current_dir)

Current working directory: c:\Users\Babak.Baradaranhezav\ml-projects\ml_airbnb_price_regression


In [9]:
from pathlib import Path
import os

# Make sure the kernel runs from the repository root (the directory that
# contains ".git"). If the current directory is not the root, walk up through
# its ancestors, switch to the first one that has ".git", and fail loudly when
# no ancestor qualifies.
cwd = Path.cwd()
if (cwd / ".git").exists():
    print("Already in repo root:", cwd)
else:
    # Path.parents is ordered nearest-first, so this picks the closest ancestor.
    repo_root = next(
        (ancestor for ancestor in cwd.parents if (ancestor / ".git").exists()),
        None,
    )
    if repo_root is None:
        raise FileNotFoundError("Could not find .git repo root. Are you inside the correct project folder?")
    os.chdir(repo_root)
    print("Working directory set to repo root:", repo_root)


Already in repo root: c:\Users\Babak.Baradaranhezav\ml-projects\ml_airbnb_price_regression


In [10]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned listings CSV. The path is relative, so it is resolved
# against the current working directory.
cleaned_listings_path = 'data/processed/cleaned_listings.csv'  # Update this path as needed
df = pd.read_csv(cleaned_listings_path)

## Step 1: Feature Engineering – Host Age

**Problem**  
`host_since` is a date column but stored as text. We want to understand how experienced a host is.

**Goal**  
Calculate how long a host has been active (in days).

**Approach**  
Convert `host_since` to datetime, then subtract it from today's date to get `host_age_days`.

In [11]:
# Derive host tenure: parse 'host_since' as a datetime and count the whole days
# between that date and the moment this cell runs.
# NOTE(review): the reference point is the run date, so 'host_age_days' shifts
# every time the notebook is re-executed -- consider anchoring to a fixed
# snapshot date if reproducible features are required.
if 'host_since' not in df.columns:
    print("'host_since' not found in dataset.")
else:
    # errors='coerce' turns unparseable values into NaT instead of raising.
    host_since_parsed = pd.to_datetime(df['host_since'], errors='coerce')
    df['host_since'] = host_since_parsed
    host_tenure = pd.to_datetime('today') - host_since_parsed
    df['host_age_days'] = host_tenure.dt.days
    print("'host_since' converted and 'host_age_days' created.")

'host_since' converted and 'host_age_days' created.


## Step 2: Feature Engineering – Last Review & Categorical Encoding

**Problem**  
- `last_review` is a date stored as text.
- Categorical features like `room_type` must be encoded.

**Goal**  
Convert `last_review` into `days_since_last_review` and one-hot encode selected categorical variables.

**Approach**  
- Convert `last_review` to datetime.
- One-hot encode `room_type`.

In [12]:
# Review recency: parse 'last_review' as a datetime and measure the whole days
# elapsed between that date and the moment this cell runs.
has_last_review = 'last_review' in df.columns
if not has_last_review:
    print("'last_review' column not found. Skipping.")
else:
    # errors='coerce' turns unparseable values into NaT instead of raising.
    df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')
    elapsed_since_review = pd.Timestamp('today') - df['last_review']
    df['days_since_last_review'] = elapsed_since_review.dt.days
    print("Feature engineered: days_since_last_review")

# One-hot encode room_type
# Each listed categorical column is expanded into indicator columns named
# "<col>_<category>". drop_first=True omits the first category's indicator.
# The original categorical column itself is left in the frame.
cat_cols = ['room_type']
for col in cat_cols:
    if col in df.columns:
        one_hot = pd.get_dummies(df[col], prefix=col, drop_first=True)
        # Remove indicator columns left behind by an earlier execution of this
        # cell before appending the fresh ones; without this, re-running the
        # cell would leave the frame with duplicate column names.
        df = pd.concat(
            [df.drop(columns=one_hot.columns, errors='ignore'), one_hot],
            axis=1,
        )
        print(f"One-hot encoded: {col}")
    else:
        print(f"Column '{col}' not found.")

Feature engineered: days_since_last_review
One-hot encoded: room_type


## Step 3: Feature Engineering – Review Scores

**Problem**  
Many review score columns (like `review_scores_rating`, `review_scores_accuracy`, etc.) are numeric but may have missing values.

**Goal**  
- Understand how well-rated each listing is.
- Create an aggregate score or handle missing scores properly.

**Approach**  
- Identify all review score columns.
- Fill missing values with column means or flags.
- (Optional) Create an average score column.


In [13]:
# Identify review score columns
review_cols = [col for col in df.columns if col.startswith('review_scores_')]
print(f"Found review score columns: {review_cols}")

# Only numeric columns can be mean-imputed and averaged. Using pandas' dtype
# helper (instead of comparing against the strings 'float64'/'int64') also
# covers float32 and nullable Int64/Float64 columns, and keeps any non-numeric
# 'review_scores_*' column out of the row-wise mean below.
numeric_review_cols = [
    col for col in review_cols if pd.api.types.is_numeric_dtype(df[col])
]

# Fill missing values with column means
# NOTE(review): the mean is taken over the whole frame; if a train/test split
# happens later, this imputation uses test rows too -- confirm that is acceptable.
for col in numeric_review_cols:
    df[col] = df[col].fillna(df[col].mean())
    print(f"Filled missing values in '{col}' with column mean.")

# Optional: Create a total or average score
# NOTE(review): this assumes all score columns share the same scale -- verify
# against the dataset before interpreting the average.
if numeric_review_cols:
    df['avg_review_score'] = df[numeric_review_cols].mean(axis=1)
    print("Created 'avg_review_score' column.")


Found review score columns: ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value']
Filled missing values in 'review_scores_rating' with column mean.
Filled missing values in 'review_scores_accuracy' with column mean.
Filled missing values in 'review_scores_cleanliness' with column mean.
Filled missing values in 'review_scores_checkin' with column mean.
Filled missing values in 'review_scores_communication' with column mean.
Filled missing values in 'review_scores_location' with column mean.
Filled missing values in 'review_scores_value' with column mean.
Created 'avg_review_score' column.


## Step 4: Save Feature-Engineered Dataset

In [14]:
# Write the feature-engineered frame to CSV for the modeling stage. The row
# index is omitted from the file. The path is relative to the current working
# directory.
featured_listings_path = 'data/processed/featured_listings.csv'
df.to_csv(featured_listings_path, index=False)
print("Saved feature-engineered dataset.")

Saved feature-engineered dataset.
