# Task 2 - Apartment Rent Prediction using Decision Tree Regressor  

**Objective:** Predict rental prices using Spark MLlib Decision Tree Regressor.  

This notebook follows the same preprocessing as the Naive Bayes classification notebook,  
but replaces classification with a **Decision Tree Regressor** for continuous price prediction.


In [2]:
pip install pyspark


Collecting pyspark
  Downloading pyspark-4.0.0.tar.gz (434.1 MB)
     ---------------------------------------- 0.0/434.1 MB ? eta -:--:--
     --------------------------------------- 0.5/434.1 MB 14.9 MB/s eta 0:00:30
     --------------------------------------- 1.3/434.1 MB 16.5 MB/s eta 0:00:27
     --------------------------------------- 2.5/434.1 MB 17.9 MB/s eta 0:00:25
     --------------------------------------- 3.6/434.1 MB 19.1 MB/s eta 0:00:23
     --------------------------------------- 4.3/434.1 MB 19.7 MB/s eta 0:00:22
     --------------------------------------- 5.1/434.1 MB 18.2 MB/s eta 0:00:24
      -------------------------------------- 6.1/434.1 MB 19.6 MB/s eta 0:00:22
      -------------------------------------- 6.9/434.1 MB 20.2 MB/s eta 0:00:22
      -------------------------------------- 7.8/434.1 MB 19.9 MB/s eta 0:00:22
      -------------------------------------- 8.7/434.1 MB 19.9 MB/s eta 0:00:22
      ------------------------------------- 10.0/434.1 MB 20.0

  DEPRECATION: pyspark is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559
  error: subprocess-exited-with-error
  
  × Running setup.py install for pyspark did not run successfully.
  │ exit code: 1
  ╰─> [2430 lines of output]
      running install
      running build
      running build_py
      creating build
      creating build\lib
      creating build\lib\pyspark
      copying pyspark\accumulators.py -> build\lib\pyspark
      copying pyspark\conf.py -> build\lib\pyspark
      copying pyspark\daemon.py -> build\lib\pyspark
      copying pyspark\errors_doc_gen.py -> build\lib\pyspark
      copying pyspark\find_spark_home.py -> build\lib\pyspark
      copying pyspark\install.py -> build\lib\pyspark
      copying p

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder \
    .appName("RentalPriceDecisionTree") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

print(f"Spark Version: {spark.version}")

Spark Version: 4.0.0


In [None]:
## Discover and Visualize the Data

# Load CSV directly with Spark (more robust than pandas for big CSVs)
path = "apartments_for_rent_classified_100K.csv"
df_spark = spark.read.csv(
    path,
    header=True,
    sep=";",
    inferSchema=True,
    multiLine=True,
    escape='"'
)

# Rename columns: lower-case, replace spaces with underscores
for c in df_spark.columns:
    df_spark = df_spark.withColumnRenamed(c, c.strip().lower().replace(" ", "_"))

print("\n=== DATA EXPLORATION ===")
print("Schema:")
df_spark.printSchema()

print("\nBasic statistics:")
df_spark.describe().show()

print("\nData types and null counts:")
df_spark.select([count(when(col(c).isNull(), c)).alias(c) for c in df_spark.columns]).show()

# Price statistics
df_spark.select("price").describe().show()

## Prepare the Data for Machine Learning Algorithms
print("\n=== Data Preparation ===")

from pyspark.sql.functions import regexp_replace, col, when, abs, expr

print("\n=== Data Preparation ===")

# --- Step 1: Clean PRICE column ---
df_clean = df_spark.withColumn(
    "price_num",
    regexp_replace(col("price").cast("string"), "[^0-9]", "")  # keep only digits
)

df_clean = df_clean.withColumn("price_num", expr("try_cast(price_num as double)"))
df_clean = df_clean.filter(col("price_num").isNotNull() & (col("price_num") > 0))

# --- Step 2: Clean and cast OTHER numeric columns safely ---
numeric_cols = ['bedrooms', 'bathrooms', 'square_feet', 'latitude', 'longitude']

for col_name in numeric_cols:
    if col_name in df_clean.columns:
        df_clean = df_clean.withColumn(
            col_name,
            regexp_replace(col(col_name).cast("string"), "[^0-9.-]", "")  # keep digits, minus, dot
        )
        df_clean = df_clean.withColumn(col_name, expr(f"try_cast({col_name} as double)"))

# --- Step 3: Fill missing values with safe defaults ---
df_clean = df_clean.fillna({
    'bedrooms': 1.0,
    'bathrooms': 1.0,
    'square_feet': 500.0,
    'latitude': 0.0,
    'longitude': 0.0,
    'cityname': 'Unknown',
    'state': 'Unknown',
    'pets_allowed': 'Unknown',
    'has_photo': 'No',
    'source': 'Unknown'
})

# --- Step 4: Feature engineering ---
df_clean = df_clean.withColumn(
    "price_per_sqft",
    when(col("square_feet") > 0, col("price_num") / col("square_feet")).otherwise(0.0)
)

df_clean = df_clean.withColumn("total_rooms",
    (col("bedrooms") + col("bathrooms"))
)

df_clean = df_clean.withColumn("location_score",
    abs(col("latitude")) + abs(col("longitude"))
)

df_clean = df_clean.withColumn("room_density",
    when(col("square_feet") > 0, (col("bedrooms") + col("bathrooms")) / col("square_feet")).otherwise(0.0)
)

print("Cleaning complete")
print("Original rows:", df_spark.count())
print("Rows after cleaning:", df_clean.count())

df_clean.select("price", "price_num", "bedrooms", "bathrooms", "square_feet").show(20, truncate=False)





=== DATA EXPLORATION ===
Schema:
root
 |-- id: long (nullable = true)
 |-- category: string (nullable = true)
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- amenities: string (nullable = true)
 |-- bathrooms: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- fee: string (nullable = true)
 |-- has_photo: string (nullable = true)
 |-- pets_allowed: string (nullable = true)
 |-- price: string (nullable = true)
 |-- price_display: string (nullable = true)
 |-- price_type: string (nullable = true)
 |-- square_feet: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- cityname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- source: string (nullable = true)
 |-- time: integer (nullable = true)


Basic statistics:
+-------+--------------------+--------------------+--------------------+--

In [23]:
print("Original rows:", df_spark.count())
print("Rows after cleaning:", df_clean.count())
df_clean.select("price", "price_num").show(20, truncate=False)


Original rows: 99492
Rows after cleaning: 99491
+-----+---------+
|price|price_num|
+-----+---------+
|2195 |2195.0   |
|1250 |1250.0   |
|1395 |1395.0   |
|1600 |1600.0   |
|975  |975.0    |
|1250 |1250.0   |
|1600 |1600.0   |
|1300 |1300.0   |
|795  |795.0    |
|2150 |2150.0   |
|1795 |1795.0   |
|3195 |3195.0   |
|2395 |2395.0   |
|7800 |7800.0   |
|720  |720.0    |
|2000 |2000.0   |
|729  |729.0    |
|1045 |1045.0   |
|1800 |1800.0   |
|1250 |1250.0   |
+-----+---------+
only showing top 20 rows


In [27]:
print("\n=== DECISION TREE REGRESSOR TRAINING ===")

# String indexing for categorical variables
categorical_cols = ['state', 'has_photo']
indexers = []
indexed_cols = []

for col_name in categorical_cols:
    if col_name in df_clean.columns:
        indexer = StringIndexer(
            inputCol=col_name,
            outputCol=f"{col_name}_idx",
            handleInvalid="keep"
        )
        indexers.append(indexer)
        indexed_cols.append(f"{col_name}_idx")

# Final feature set
final_features = [
    'bedrooms', 'bathrooms', 'square_feet',
    'price_per_sqft', 'total_rooms', 'location_score', 'room_density'
] + indexed_cols

assembler = VectorAssembler(
    inputCols=final_features,
    outputCol="features"
)

# Decision Tree Regressor (use price_num instead of price)
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price_num", maxDepth=10,maxBins=100)

# Pipeline
pipeline = Pipeline(stages=indexers + [assembler, dt])

# Train/test split
train_data, test_data = df_clean.randomSplit([0.8, 0.2], seed=42)
print(f"Training data: {train_data.count()} rows")
print(f"Test data: {test_data.count()} rows")

# Train model
dt_model = pipeline.fit(train_data)

# Predictions
predictions = dt_model.transform(test_data)
predictions.select("features", "price_num", "prediction").show(5, truncate=False)

# Evaluate model (also use price_num here)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="price_num", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Decision Tree RMSE = {rmse:.2f}")

# Model summary
tree_model = dt_model.stages[-1]
print("Decision Tree Depth:", tree_model.depth)
print("Number of Nodes:", tree_model.numNodes)



=== DECISION TREE REGRESSOR TRAINING ===
Training data: 79499 rows
Test data: 19992 rows
+----------------------------------------------------------------------------------------+---------+------------------+
|features                                                                                |price_num|prediction        |
+----------------------------------------------------------------------------------------+---------+------------------+
|[3.0,2.0,1661.0,0.839855508729681,5.0,111.92859999999999,0.0030102347983142685,5.0,0.0] |1395.0   |1490.2280701754387|
|[2.0,2.0,975.0,1.1384615384615384,4.0,111.92859999999999,0.0041025641025641026,5.0,0.0] |1110.0   |1109.61872909699  |
|[1.0,1.0,946.0,2.9651162790697674,2.0,168.2103,0.0021141649048625794,37.0,0.0]          |2805.0   |2765.5443037974683|
|[2.0,2.0,1078.0,0.9257884972170687,4.0,111.92859999999999,0.0037105751391465678,5.0,0.0]|998.0    |1009.2026666666667|
|[2.0,2.0,1138.0,1.0764499121265378,4.0,111.92859999999999,0.003514938

RMSE: 390.03
on average, the model’s predictions are off by about $390 compared to actual rental prices. For rental datasets (prices often $1000–$5000), that’s pretty reasonable.

Tree depth: 10
fair deep tree, able to capture a lot of complexity.

Number of nodes: 1621
Shows the model created a big decision structure with many splits.

In [29]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

print("\n=== DECISION TREE REGRESSOR WITH FINE-TUNING ===")

# Reuse your pipeline (with indexers, assembler, dt)
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price_num")

pipeline = Pipeline(stages=indexers + [assembler, dt])

# Define parameter grid to search
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [5, 10, 15])        # how deep the tree can go
             .addGrid(dt.maxBins, [64, 128, 256])       # controls handling of categorical features
             .addGrid(dt.minInstancesPerNode, [1, 5, 10])  # minimum rows per leaf
             .build())

# Evaluator (use RMSE, but you can also try R2 or MAE)
evaluator = RegressionEvaluator(
    labelCol="price_num",
    predictionCol="prediction",
    metricName="rmse"
)

# CrossValidator (k-fold CV, here 3 folds)
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,             # 3-fold cross-validation
    parallelism=2           # speed up with parallel jobs
)

# Train with cross-validation
cv_model = cv.fit(train_data)

# Predictions on test set
predictions = cv_model.transform(test_data)

# Evaluate RMSE
rmse = evaluator.evaluate(predictions)
print(f"Fine-tuned Decision Tree RMSE = {rmse:.2f}")

# Best model info
best_model = cv_model.bestModel.stages[-1]
print("Best maxDepth:", best_model.getMaxDepth())
print("Best maxBins:", best_model.getMaxBins())
print("Best minInstancesPerNode:", best_model.getMinInstancesPerNode())
print("Best Tree Depth:", best_model.depth)
print("Best Tree Nodes:", best_model.numNodes)



=== DECISION TREE REGRESSOR WITH FINE-TUNING ===
Fine-tuned Decision Tree RMSE = 381.72
Best maxDepth: 15
Best maxBins: 256
Best minInstancesPerNode: 10
Best Tree Depth: 15
Best Tree Nodes: 11553


In [30]:
import random

# Pick a random row index
sample_idx = random.randint(0, predictions.count() - 1)

# Collect one row
sample_row = predictions.collect()[sample_idx]

actual = sample_row["price_num"]
predicted = sample_row["prediction"]
error = actual - predicted

print(f"\nRandom sample index {sample_idx}:")
print(f"  Actual Price:    {actual:,.2f}")
print(f"  Predicted Price: {predicted:,.2f}")
print(f"  Error:           {error:,.2f}")



Random sample index 13:
  Actual Price:    993.00
  Predicted Price: 989.60
  Error:           3.40
