# Task 2 - Apartment Rent Prediction using Decision Tree Regressor  

**Objective:** Predict rental prices using Spark MLlib Decision Tree Regressor.  

This notebook follows the same preprocessing as the Naive Bayes classification notebook,  
but replaces classification with a **Decision Tree Regressor** for continuous price prediction.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Session-level Spark settings, kept in one table so they are easy to scan:
# adaptive query execution, adaptive partition coalescing, and the Kryo
# serializer.
spark_settings = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

# Build (or reuse, if one is already running) the notebook's Spark session.
session_builder = SparkSession.builder.appName("RentalPriceDecisionTree")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)
spark = session_builder.getOrCreate()

print(f"Spark Version: {spark.version}")

In [None]:
## Discover and Visualize the Data

# Load the semicolon-delimited, cp1252-encoded CSV with pandas, then convert
# the result to a Spark DataFrame.
path = "apartments_for_rent_classified_100K.csv"
df = pd.read_csv(path, sep=";", engine="python", encoding="cp1252")

# Normalise headers: trim, lower-case, and replace whitespace runs with "_"
# so columns can be referenced by simple identifiers below.
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace(r"\s+", "_", regex=True))

df_spark = spark.createDataFrame(df)

print("\n=== DATA EXPLORATION ===")
print("Schema:")
df_spark.printSchema()

print("\nBasic statistics:")
df_spark.describe().show()

print("\nData types and null counts:")
# pandas represents missing numeric cells as NaN, and those can reach Spark as
# NaN rather than null. Count both for float/double columns so the report does
# not under-state missing data. isnan() is only applied to floating-point
# columns; every other column is checked for null only.
floating_types = ("float", "double")
missing_counts = [
    count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
    if dtype in floating_types
    else count(when(col(c).isNull(), c)).alias(c)
    for c, dtype in df_spark.dtypes
]
df_spark.select(missing_counts).show()

# Price statistics
df_spark.select("price").describe().show()

## Prepare the Data for Machine Learning Algorithms
print("\n=== Data Preparation ===")

# Filter and clean.
# Cast the label to double first so the NaN check below is well-typed, then
# keep only rows with a usable, strictly positive price. The explicit isnan()
# test matters: Spark orders NaN above every other numeric value, so
# `price > 0` alone would let NaN labels through to training.
df_clean = df_spark.withColumn("price", col("price").cast("double"))
df_clean = df_clean.filter(
    col("price").isNotNull() & ~isnan(col("price")) & (col("price") > 0)
)

# Fill missing values with fixed placeholders.
# NOTE(review): 0.0 for latitude/longitude is a placeholder, not a real
# location - confirm this is acceptable for downstream features.
df_clean = df_clean.fillna({
    'bedrooms': 1.0,
    'bathrooms': 1.0,
    'square_feet': 500.0,
    'latitude': 0.0,
    'longitude': 0.0,
    'cityname': 'Unknown',
    'state': 'Unknown',
    'pets_allowed': 'Unknown',
    'has_photo': 'No',
    'source': 'Unknown'
})

# Cast numeric columns to double so arithmetic below is uniform.
numeric_cols = ['bedrooms', 'bathrooms', 'square_feet', 'latitude', 'longitude']
for col_name in numeric_cols:
    if col_name in df_clean.columns:
        df_clean = df_clean.withColumn(col_name, col(col_name).cast("double"))

# Feature engineering

# WARNING: price_per_sqft is computed from the label (price). It is fine for
# exploration, but it must not be fed to a model that predicts price.
df_clean = df_clean.withColumn("price_per_sqft",
    when(col("square_feet") > 0, col("price") / col("square_feet")).otherwise(0.0))

df_clean = df_clean.withColumn("total_rooms",
    col("bedrooms") + col("bathrooms"))

# `abs` here is the Spark column function brought in by the wildcard import of
# pyspark.sql.functions, not Python's builtin.
df_clean = df_clean.withColumn("location_score",
    abs(col("latitude")) + abs(col("longitude")))

# Rooms per square foot; guarded so a zero area yields 0.0 instead of a
# division by zero.
df_clean = df_clean.withColumn("room_density",
    when(col("square_feet") > 0, col("total_rooms") / col("square_feet")).otherwise(0.0))

print("New features created successfully")

In [None]:
print("\n=== DECISION TREE REGRESSOR TRAINING ===")

# String indexing for categorical variables.
# handleInvalid="keep" puts labels unseen at fit time into an extra bucket
# instead of failing when the test split is transformed.
categorical_cols = ['state', 'has_photo']
indexers = []
indexed_cols = []

for col_name in categorical_cols:
    if col_name in df_clean.columns:
        indexer = StringIndexer(
            inputCol=col_name,
            outputCol=f"{col_name}_idx",
            handleInvalid="keep"
        )
        indexers.append(indexer)
        indexed_cols.append(f"{col_name}_idx")

# Final feature set.
# price_per_sqft is deliberately excluded: it is price / square_feet, so
# together with square_feet it would let the tree reconstruct the label
# (target leakage) and make the evaluation below meaningless.
final_features = [
    'bedrooms', 'bathrooms', 'square_feet',
    'total_rooms', 'location_score', 'room_density'
] + indexed_cols

assembler = VectorAssembler(
    inputCols=final_features,
    outputCol="features"
)

# Decision Tree Regressor predicting the continuous `price` label.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="price", maxDepth=10)

# Pipeline: index categoricals -> assemble feature vector -> fit the tree.
pipeline = Pipeline(stages=indexers + [assembler, dt])

# Train/test split (80/20) with a fixed seed for repeatability.
train_data, test_data = df_clean.randomSplit([0.8, 0.2], seed=42)
print(f"Training data: {train_data.count()} rows")
print(f"Test data: {test_data.count()} rows")

# Train model
dt_model = pipeline.fit(train_data)

# Predictions on the held-out split
predictions = dt_model.transform(test_data)
predictions.select("features", "price", "prediction").show(5, truncate=False)

# Evaluate model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Decision Tree RMSE = {rmse:.2f}")

# Model summary: the fitted tree is the last stage of the fitted pipeline.
tree_model = dt_model.stages[-1]
print("Decision Tree Depth:", tree_model.depth)
print("Number of Nodes:", tree_model.numNodes)