In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, round

This cell imports the `SparkSession` class, which creates the Spark environment for data processing, and the functions `col`, `avg`, and `round`, which are used to reference DataFrame columns, calculate averages, and round numerical values.

In [2]:
# Get (or create) the Spark session used by every cell below, registered
# under the application name "HousingDataFrameExample".
spark = (
    SparkSession.builder
    .appName("HousingDataFrameExample")
    .getOrCreate()
)

This statement creates a Spark session named "HousingDataFrameExample", establishing the Spark environment needed to read, process, and analyze data using DataFrames.

In [3]:
# Load housing.csv into a DataFrame: header=True takes the column names from
# the first row, and inferSchema=True lets Spark infer each column's type.
df = spark.read.csv(
    "housing.csv",
    header=True,
    inferSchema=True,
)

This statement reads the file "housing.csv" into a Spark DataFrame, using the first row as headers and automatically inferring the data types of each column.

In [4]:
# Preview the first 10 rows of the loaded data.
df.show(n=10)

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
|12215000| 7500|       4|        2|      2|     yes|       no|     yes|             no|            yes|      3|     yes|       furnished|
|11410000| 7420|       4|        1

This statement displays the first 10 rows of the DataFrame, allowing a quick view of the dataset’s contents.

In [5]:
# Print the DataFrame's schema: each column's name, data type, and nullability.
df.printSchema()

root
 |-- price: integer (nullable = true)
 |-- area: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- stories: integer (nullable = true)
 |-- mainroad: string (nullable = true)
 |-- guestroom: string (nullable = true)
 |-- basement: string (nullable = true)
 |-- hotwaterheating: string (nullable = true)
 |-- airconditioning: string (nullable = true)
 |-- parking: integer (nullable = true)
 |-- prefarea: string (nullable = true)
 |-- furnishingstatus: string (nullable = true)



This statement displays the schema of the DataFrame, showing each column’s name, data type, and whether it allows null values.

In [6]:
# Build the per-column summary table (count, mean, etc.) and display it.
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+------------------+------------------+--------+---------+--------+---------------+---------------+------------------+--------+----------------+
|summary|             price|              area|          bedrooms|         bathrooms|           stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|           parking|prefarea|furnishingstatus|
+-------+------------------+------------------+------------------+------------------+------------------+--------+---------+--------+---------------+---------------+------------------+--------+----------------+
|  count|               545|               545|               545|               545|               545|     545|      545|     545|            545|            545|               545|     545|             545|
|   mean| 4766729.247706422|  5150.54128440367|2.9651376146788992|1.2862385321100918|1.8055045871559634|    NULL|     NULL|    NULL|           NULL|           N

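This statement computes and displays summary statistics for every column of the DataFrame: count, mean, standard deviation, minimum, and maximum. For string columns such as `mainroad`, numeric-only statistics such as the mean appear as NULL, as the output above shows.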

In [7]:
# Count the records in the dataset, then report the total.
row_count = df.count()
print("Total rows:", row_count)

Total rows: 545


This statement prints the total number of rows in the DataFrame, representing the number of records in the dataset.

In [8]:
# List the names of all columns available in the DataFrame.
column_names = df.columns
print("Columns:", column_names)

Columns: ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']


This statement prints the list of column names in the DataFrame, showing all available fields in the dataset.

In [9]:
# Project only the three columns of interest and preview the first 10 rows.
(
    df
    .select("price", "area", "bedrooms")
    .show(n=10)
)

+--------+-----+--------+
|   price| area|bedrooms|
+--------+-----+--------+
|13300000| 7420|       4|
|12250000| 8960|       4|
|12250000| 9960|       3|
|12215000| 7500|       4|
|11410000| 7420|       4|
|10850000| 7500|       3|
|10150000| 8580|       4|
|10150000|16200|       5|
| 9870000| 8100|       4|
| 9800000| 5750|       3|
+--------+-----+--------+
only showing top 10 rows


This statement selects the "price", "area", and "bedrooms" columns from the DataFrame and displays the first 10 rows of these selected columns.

In [10]:
# Keep only rows where area >= 2000 AND bedrooms >= 3, then preview 10 of them.
# Each comparison is parenthesized because `&` binds tighter than `>=` in Python.
df.filter(
    (col("area") >= 2000)
    & (col("bedrooms") >= 3)
).show(n=10)

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
|12215000| 7500|       4|        2|      2|     yes|       no|     yes|             no|            yes|      3|     yes|       furnished|
|11410000| 7420|       4|        1

This statement filters the DataFrame to include only houses with an area of at least 2000 and at least 3 bedrooms, and then displays the first 10 matching rows.

In [11]:
# Derive price / area, rounded to 2 decimal places, as a column expression.
# NOTE: `round` here is pyspark.sql.functions.round (imported in the first cell),
# not Python's built-in round; it operates on Column expressions.
price_per_sqft = round(col("price") / col("area"), 2)

# Attach the derived column to a new DataFrame (df itself is left unchanged)
# and preview the first 10 rows.
df_with_pps = df.withColumn("price_per_sqft", price_per_sqft)
df_with_pps.show(n=10)

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+--------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|price_per_sqft|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+--------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|       1792.45|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|       1367.19|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|       1229.92|
|12215000| 7500|       4|        2|      2|     yes|       no|     yes|           

These statements create a new DataFrame, `df_with_pps`, that adds a "price_per_sqft" column computed by dividing "price" by "area" and rounding the result to two decimal places, and then display the first 10 rows of the updated DataFrame.

In [12]:
# Keep houses priced at 1,000,000 or more, sort from most to least expensive,
# and show the 10 highest-priced rows.
(
    df_with_pps
    .filter(col("price") >= 1_000_000)
    .orderBy(col("price").desc())
    .show(n=10)
)

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+--------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|price_per_sqft|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+--------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|       1792.45|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|       1367.19|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|       1229.92|
|12215000| 7500|       4|        2|      2|     yes|       no|     yes|           

This statement filters the DataFrame to include only houses with a price of at least 1,000,000, orders them in descending order by price, and displays the top 10 most expensive houses.

In [13]:
# Per-group aggregates, each rounded to 2 decimal places.
# `round` is pyspark.sql.functions.round (imported in the first cell).
avg_price = round(avg("price"), 2).alias("avg_price")
avg_pps = round(avg("price_per_sqft"), 2).alias("avg_pps")

# One output row per distinct bedroom count; no ordering is applied to the result.
df_with_pps.groupBy("bedrooms").agg(avg_price, avg_pps).show()

+--------+----------+-------+
|bedrooms| avg_price|avg_pps|
+--------+----------+-------+
|       1| 2712500.0| 743.05|
|       6| 4791500.0|1193.44|
|       3|4954598.13|1029.64|
|       5| 5819800.0| 1089.3|
|       4|5729757.89|1077.88|
|       2|3632022.06| 847.85|
+--------+----------+-------+



This statement groups the DataFrame by the number of bedrooms and calculates two metrics for each group: the average price and the average price per square foot, both rounded to two decimal places, and then displays the results. Because no ordering is applied, the groups appear in no particular order in the output.