In [0]:
# Load Spark and access Databricks dataset
from pyspark.sql import SparkSession

# Initialize SparkSession (usually auto-initialized in Databricks)
spark = SparkSession.builder.getOrCreate()

# Load sample data
filepath = "/FileStore/tables/mudah_apartment_kl_selangor__1_-1.csv"
df = spark.read.csv(filepath, header=True, inferSchema=True)

# Show the first few rows
df.show(5)


+---------+--------------------+---------------+------------------+--------------------+-----------------+-----+-------+--------+-----------+-------------------+--------------------+---------------------+------------+
|   ads_id|           prop_name|completion_year|      monthly_rent|            location|    property_type|rooms|parking|bathroom|       size|          furnished|          facilities|additional_facilities|      region|
+---------+--------------------+---------------+------------------+--------------------+-----------------+-----+-------+--------+-----------+-------------------+--------------------+---------------------+------------+
|100323185|The Hipster @ Tam...|         2022.0|RM 4 200 per month|Kuala Lumpur - Ta...|      Condominium|    5|    2.0|     6.0|1842 sq.ft.|    Fully Furnished|Minimart, Gymnasi...| Air-Cond, Cooking...|Kuala Lumpur|
|100203973|        Segar Courts|           NULL|RM 2 300 per month|Kuala Lumpur - Ch...|      Condominium|    3|    1.0|     2.0

In [0]:
# Check the structure and data types of the DataFrame
df.printSchema()


root
 |-- ads_id: integer (nullable = true)
 |-- prop_name: string (nullable = true)
 |-- completion_year: double (nullable = true)
 |-- monthly_rent: string (nullable = true)
 |-- location: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- rooms: string (nullable = true)
 |-- parking: double (nullable = true)
 |-- bathroom: double (nullable = true)
 |-- size: string (nullable = true)
 |-- furnished: string (nullable = true)
 |-- facilities: string (nullable = true)
 |-- additional_facilities: string (nullable = true)
 |-- region: string (nullable = true)



In [0]:
# Show summary statistics for numerical columns
df.describe().show()


+-------+------------------+--------------------+-----------------+------------------+--------------------+---------------+------------------+------------------+------------------+---------------+-------------------+--------------------+---------------------+------------+
|summary|            ads_id|           prop_name|  completion_year|      monthly_rent|            location|  property_type|             rooms|           parking|          bathroom|           size|          furnished|          facilities|additional_facilities|      region|
+-------+------------------+--------------------+-----------------+------------------+--------------------+---------------+------------------+------------------+------------------+---------------+-------------------+--------------------+---------------------+------------+
|  count|             19991|               19043|            10806|             19989|               19991|          19991|             19985|             14289|             19985| 

In [0]:
# Count the number of rows in the dataset
row_count = df.count()
print(f"Total rows in the dataset: {row_count}")


Total rows in the dataset: 19991


In [0]:
# Group data by 'location' and count the number of listings
location_count_df = df.groupBy("location").count()

# Display the result for visualization
location_count_df.show(10)


+--------------------+-----+
|            location|count|
+--------------------+-----+
|  Kuala Lumpur - OUG|   63|
|Kuala Lumpur - Ji...|    9|
|Selangor - Kota D...|  172|
|Selangor - Puncak...|   63|
|Selangor - Bandar...|   32|
|Selangor - Glenmarie|   19|
|Selangor - Pulau ...|    1|
|Kuala Lumpur - So...|   60|
|Kuala Lumpur - Ch...|    1|
|Selangor - Petali...|  612|
+--------------------+-----+
only showing top 10 rows



In [0]:
# Sort by count in descending order
sorted_df = location_count_df.orderBy("count", ascending=False)
display(sorted_df.limit(5))

location,count
Kuala Lumpur - Cheras,1623
Selangor - Kajang,1022
Kuala Lumpur - Setapak,973
Selangor - Shah Alam,971
Selangor - Cyberjaya,879


In [0]:
# Group data by 'location' and count the number of listings
location_count_df = df.groupBy("location").count()

# Display the result for visualization
display(location_count_df.limit(10))


location,count
Kuala Lumpur - OUG,63
Kuala Lumpur - Jinjang,9
Selangor - Kota Damansara,172
Selangor - Puncak Alam,63
Selangor - Bandar Kinrara,32
Selangor - Glenmarie,19
Selangor - Pulau Indah (Pulau Lumut),1
Kuala Lumpur - Solaris Dutamas,60
Kuala Lumpur - Chan Sow Lin,1
Selangor - Petaling Jaya,612


Databricks visualization. Run in Databricks to view.

In [0]:
# Group by 'monthly_rent' and count occurrences, then sort by count in descending order
top_rent_values = df.groupBy("monthly_rent").count().orderBy("count", ascending=False)

# Show the top 10 most frequent rent values
top_rent_values.show(10)

# Visualize the data
display(top_rent_values)


+------------------+-----+
|      monthly_rent|count|
+------------------+-----+
|RM 1 500 per month| 1401|
|RM 1 200 per month| 1373|
|RM 1 300 per month| 1209|
|RM 1 600 per month| 1067|
|RM 1 400 per month| 1024|
|RM 1 000 per month|  986|
|RM 1 800 per month|  899|
|RM 1 100 per month|  828|
|RM 1 700 per month|  755|
|RM 2 000 per month|  728|
+------------------+-----+
only showing top 10 rows



monthly_rent,count
RM 1 500 per month,1401
RM 1 200 per month,1373
RM 1 300 per month,1209
RM 1 600 per month,1067
RM 1 400 per month,1024
RM 1 000 per month,986
RM 1 800 per month,899
RM 1 100 per month,828
RM 1 700 per month,755
RM 2 000 per month,728


In [0]:
# Group by 'monthly_rent' and count occurrences
rent_distribution = df.groupBy("monthly_rent").count().orderBy("monthly_rent", ascending=True)

# Display the result for visualization
display(rent_distribution.limit(10))


monthly_rent,count
,2
RM 1 000 per month,986
RM 1 030 per month,2
RM 1 045 per month,1
RM 1 048 per month,1
RM 1 049 per month,2
RM 1 050 per month,106
RM 1 070 per month,1
RM 1 080 per month,4
RM 1 090 per month,5


Databricks visualization. Run in Databricks to view.