In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

In [2]:
# Create a Spark session
spark = SparkSession.builder.appName('Seattle').getOrCreate()

In [3]:
# Load file from GitHub
from pyspark import SparkFiles
url = 'https://raw.githubusercontent.com/ZacksAmber/Kaggle-Seattle-Airbnb/main/data/201601/listings.csv'
spark.sparkContext.addFile(url)

# Load dataset as Spark dataframe
spark_df = spark.read.options(inferSchema=True, header=True, sep=',', quote="\"", escape="\"", multiLine=True).csv(SparkFiles.get('listings.csv'))

In [4]:
# Remove '$' in the following columns, then format the column from string to int
dollar_cols = ['price', 'weekly_price', 'monthly_price', 'extra_people', 'security_deposit', 'cleaning_fee']
for dollar_col in dollar_cols:
    spark_df = spark_df.withColumn(dollar_col, regexp_replace(dollar_col, '\$', ''))
    spark_df = spark_df.withColumn(dollar_col, col(dollar_col).cast("Integer"))

In [5]:
# Create a SQL view for Spark SQL query
spark_df.createOrReplaceTempView('spark_view')
# Export spark dataframe to pandas dataframe
pandas_df = spark_df.toPandas()

# Now we have 1 Spark dataframe, 1 Spark view, 1 Pandas dataframe
# We can query data from all of the three data source

In [6]:
# Query data from Spark view
sql = """
SELECT property_type, AVG(price) AS avg_price
FROM spark_view
GROUP BY property_type
"""

spark.sql(sql).show(5)

+---------------+------------------+
|  property_type|         avg_price|
+---------------+------------------+
|      Apartment|122.93266978922716|
|      Townhouse|129.45762711864407|
|Bed & Breakfast|112.05405405405405|
|           null|             120.0|
|      Camper/RV|120.46153846153847|
+---------------+------------------+
only showing top 5 rows



In [7]:
# Query data from Pandas dataframe
pandas_df.groupby('property_type')['price'].mean().head()

property_type
Apartment          122.932670
Bed & Breakfast    112.054054
Boat               282.375000
Bungalow           115.615385
Cabin              104.666667
Name: price, dtype: float64

In [8]:
# Query data from Spark dataframe
spark_df.groupby('property_type').mean('price').toPandas().head()

Unnamed: 0,property_type,avg(price)
0,Apartment,122.93267
1,Townhouse,129.457627
2,Bed & Breakfast,112.054054
3,,120.0
4,Camper/RV,120.461538


In [9]:
# Query data from Spark view
sql = """
SELECT 
    property_type, 
    SUM(price) AS sum_price,
    MIN(price) AS min_price,
    AVG(price) AS avg_price,
    MAX(price) AS max_price
FROM spark_view
GROUP BY property_type
ORDER BY 4 DESC
"""

spark.sql(sql).show(5)

+-------------+---------+---------+------------------+---------+
|property_type|sum_price|min_price|         avg_price|max_price|
+-------------+---------+---------+------------------+---------+
|         Boat|     2259|       75|           282.375|      775|
|  Condominium|    13751|       50| 151.1098901098901|      700|
|         Loft|     5428|       25|             135.7|      425|
|        House|   228365|       25|131.85046189376445|      975|
|    Townhouse|    15276|       28|129.45762711864407|      498|
+-------------+---------+---------+------------------+---------+
only showing top 5 rows



In [10]:
# Query data from Pandas dataframe
pandas_df.groupby('property_type')['price'].agg(['sum', 'min', 'mean', 'max']).sort_values(by='mean', ascending=False).head()

Unnamed: 0_level_0,sum,min,mean,max
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Boat,2259.0,75.0,282.375,775.0
Condominium,13751.0,50.0,151.10989,700.0
Loft,5428.0,25.0,135.7,425.0
House,228365.0,25.0,131.850462,975.0
Townhouse,15276.0,28.0,129.457627,498.0


In [11]:
# Query data from Spark dataframe
import pyspark.sql.functions as F

spark_df.groupby('property_type') \
    .agg(F.sum('price').alias('sum_price'), \
         F.min('price').alias('min_price'), \
         F.mean('price').alias('avg_price'), \
         F.max('price').alias('max_price') \
    ).sort('avg_price', ascending=False).show(5)

+-------------+---------+---------+------------------+---------+
|property_type|sum_price|min_price|         avg_price|max_price|
+-------------+---------+---------+------------------+---------+
|         Boat|     2259|       75|           282.375|      775|
|  Condominium|    13751|       50| 151.1098901098901|      700|
|         Loft|     5428|       25|             135.7|      425|
|        House|   228365|       25|131.85046189376445|      975|
|    Townhouse|    15276|       28|129.45762711864407|      498|
+-------------+---------+---------+------------------+---------+
only showing top 5 rows

