In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=2ddf95e1dea3f6fb202f6b8323088c0c0b5c482a65843208c8531199300e53aa
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
import pyspark

In [4]:
import os
import sys
# Point both the PySpark worker and driver interpreter settings at the
# interpreter that is running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [5]:
# Display the value just assigned to PYSPARK_PYTHON (should be this kernel's interpreter path).
os.environ['PYSPARK_PYTHON']

'/usr/bin/python3'

In [6]:
# Display the value just assigned to PYSPARK_DRIVER_PYTHON (should match PYSPARK_PYTHON).
os.environ['PYSPARK_DRIVER_PYTHON']

'/usr/bin/python3'

**Initialise and configure the Spark session and context**

In [7]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession used by the rest of the notebook.
# NOTE(review): no master is set, so this presumably runs in local mode on the
# notebook host - TODO confirm. If so, spark.cores.max, spark.executor.memory
# and spark.driver.cores are likely inert, and the Kryo buffer limit only
# matters when the Kryo serializer is enabled (it is not enabled here).
spark = (
    SparkSession.builder
    .appName("Intro to Apache Spark")
    .config("spark.cores.max", "4")
    .config("spark.executor.memory", "8G")
    .config("spark.driver.maxResultSize", "8g")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.driver.cores", "4")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

print("Using Apache Spark Version", spark.version)

Using Apache Spark Version 3.5.1


**Read the CSV files into Spark DataFrames**

In [11]:
# Load the listings CSV with a header row and inferred column types.
# multiLine and escape='"' are set so quoted fields may span lines and use
# doubled quotes (presumably needed for free-text columns such as
# `description` - verify against the raw file).
listings = (
    spark.read
    .options(
        header="true",
        delimiter=",",
        inferSchema="true",
        multiLine="true",
        escape="\"",
    )
    .csv("/content/drive/MyDrive/listings.csv")
)
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: dou

In [9]:
# Load the calendar CSV with the same reader options as the listings file,
# then discard the two night-limit columns.
calendar = (
    spark.read
    .options(
        header="true",
        delimiter=",",
        inferSchema="true",
        multiLine="true",
        escape="\"",
    )
    .csv("/content/drive/MyDrive/calendar.csv")
    .drop("minimum_nights", "maximum_nights")
)
calendar.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- available: string (nullable = true)
 |-- price: string (nullable = true)
 |-- adjusted_price: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- data_date: date (nullable = true)



**City: average price of available dates and listing count per city**

In [14]:
# Expose both DataFrames to Spark SQL. The listings frame is bound to
# `listings`; the previous `listing` name is not defined anywhere in the
# notebook and raised NameError on a fresh kernel.
listings.createOrReplaceTempView('listings')
calendar.createOrReplaceTempView('calendar')

# Per-city summary over available calendar rows only:
#   * avg_price      - mean of the numeric part of the calendar price; because
#                      it is taken over joined calendar rows, each listing is
#                      weighted by its number of available dates.
#   * total_listings - distinct listings with at least one available date
#                      (inner join against the filtered calendar).
# `price` is a string column, so non-numeric characters are stripped before the
# cast; `adjusted_price` is used when `price` is NULL.
city_query = '''
WITH CleanedCalendar AS (
    SELECT
        listing_id,
        COALESCE(price, adjusted_price) AS price
    FROM calendar
    WHERE available = 't'
)

SELECT
    l.city,
    AVG(CAST(regexp_replace(c.price, '[^0-9.]', '') AS DOUBLE)) AS avg_price,
    COUNT(DISTINCT l.id) AS total_listings
FROM listings l
JOIN CleanedCalendar c ON l.id = c.listing_id
GROUP BY l.city
ORDER BY l.city
'''

result_city = spark.sql(city_query)

In [15]:
import pandas
# Preview only: limit to five rows in Spark before converting, so just those
# rows are collected into the pandas DataFrame that gets displayed.
result_city.limit(5).toPandas()

Unnamed: 0,city,avg_price,total_listings
0,albany,136.455949,399
1,asheville,239.451794,3036
2,austin,495.117807,11561
3,boston,254.077065,3706
4,bozeman,120.0,579


**Neighbourhood: average price of available dates per neighbourhood**

In [36]:
# (Re-)register the views so this cell can be run on its own.
listings.createOrReplaceTempView('listings')
calendar.createOrReplaceTempView('calendar')

# Average price of available calendar dates per (neighbourhood, city, state).
#   * CleanedCalendar keeps only what the final SELECT consumes: the join key
#     and the numeric price. The former `is_available` flag was always 1 under
#     the WHERE filter and was never read, so it has been dropped along with
#     the other unused columns; the result is unchanged.
#   * MAX(neighborhood_overview) picks one overview text per group (the
#     greatest string in sort order), i.e. an arbitrary representative.
# NOTE(review): unlike the city query, a NULL `price` is not replaced by
# `adjusted_price` here, so such rows are skipped by AVG - confirm intended.
neighbourhood_query = '''
WITH CleanedCalendar AS (
    SELECT
        listing_id,
        CAST(regexp_replace(price, '[^0-9.]', '') AS DOUBLE) AS price
    FROM calendar
    WHERE available = 't'  -- focusing only on available dates
),
neighbourhood AS (
  SELECT
      id,
      neighbourhood_cleansed,
      neighborhood_overview,
      city,
      state
  FROM listings
  WHERE neighbourhood_cleansed IS NOT NULL AND id IS NOT NULL
)

SELECT
    n.neighbourhood_cleansed AS neighbourhood_name,
    n.city AS city_name,
    n.state,
    AVG(c.price) AS avg_price,
    MAX(n.neighborhood_overview) AS neighborhood_overview
FROM neighbourhood n
JOIN CleanedCalendar c ON n.id = c.listing_id
GROUP BY
    n.neighbourhood_cleansed,
    n.city,
    n.state
ORDER BY
    n.neighbourhood_cleansed,
    n.city
'''

result_neighbourhood = spark.sql(neighbourhood_query)

In [37]:
# Preview only: limit to five rows in Spark before converting, so just those
# rows are collected into the pandas DataFrame that gets displayed.
result_neighbourhood.limit(5).toPandas()

Unnamed: 0,neighbourhood_name,city_name,state,avg_price,neighborhood_overview
0,28704,asheville,nc,277.077657,"a nice and quiet neighborhood, 10min from air..."
1,28715,asheville,nc,207.354062,You may hear the distant rooster crowing in th...
2,28732,asheville,nc,249.246435,private drive and green space the first in a ...
3,28801,asheville,nc,236.49478,▶What's nearby <br />• Vance Monument - 1 min ...
4,28803,asheville,nc,214.2598,not in a neighborhood


**Reviews: load the reviews CSV**

In [40]:
# Attach MongoDB input/output URIs to the session.
# NOTE: on a top-to-bottom run a session already exists from the setup cell, so
# getOrCreate() returns that session rather than building a new one and the
# appName below does not take effect. This cell also does NOT load the MongoDB
# Spark connector package (no spark.jars.packages is set); that must be
# configured when the session is first created before any MongoDB read/write.
spark = (
    SparkSession.builder
    .appName("ReviewDataExtraction")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.myCollection")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.myCollection")
    .getOrCreate()
)

# Parse the reviews CSV with the same multi-line / quote-escape settings used
# for the listings and calendar files: the free-text `comments` column may
# contain embedded newlines and doubled quotes, which the default CSV settings
# would split into broken rows.
reviews_df = spark.read.csv(
    "/content/drive/MyDrive/reviews.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)
reviews_df.show(5)

+----------+-------+----------+-----------+-------------+--------------------+-----+------+----------+
|listing_id|     id|      date|reviewer_id|reviewer_name|            comments|state|  city| data_date|
+----------+-------+----------+-----------+-------------+--------------------+-----+------+----------+
|   1489424|7208791|2013-09-10|    5817914|       Hilary|Efrat and Dan wer...|   ny|albany|2024-01-06|
|   1489424|8001939|2013-10-12|    4786919|       Sharon|As advertised, a ...|   ny|albany|2024-01-06|
|   1489424|8123022|2013-10-16|    4786919|       Sharon|Glad to be back f...|   ny|albany|2024-01-06|
|   1489424|8279957|2013-10-23|    8362214|       Andrej|We stayed only fo...|   ny|albany|2024-01-06|
|   1489424|8303182|2013-10-24|    9458270|      Andreia|I had a pleasant ...|   ny|albany|2024-01-06|
+----------+-------+----------+-----------+-------------+--------------------+-----+------+----------+
only showing top 5 rows

