### Property Level

In [92]:
import pyspark
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
import pandas as pd
from sqlalchemy import create_engine
from pyspark.sql import SparkSession

from pyspark.sql import functions as F

#### Setup

In [93]:
os.environ['PYSPARK_PYTHON']

'/Users/isbdrr/anaconda3/bin/python'

In [94]:
os.environ['PYSPARK_DRIVER_PYTHON']

'/Users/isbdrr/anaconda3/bin/python'

**Initiate and configure Spark Session and Context**

spark = SparkSession \
    .builder \
    .appName("Intro to Apache Spark") \
    .config("spark.cores.max", "4") \
    .config('spark.executor.memory', '8G') \
    .config('spark.driver.maxResultSize', '8g') \
    .config('spark.kryoserializer.buffer.max', '512m') \
    .config("spark.driver.cores", "4") \
    .getOrCreate()

sc = spark.sparkContext

print("Using Apache Spark Version", spark.version)

In [96]:
# Specify the path to the JDBC driver JAR
jdbc_jar_path = "postgresql://postgres:123@localhost:5432/airbnb"  

spark = SparkSession \
    .builder \
    .appName("Intro to Apache Spark") \
    .config("spark.cores.max", "4") \
    .config('spark.executor.memory', '8G') \
    .config('spark.driver.maxResultSize', '8g') \
    .config('spark.kryoserializer.buffer.max', '512m') \
    .config("spark.jars", jdbc_jar_path) \
    .config("spark.driver.cores", "4") \
    .getOrCreate()

sc = spark.sparkContext

print("Using Apache Spark Version", spark.version)

Using Apache Spark Version 3.5.1


24/04/26 14:00:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [97]:
listings = spark.read.option("header", "true") \
                   .option("delimiter", ",") \
                   .option("inferSchema", "true") \
                   .option("multiLine", "true")\
                   .option("escape", "\"")\
                   .csv("/Users/isbdrr/Documents/Columbia_SchoolWork/Spring_2024/Managing_Data/final_project/DM_Temp/listings.csv")
#listing.show()
listings.printSchema()

[Stage 47:>                                                         (0 + 1) / 1]

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: dou

                                                                                

Connect to the database

In [98]:
# Pass the connection string to a variable, conn_url
conn_url = 'postgresql://postgres:123@localhost:5432/airbnb'

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection
connection = engine.connect()

# Define JDBC properties
jdbc_properties = {
    "user": "postgres",
    "password": "123",  
    "driver": "org.postgresql.Driver"}


Split the tables into host and property tables

In [99]:
# Host DataFrame
host_columns = [
    'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time',
    'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
    'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified','calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
]

host_df = listings.select(host_columns).distinct()  # Ensure unique hosts

# Property DataFrame
property_columns = [
    'id','host_id','listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url',
    'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 
    'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 
    'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60',
    'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'number_of_reviews_ltm', 
    'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'license', 'instant_bookable',
    'reviews_per_month'
]

property_df = listings.select(property_columns).distinct()  


#### Fix Price

In [100]:
property_df = property_df.withColumn(
    "price",
    F.regexp_replace(property_df["price"], r"[^0-9.]", "")
)

In [101]:
property_df.select("price").show(10)

24/04/26 14:01:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


+------+
| price|
+------+
| 83.00|
|169.00|
| 52.00|
|100.00|
|172.00|
|190.00|
|111.00|
|197.00|
|115.00|
|258.00|
+------+
only showing top 10 rows



                                                                                

#### Fix rating

In [102]:
property_df.select("review_scores_rating","review_scores_accuracy","review_scores_cleanliness","review_scores_checkin","review_scores_communication","review_scores_location","review_scores_value").show(10)

24/04/26 14:01:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+
|review_scores_rating|review_scores_accuracy|review_scores_cleanliness|review_scores_checkin|review_scores_communication|review_scores_location|review_scores_value|
+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+
|                4.89|                  4.93|                     4.87|                  4.9|                       4.95|                  4.82|               4.88|
|                4.97|                  4.97|                     4.99|                 4.97|                       4.98|                   5.0|               4.96|
|                4.53|                  4.77|                      4.5|                  4.7|                        4.7|                  4.47|               4.67|
|         

                                                                                

In [103]:
columns_to_remove = [
    "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin",
    "review_scores_communication", "review_scores_location", "review_scores_value"
]

property_df = property_df.drop(*columns_to_remove)

property_df.show(2)

24/04/26 14:01:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


+--------+---------+--------------------+--------------+------------+-----------+--------------------+-----------+---------------------+--------------------+--------------------+----------------------+----------------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+---------+------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------+----------------+-----------------+
|      id|  host_id|         listing_url|     scrape_id|last_scraped|     source|                name|description|neighborhood_overview|         picture_url|       neighbourhood|neighbourhood_cleansed

                                                                                

In [104]:
property_df.select("property_type").show(10)

24/04/26 14:01:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


+--------------------+
|       property_type|
+--------------------+
|  Room in aparthotel|
|Private room in b...|
|Private room in b...|
|Private room in b...|
|     Entire bungalow|
|           Camper/RV|
|        Entire condo|
|        Entire condo|
|        Entire condo|
|        Entire condo|
+--------------------+
only showing top 10 rows



                                                                                

In [106]:
# # Insert the host DataFrame into PostgreSQL
# host_df.write.jdbc(
#     url=conn_url,
#     table="host_table",  # Name of the PostgreSQL table for hosts
#     mode="overwrite",  # Use "append" to add to the table, "overwrite" to replace existing data
#     properties=jdbc_properties
# )

# # Insert the property DataFrame into PostgreSQL
# property_df.write.jdbc(
#     url=conn_url,
#     table="property_table",  # Name of the PostgreSQL table for properties
#     mode="overwrite",
#     properties=jdbc_properties
# )


In [107]:
# Get the average price for the property
average_price_per_day = property_df.groupBy("last_scraped", "id").agg(
    F.avg("price").alias("average_price")
)

In [109]:
average_price_per_day.show(10)

24/04/26 14:14:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/26 14:14:32 WARN RowBasedKeyValueBatch: Calling spill() on

+------------+-------------------+-------------+
|last_scraped|                 id|average_price|
+------------+-------------------+-------------+
|  2023-12-19|           50913533|        118.0|
|  2023-12-29|           15831072|        112.0|
|  2023-12-04|           49775935|         70.0|
|  2023-12-15|            7202876|        123.0|
|  2023-12-23| 748607186427320159|         44.0|
|  2023-12-20| 849215001003567365|        297.0|
|  2023-12-20|           50856529|        105.0|
|  2023-12-04|1003417703744134625|         90.0|
|  2023-12-26| 745927340879585415|         73.0|
|  2023-12-25|           31900971|        104.0|
+------------+-------------------+-------------+
only showing top 10 rows



                                                                                