In [19]:
import os
# import sys
# os.environ['PYSPARK_PYTHON'] = sys.executable
# os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pandas as pd
# from sqlalchemy import create_engine
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

#### Setup

In [2]:
# os.environ['PYSPARK_PYTHON']

In [3]:
# os.environ['PYSPARK_DRIVER_PYTHON']

**Initiate and configure Spark Session and Context**

In [20]:
# Local Spark session: use every core on this machine and give the driver 12 GB.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "12g")
    .appName("neighborhoods_and_city")
    .getOrCreate()
)

# Handle on the underlying SparkContext (used below for the UI address).
sc = spark.sparkContext

print("Using Apache Spark Version", spark.version)
web_ui_url = sc.uiWebUrl
print(f"Spark UI is available at: {web_ui_url}")

Using Apache Spark Version 3.5.1
Spark UI is available at: http://10.206.45.59:4040


In [21]:
# Location of the raw listings CSV. Set the LISTINGS_CSV environment variable to
# run the notebook on another machine; the fallback is the path it was written with.
LISTINGS_CSV = os.environ.get(
    "LISTINGS_CSV",
    "/Users/isbdrr/Documents/Columbia_SchoolWork/Spring_2024/Managing_Data/final_project/DM_Temp/listings.csv",
)

# header      -> first line holds the column names
# inferSchema -> let Spark derive column types from the data
# multiLine   -> allow a quoted field to span several physical lines
# escape '"'  -> a double quote inside a quoted field is escaped by another double quote
listings = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv(LISTINGS_CSV)
)
listings.printSchema()

[Stage 20:>                                                         (0 + 1) / 1]

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: dou

                                                                                

Split the listings table into separate host and property tables

In [23]:
# --- Host projection ---------------------------------------------------------
# Host attributes plus last_scraped, which the host ETL query orders by.
host_columns = [
    'host_id',
    'host_name',
    'host_since',
    'host_location',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_neighbourhood',
    'host_listings_count',
    'host_total_listings_count',
    'calculated_host_listings_count',
    'last_scraped',
]

# Plain projection: nothing is de-duplicated here. Uniqueness of host_id is
# enforced later by the host ETL query.
host_df = listings.select(*host_columns)

# --- Property projection -----------------------------------------------------
property_columns = [
    'id',
    'host_id',
    'last_scraped',
    'name',
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'price',
    'minimum_nights',
    'maximum_nights',
    'has_availability',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    'city',
    'state',
]

# distinct() drops rows that are identical across all selected columns.
property_df = listings.select(*property_columns).distinct()


## host table ETL process

In [24]:
# Ensure host_id is unique in the host table.
# The host projection is registered as a temporary view so it can be queried
# with Spark SQL under the name host_to_be_processed.
host_df.createOrReplaceTempView('host_to_be_processed')

# row_number() numbers the rows of each host_id from the newest last_scraped to
# the oldest; filtering on rnk = 1 keeps exactly one row per host_id.
# The helper column rnk is part of the result (select *); a later cell drops it
# after converting to pandas.
# NOTE(review): rows of one host that share the same last_scraped have no further
# tie-breaker, so which of them gets rnk = 1 is not fixed by this query.
host_etl = '''
with table as(
    select
        *
        ,row_number() OVER (PARTITION BY host_id order by last_scraped desc) rnk
    from host_to_be_processed
)
select
    *
from table
where rnk=1
'''

host_result = spark.sql(host_etl)
# Preview: only ten rows are collected to the driver for display.
host_result.limit(10).toPandas()

                                                                                

Unnamed: 0,host_id,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,calculated_host_listings_count,last_scraped,rnk
0,796,Betty,2008-07-12,"Austin, TX",within an hour,100%,65%,t,Austins' Colony,1.0,1.0,1,2023-12-16,1
1,1618,Elaine,2008-08-08,"Portland, OR",within a few hours,100%,92%,f,Cole,6.0,9.0,4,2023-12-20,1
2,2682,Marcia,2008-09-02,"Denver, CO",within an hour,100%,100%,t,Cole,1.0,4.0,1,2023-12-29,1
3,2971,Donald,2008-09-15,"Hakalau, HI",within an hour,100%,100%,t,,1.0,5.0,1,2023-12-16,1
4,3008,Chas.,2008-09-16,"Los Angeles, CA",,,100%,t,Hollywood,2.0,3.0,2,2023-12-04,1
5,3264,Jen,2008-09-27,"San Diego, CA",,,89%,f,South Park,1.0,1.0,1,2023-12-04,1
6,4396,Casey,2008-11-19,"New York, NY",within a few hours,100%,91%,t,Alphabet City,3.0,4.0,2,2024-02-06,1
7,4957,A.J.,2008-12-10,"Washington, DC",,,100%,f,Northwest Washington,1.0,4.0,1,2023-12-18,1
8,5061,Sandra,2008-12-12,"Washington D.C., DC",within an hour,100%,100%,t,Capitol Hill,2.0,2.0,2,2023-12-18,1
9,5482,Gabriel,2008-12-23,"Topanga, CA",,,50%,f,Old Topanga,2.0,2.0,1,2023-12-04,1


## property table ETL Process
#### Fix Price

In [25]:
# Strip everything except digits and the decimal point, then cast the result to
# decimal(10,2) so the Spark column matches the DECIMAL(10, 2) price column of
# the properties table instead of remaining a string.
# With Spark's default (non-ANSI) cast behaviour, a value that is empty after
# cleaning becomes NULL.
property_df = property_df.withColumn(
    "price",
    F.regexp_replace(F.col("price"), r"[^0-9.]", "").cast("decimal(10,2)"),
)

In [26]:
# Spot-check the price column after the clean-up above.
property_df.select("price").show(5)

[Stage 24:>                                                         (0 + 1) / 1]

+------+
| price|
+------+
|153.00|
| 45.00|
|275.00|
|280.00|
|199.00|
+------+
only showing top 5 rows



                                                                                

In [27]:
# Disabled step: dropping the per-category review score columns. They are kept
# because the properties table DDL declares all of them.
# columns_to_remove = [
#     "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin",
#     "review_scores_communication", "review_scores_location", "review_scores_value"
# ]
# 
# property_df = property_df.drop(*columns_to_remove)

# Preview the first ten property rows.
property_df.show(10)

[Stage 27:>                                                         (0 + 1) / 1]

+-------------------+---------+------------+--------------------+----------------------+-----------------+------------------+--------------------+---------------+------------+---------+--------+----+------+--------------+--------------+----------------+---------------+---------------+---------------+----------------+-----------------+---------------------+----------------------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+-----------------+-------------+-----+
|                 id|  host_id|last_scraped|                name|neighbourhood_cleansed|         latitude|         longitude|       property_type|      room_type|accommodates|bathrooms|bedrooms|beds| price|minimum_nights|maximum_nights|has_availability|availability_30|availability_60|availability_90|availability_365|number_of_reviews|number_of_reviews_ltm|number_of_reviews_l30d|review_scores_rating|review_scores_a

                                                                                

In [28]:
# Preview of the raw host projection (host_df), not the de-duplicated host_result.
host_df.show(5)

+---------+---------+----------+-------------+------------------+------------------+--------------------+-----------------+------------------+-------------------+-------------------------+------------------------------+------------+
|  host_id|host_name|host_since|host_location|host_response_time|host_response_rate|host_acceptance_rate|host_is_superhost|host_neighbourhood|host_listings_count|host_total_listings_count|calculated_host_listings_count|last_scraped|
+---------+---------+----------+-------------+------------------+------------------+--------------------+-----------------+------------------+-------------------+-------------------------+------------------------------+------------+
|299638115| Dominick|2019-10-03|   Albany, NY|    within an hour|              100%|                100%|                t|              NULL|                3.0|                      3.0|                             3|  2024-01-06|
| 61700428|     Adam|2016-03-05|   Albany, NY|    within an hour|   

# Insert into Database
> property_df to PostgreSQL table "properties"
> host_result to PostgreSQL table "hosts"


Connect to the database

In [29]:
import pandas as pd
from sqlalchemy import create_engine

In [30]:
# Pass the connection string to a variable, conn_url
conn_url = 'postgresql://postgres:123@localhost:5432/airbnb'

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection
connection = engine.connect()

# Define JDBC properties
jdbc_properties = {
    "user": "postgres",
    "password": "123",  
    "driver": "org.postgresql.Driver"}


In [31]:
# Schema of the hosts table; host_id is the primary key, which is why the host
# ETL keeps a single row per host_id.
# IF NOT EXISTS makes the cell safe to re-run: a plain CREATE TABLE raises an
# error once the table is already there.
ddl_hosts = """
CREATE TABLE IF NOT EXISTS hosts (
    host_id BIGINT PRIMARY KEY,
    host_name VARCHAR(255),
    host_since DATE,
    host_location VARCHAR(255),
    host_response_time VARCHAR(50),
    host_response_rate VARCHAR(10),
    host_acceptance_rate VARCHAR(10),
    host_is_superhost VARCHAR(10),
    host_neighbourhood VARCHAR(255),
    host_listings_count DOUBLE PRECISION,
    host_total_listings_count DOUBLE PRECISION,
    calculated_host_listings_count INTEGER,
    last_scraped DATE
);
"""
connection.execute(ddl_hosts)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1427c7650>

In [32]:
# NOTE(review): host_pd is only assigned further down (host_pd = host_result.toPandas()),
# so on a fresh kernel this cell fails with NameError. The saved output also shows a
# host_about column that host_columns does not select — it appears to come from an
# earlier kernel state; confirm and move this cell below the host_pd assignment.
host_pd.head(5)

Unnamed: 0,host_id,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,calculated_host_listings_count,last_scraped
0,796,Betty,2008-07-12,"Austin, TX",Welcome to Austin and to my home. I’m a freque...,within an hour,100%,65%,t,Austins' Colony,1.0,1.0,1,2023-12-16
1,1618,Elaine,2008-08-08,"Portland, OR",A little about me:\r\nI a young professional f...,within a few hours,100%,92%,f,Cole,6.0,9.0,4,2023-12-20
2,2682,Marcia,2008-09-02,"Denver, CO",I live in Denver and am retired but worked as ...,within an hour,100%,100%,t,Cole,1.0,4.0,1,2023-12-29
3,2971,Donald,2008-09-15,"Hakalau, HI",,within an hour,100%,100%,t,,1.0,5.0,1,2023-12-16
4,3008,Chas.,2008-09-16,"Los Angeles, CA",Professional and technical writer.\nLiterary C...,,,100%,t,Hollywood,2.0,3.0,2,2023-12-04


In [33]:
# Inspect the pandas dtypes of the host frame before loading it into the hosts table.
# NOTE(review): like the cell above, this depends on host_pd, which is assigned in a later cell.
host_pd.dtypes

host_id                             int32
host_name                          object
host_since                         object
host_location                      object
host_about                         object
host_response_time                 object
host_response_rate                 object
host_acceptance_rate               object
host_is_superhost                  object
host_neighbourhood                 object
host_listings_count               float64
host_total_listings_count         float64
calculated_host_listings_count      int32
last_scraped                       object
dtype: object

In [49]:
# Schema of the properties table.
# bathrooms, bedrooms and beds are DOUBLE PRECISION because the dataset contains
# fractional values (e.g. 1.5).
# The foreign keys reference the neighbourhood and hosts tables, so both must
# exist before this statement runs.
# IF NOT EXISTS makes the cell safe to re-run: a plain CREATE TABLE raises an
# error once the table is already there.
ddl_properties = """
CREATE TABLE IF NOT EXISTS properties (
    id BIGINT,
    host_id BIGINT,
    last_scraped DATE,
    name VARCHAR(255),
    neighbourhood_cleansed VARCHAR(255),
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    property_type VARCHAR(255),
    room_type VARCHAR(255),
    accommodates INTEGER,
    bathrooms DOUBLE PRECISION,
    bedrooms DOUBLE PRECISION,
    beds DOUBLE PRECISION,
    price DECIMAL(10, 2),
    minimum_nights INTEGER,
    maximum_nights INTEGER,
    has_availability VARCHAR(255),
    availability_30 INTEGER,
    availability_60 INTEGER,
    availability_90 INTEGER,
    availability_365 INTEGER,
    number_of_reviews INTEGER,
    number_of_reviews_ltm INTEGER,
    number_of_reviews_l30d INTEGER,
    review_scores_rating DOUBLE PRECISION,
    review_scores_accuracy DOUBLE PRECISION,
    review_scores_cleanliness DOUBLE PRECISION,
    review_scores_checkin DOUBLE PRECISION,
    review_scores_communication DOUBLE PRECISION,
    review_scores_location DOUBLE PRECISION,
    review_scores_value DOUBLE PRECISION,
    reviews_per_month DOUBLE PRECISION,
    city VARCHAR(255),
    state VARCHAR(10),
    PRIMARY KEY (id),
    FOREIGN KEY (city, state,neighbourhood_cleansed) REFERENCES neighbourhood(city, state,neighbourhood_cleansed),
    FOREIGN KEY (host_id) REFERENCES hosts(host_id)
);
"""
connection.execute(ddl_properties)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x39fa66010>

In [22]:
#host_df.write.jdbc(
#    url=conn_url,
#    table="host_table",
#    mode="append",
#    properties=jdbc_properties
#)

In [38]:
# Collect the de-duplicated hosts to pandas and discard the helper ranking
# column, which has no counterpart in the hosts table.
host_pd = host_result.toPandas().drop(columns='rnk')

                                                                                

In [44]:
# Append the host rows to the existing hosts table; the pandas index is not written.
host_pd.to_sql(
    name='hosts',
    con=engine,
    if_exists='append',
    index=False,
)

660

In [45]:
# Collect the whole property DataFrame from Spark into a pandas DataFrame on the driver.
property_pd = property_df.toPandas()

                                                                                

In [46]:
# Check that the pandas column names line up with the columns of the properties table.
property_pd.columns

Index(['id', 'host_id', 'last_scraped', 'name', 'neighbourhood_cleansed',
       'latitude', 'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights',
       'maximum_nights', 'has_availability', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'reviews_per_month', 'city', 'state'],
      dtype='object')

In [47]:
# id is the primary key of the properties table, so only one row per id may be
# loaded. Sort newest scrape first so that keep='first' retains the most recent
# row for each id — the same "latest last_scraped wins" rule the host ETL uses —
# rather than whichever row Spark happened to return first.
property_pd = (
    property_pd
    .sort_values('last_scraped', ascending=False, kind='stable')
    .drop_duplicates(subset='id', keep='first')
)

In [50]:
# Append the property rows to the existing properties table; the pandas index is not written.
property_pd.to_sql(
    name='properties',
    con=engine,
    if_exists='append',
    index=False,
)

185