# ETL Process for order_info_by_neighbourhood

## Extract and Transforming using PySpark

### Start Spark Session

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import FloatType

In [2]:
# running local spark
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .config("spark.driver.memory", "12g")\
    .appName("neighborhoods_and_city") \
    .getOrCreate()
sc = spark.sparkContext

print("Using Apache Spark Version", spark.version)
web_ui_url = sc.uiWebUrl
print(f"Spark UI is available at: {web_ui_url}")

24/04/28 03:42:35 WARN Utils: Your hostname, Kun-Mac.local resolves to a loopback address: 127.0.0.1; using 172.20.23.178 instead (on interface en0)
24/04/28 03:42:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/28 03:42:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Using Apache Spark Version 3.5.0
Spark UI is available at: http://172.20.23.178:4040


### Extract Data

In [3]:
listings = spark.read.option("header", "true") \
                    .option("delimiter", ",") \
                    .option("inferSchema", "true") \
                    .option("multiLine", "true")\
                    .option("escape", "\"")\
                    .csv("../Data/listings.csv")

                                                                                

In [4]:
calendar = spark.read.option("header", "true") \
                    .option("delimiter", ",") \
                    .option("inferSchema", "true") \
                    .option("multiLine", "true")\
                    .option("escape", "\"")\
                    .csv("../Data/calendar.csv")
calendar = calendar.drop(*['minimum_nights','maximum_nights'])

24/04/28 03:42:46 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

## Transforming process

In [5]:
def money_to_float(money_str):
    if money_str is None:
        return None
    else:
        cleaned_str = money_str[1:].replace(',', '')
        return float(cleaned_str)
spark.udf.register("money_to_float_udf", money_to_float, FloatType())

<function __main__.money_to_float(money_str)>

In [6]:
listings.createOrReplaceTempView('listings')
calendar.createOrReplaceTempView('calendar')
sql = '''
with neighbourhood as ( 
select
    id
    ,city
    ,state
    ,neighbourhood_cleansed
from listings l
),
calendar_cleaned as ( 
select
    listing_id
    ,date
    ,if(available='t',1,0) as is_available
    ,coalesce(money_to_float_udf(adjusted_price),money_to_float_udf(price)) as price
from calendar
)

select
    t2.state
    ,t2.city
    ,t2.neighbourhood_cleansed
    ,date
    ,sum(is_available)/count(is_available) as occupancy_rate
    ,avg(price) as avg_price
from calendar_cleaned t1
left join neighbourhood t2
    on t1.listing_id = t2.id
group by 
    t2.state
    ,t2.city
    ,t2.neighbourhood_cleansed
    ,date
'''
result_neighbourhood = spark.sql(sql)

24/04/28 03:44:00 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [7]:
print(result_neighbourhood.count())
result_neighbourhood.limit(5).toPandas()

24/04/28 03:45:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:45:03 WARN RowBasedKeyValueBatch: Calling spill() on

576716


24/04/28 03:48:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:48:06 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,state,city,neighbourhood_cleansed,date,occupancy_rate,avg_price
0,ny,new-york-city,Flatbush,2024-03-16,0.442913,125.580709
1,ny,new-york-city,Flatbush,2024-09-15,0.427165,125.580709
2,ny,new-york-city,Flatbush,2024-11-23,0.336614,125.580709
3,ny,new-york-city,East Village,2024-02-17,0.234043,241.274746
4,ny,new-york-city,East Village,2024-04-23,0.307123,241.274746


# Insert into Database
- result_neighbourhood to PostgreSQL table "order_info_by_neighbourhood"

In [8]:
from sqlalchemy import create_engine
from sqlalchemy.sql import text

# Pass the connection string to a variable, conn_url
conn_url = 'postgresql://postgres:123@localhost:5432/airbnb'

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection
connection = engine.connect()

### Loading Data

In [9]:
# creating table
# all the column except primary key can take null value
sql = """
CREATE TABLE IF NOT EXISTS order_info_by_neighbourhood (
    state VARCHAR(10), 
    city VARCHAR(255),
    neighbourhood_cleansed VARCHAR(255),
    date DATE,
    occupancy_rate DOUBLE PRECISION,
    avg_price DOUBLE PRECISION,
    PRIMARY KEY (state, city, neighbourhood_cleansed, date),
    FOREIGN KEY (state, city, neighbourhood_cleansed) REFERENCES neighbourhood(state,city, neighbourhood_cleansed)
);
"""
connection.execute(text(sql))
connection.commit()

In [10]:
pd_df = result_neighbourhood.toPandas()

24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/28 03:50:51 WARN RowBasedKeyValueBatch: Calling spill() on

In [11]:
pd_df.dropna(subset=['neighbourhood_cleansed'],inplace=True)
pd_df.to_sql(name='order_info_by_neighbourhood', con=engine, if_exists='append', index=False)

975