# ETL Process for city and neighbourhood

## Extracting and Transforming using PySpark

### Start Spark Session

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that uses every available core and
# gives the driver 12 GB of memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "12g")
    .appName("review_city_neighbourhood")
    .getOrCreate()
)

# Handle to the underlying SparkContext, used below to look up the web UI URL.
sc = spark.sparkContext

print("Using Apache Spark Version", spark.version)
web_ui_url = sc.uiWebUrl
print(f"Spark UI is available at: {web_ui_url}")

24/04/28 02:46:56 WARN Utils: Your hostname, Kun-Mac.local resolves to a loopback address: 127.0.0.1; using 172.20.23.178 instead (on interface en0)
24/04/28 02:46:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/28 02:46:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Using Apache Spark Version 3.5.0
Spark UI is available at: http://172.20.23.178:4040


### Extract Data

In [2]:
# Configure the CSV reader: the first row is the header, column types are
# inferred, quoted fields may span several lines, and a double quote inside a
# quoted field is escaped with another double quote.
listings_reader = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
)

listings = listings_reader.csv("../Data/listings.csv")
listings.printSchema()

                                                                                

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: dou

### Transforming city

In [3]:
# Register the listings DataFrame as a temp view so Spark SQL can query it by name.
listings.createOrReplaceTempView("listings")

# Number of distinct listing ids for every (city, state) pair.
city_query = """
SELECT
    l.city,
    l.state,
    COUNT(DISTINCT l.id) AS total_listings
FROM listings l
GROUP BY 
    l.city
    ,l.state
-- ORDER BY l.city
"""

# spark.sql only builds the query plan; nothing executes until an action runs.
result_city = spark.sql(city_query)

24/04/28 02:47:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [4]:
import pandas
# Preview: pull only the first five aggregated city rows into pandas for display.
city_preview = result_city.limit(5)
city_preview.toPandas()

                                                                                

Unnamed: 0,city,state,total_listings
0,nashville,tn,8850
1,broward-county,fl,18230
2,new-york-city,ny,39202
3,boston,ma,4204
4,austin,tx,15419


### Transforming neighbourhood

In [5]:
# Aggregate the `listings` temp view: number of distinct listing ids for every
# (neighbourhood_cleansed, city, state) combination. Listings whose
# neighbourhood_cleansed is NULL are left out.
neighbourhood_query = """
SELECT
    l.neighbourhood_cleansed,
    l.city,
    l.state,
    COUNT(DISTINCT l.id) AS total_listings
FROM listings l
where l.neighbourhood_cleansed is not NULL
GROUP BY
    l.neighbourhood_cleansed,
    l.city,
    l.state
"""

result_neighbourhood = spark.sql(neighbourhood_query)

In [6]:
# Preview: pull only the first five aggregated neighbourhood rows into pandas.
neighbourhood_preview = result_neighbourhood.limit(5)
neighbourhood_preview.toPandas()

                                                                                

Unnamed: 0,neighbourhood_cleansed,city,state,total_listings
0,Woodland Hills,los-angeles,ca,525
1,Kips Bay,new-york-city,ny,417
2,Upper Laurel,oakland,ca,28
3,Overlook,portland,or,122
4,Lake Los Angeles,los-angeles,ca,10


## Insert into Database
- neighbourhood to PostgreSQL table "neighbourhood"
- city to PostgreSQL table "city"

### Connecting PostgreSQL

In [7]:
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql import text

# Connection string for the target PostgreSQL database.
# It is read from the AIRBNB_DB_URL environment variable so that real
# credentials do not have to be written into (or committed with) the notebook.
# The fallback is the original local-development URL, kept only so existing
# setups keep working unchanged.
# NOTE(review): remove the fallback once every environment sets AIRBNB_DB_URL.
conn_url = os.environ.get(
    "AIRBNB_DB_URL",
    "postgresql://postgres:123@localhost:5432/airbnb",
)

# Create an engine that connects to the PostgreSQL server
engine = create_engine(conn_url)

# Open a single connection; the DDL statements further down are executed and
# committed through it.
connection = engine.connect()

### Loading City

In [8]:
# DDL for the "city" table: one row per (city, state) pair, which together
# form the primary key. IF NOT EXISTS makes the statement safe to run again
# when the table is already there.
ddl_city = '''
CREATE TABLE IF NOT EXISTS city (
    city VARCHAR(255),
    state VARCHAR(10), 
    total_listings INT,
    PRIMARY KEY (city, state) 
);
'''

# Run the DDL on the open connection and commit it.
create_city_table = text(ddl_city)
connection.execute(create_city_table)
connection.commit()

In [9]:
# Collect the per-city aggregate on the driver as a pandas DataFrame.
city_pd = result_city.toPandas()

# Append the rows to the "city" table; the pandas index is not written.
# NOTE(review): with if_exists="append", running this again on an already
# loaded table will presumably violate the (city, state) primary key.
city_pd.to_sql(
    name="city",
    con=engine,
    index=False,
    if_exists="append",
)

                                                                                

34

### Loading Neighbourhood

In [10]:
# DDL for the "neighbourhood" table.
# - Rows are keyed by (state, city, neighbourhood_cleansed); total_listings is
#   the only non-key column and may be NULL.
# - Every (city, state) pair must already exist in the "city" table.
# IF NOT EXISTS mirrors the city DDL, so re-running the notebook against a
# database that already has the table no longer fails here.
ddl_neighbourhood = """
CREATE TABLE IF NOT EXISTS neighbourhood (
    neighbourhood_cleansed VARCHAR(255),
    city VARCHAR(255), 
    state VARCHAR(10),
    total_listings INT, 
    PRIMARY KEY (state,city, neighbourhood_cleansed), 
    FOREIGN KEY (city, state) REFERENCES city(city, state)
);
"""

# Run the DDL on the open connection and commit it.
connection.execute(text(ddl_neighbourhood))
connection.commit()

In [11]:
# Collect the per-neighbourhood aggregate on the driver as a pandas DataFrame.
neighbourhood_pd = result_neighbourhood.toPandas()

# Append the rows to the "neighbourhood" table; the pandas index is not written.
# NOTE(review): with if_exists="append", running this again on an already
# loaded table will presumably violate the table's primary key.
neighbourhood_pd.to_sql(
    name="neighbourhood",
    con=engine,
    index=False,
    if_exists="append",
)

24/04/28 02:47:12 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

573