# Initiate and configure Spark Session and Context

In [28]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .config("spark.driver.memory", "12g")\
    .appName("review_city_neighbourhood") \
    .getOrCreate()

sc = spark.sparkContext

print("Using Apache Spark Version", spark.version)
web_ui_url = sc.uiWebUrl
print(f"Spark UI is available at: {web_ui_url}")

Using Apache Spark Version 3.5.1
Spark UI is available at: http://10.206.45.59:4041


# Read CSV into Spark Dataframe

In [29]:
listings = spark.read.option("header", "true") \
                   .option("delimiter", ",") \
                   .option("inferSchema", "true") \
                   .option("multiLine", "true")\
                   .option("escape", "\"")\
                   .csv("/Users/isbdrr/Documents/Columbia_SchoolWork/Spring_2024/Managing_Data/final_project/DM_Temp/listings.csv")
listings.printSchema()

[Stage 53:>                                                         (0 + 1) / 1]

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: dou

                                                                                

In [3]:
# calendar = spark.read.option("header", "true") \
#                    .option("delimiter", ",") \
#                    .option("inferSchema", "true") \
#                    .option("multiLine", "true")\
#                    .option("escape", "\"")\
#                    .csv("../Data/calendar.csv")
# calendar = calendar.drop(*['minimum_nights','maximum_nights'])
# #calendar.show()
# calendar.printSchema()

**city**

In [30]:
listings.createOrReplaceTempView('listings')
# calendar.createOrReplaceTempView('calendar')
city_query = '''
SELECT
    l.city,
    l.state,
    COUNT(DISTINCT l.id) AS total_listings
FROM listings l
GROUP BY 
    l.city
    ,l.state
-- ORDER BY l.city
'''

result_city = spark.sql(city_query)

In [31]:
import pandas
result_city.limit(5).toPandas()

                                                                                

Unnamed: 0,city,state,total_listings
0,nashville,tn,8850
1,broward-county,fl,18230
2,new-york-city,ny,39202
3,boston,ma,4204
4,austin,tx,15419


**neighborhood**

In [32]:
# listings.createOrReplaceTempView('listings')
# calendar.createOrReplaceTempView('calendar')

neighbourhood_query = '''
SELECT
    l.neighbourhood_cleansed,
    l.city,
    l.state,
    COUNT(DISTINCT l.id) AS total_listings
FROM listings l
where l.neighbourhood_cleansed is not NULL
GROUP BY
    l.neighbourhood_cleansed,
    l.city,
    l.state
'''

result_neighbourhood = spark.sql(neighbourhood_query)

In [33]:
result_neighbourhood.limit(5).toPandas()

                                                                                

Unnamed: 0,neighbourhood_cleansed,city,state,total_listings
0,Woodland Hills,los-angeles,ca,525
1,Kips Bay,new-york-city,ny,417
2,Upper Laurel,oakland,ca,28
3,Overlook,portland,or,122
4,Lake Los Angeles,los-angeles,ca,10


# Insert into Database
- neighbourhood to PostgreSQL table "neighbourhood"
- city to Postgresql table "city"

In [34]:
import pandas as pd
from sqlalchemy import create_engine

# Pass the connection string to a variable, conn_url
conn_url = 'postgresql://postgres:123@localhost:5432/airbnb'

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection
connection = engine.connect()

# Define JDBC properties
jdbc_properties = {
    "user": "postgres",
    "password": "123",  
    "driver": "org.postgresql.Driver"}

In [35]:
ddl_city = """
CREATE TABLE city (
    city VARCHAR(255),
    state VARCHAR(10), 
    total_listings INT,
    PRIMARY KEY (city, state) 
);
"""
connection.execute(ddl_city)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x309c4d610>

In [36]:
city_pd = result_city.toPandas()

                                                                                

In [37]:
city_pd.to_sql(name='city', con=engine, if_exists='append', index=False)

34

In [38]:
ddl_neighbourhood = """
CREATE TABLE neighbourhood (
    neighbourhood_cleansed VARCHAR(255),
    city VARCHAR(255), 
    state VARCHAR(10),
    total_listings INT, 
    PRIMARY KEY (state,city, neighbourhood_cleansed), 
    FOREIGN KEY (city, state) REFERENCES city(city, state)
);
"""
connection.execute(ddl_neighbourhood)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x12ccfa0d0>

In [39]:
neighbourhood_pd = result_neighbourhood.toPandas()

                                                                                

In [40]:
neighbourhood_pd.to_sql(name='neighbourhood', con=engine, if_exists='append', index=False)

573