In [None]:
import pyspark
import delta

import pandas as pd

from pyspark.conf import SparkConf

import requests

from bs4 import BeautifulSoup

import json

In [None]:
# Settings for the notebook's Spark application "DeltaApp": driver and executor
# memory, the Delta SQL extension and catalog classes, the driver result-size
# setting, and the debug to-string field limit.
conf = SparkConf().setAppName("DeltaApp").setAll(
    [
        ("spark.driver.memory", "30g"),
        ("spark.executor.memory", "30g"),
        ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
        ("spark.driver.maxResultSize", "0"),
        (
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        ),
        ("spark.sql.debug.maxToStringFields", "1000"),
    ]
)

In [None]:
# Display the settings currently held by `conf` as the cell output.
conf.getAll()

In [None]:
# Start a SparkSession builder that carries the settings in `conf`.
builder = pyspark.sql.SparkSession.builder.config(conf=conf)

In [None]:
# Pass the builder through delta's pip helper, then create (or reuse) the session.
# NOTE(review): presumably the helper adds the Delta Lake package to the
# session - confirm against the delta-spark documentation.
spark = delta.configure_spark_with_delta_pip(builder).getOrCreate()

# Create table

In [None]:
# Read the countries CSV straight from GitHub into a pandas DataFrame.
countries_url = "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
countries = pd.read_csv(countries_url)

In [None]:
# Convert the pandas `countries` frame into a Spark DataFrame.
countries_df = spark.createDataFrame(countries)

In [None]:
# Save the frame in Delta format under data/delta/countries; "overwrite" mode
# replaces whatever table is already stored at that path.
countries_df.write.format("delta").mode("overwrite").save("data/delta/countries")

In [None]:
# List the files in the countries table's _delta_log directory.
! ls -l ./data/delta/countries/_delta_log/

# Read data

In [None]:
# Load the countries Delta table from disk into Spark and display it with show().
delta_reader = spark.read.format("delta")
df = delta_reader.load("data/delta/countries")
df.show()

In [None]:
# Count the rows in each Region and print the result.
df.groupby("Region").count().show()

# Inside Airbnb

In [None]:
# Scrape the Inside Airbnb "get the data" page and collect its download links.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error
# page and silently ending up with an empty link list.
resp = requests.get("https://insideairbnb.com/get-the-data/", timeout=60)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# One entry per <a> tag; anchors without an href contribute an empty string.
hrefs = [anchor.get("href", "") for anchor in soup.find_all("a")]
extensions = ["gz", "csv", "geojson"]

# Keep links whose final dot-separated suffix is a known data extension and
# whose URL contains a "/data/" path segment.
data_links = [
    href for href in hrefs if href.split(".")[-1] in extensions and "/data/" in href
]

In [None]:
# Bookkeeping for the download loop: `cannot_process` collects links pandas
# could not read, `retry` collects links that failed for any other reason.
cannot_process, retry = [], []

In [None]:
# Number of links currently recorded in `cannot_process`.
len(cannot_process)

In [None]:
# Number of links currently recorded in `retry`.
len(retry)

In [None]:
# Persist the current contents of `cannot_process` to cannot_process.json.
# The file is rewritten with whatever the list holds when this cell runs.
with open("cannot_process.json", "w") as outfile:
    json.dump(cannot_process, outfile)

In [None]:
# Restore `cannot_process` from the list stored in cannot_process.json.
with open("cannot_process.json") as saved:
    cannot_process = json.loads(saved.read())

In [None]:
# Persist the current contents of `retry` to retry.json.
# The file is rewritten with whatever the list holds when this cell runs.
with open("retry.json", "w") as outfile:
    json.dump(retry, outfile)

In [None]:
# Retry pass: reload the links saved in retry.json so the processing loop can
# be run over them again.
with open("retry.json", "r") as openfile:
    saved_retry_links = json.load(openfile)

# Only replace the work queue when there is something to retry, or when no
# queue exists yet. On a top-to-bottom run retry.json has just been written
# from an empty `retry` list; assigning it unconditionally would discard the
# freshly scraped `data_links` and leave the loop with nothing to process.
if saved_retry_links or "data_links" not in globals():
    data_links = saved_retry_links

In [None]:
# Download every remaining link and store it as a Delta table under
# data/delta/airbnb/<country>/<state>/<city>/<data_category>/<scrape_date>.
#
# `data_links` doubles as the work queue. A link is removed only after it has
# been handled, so interrupting the cell leaves the in-flight link in the list
# and re-running the cell resumes with it.
# Failures are collected instead of raised:
#   * `cannot_process` - reading the link raised UnicodeEncodeError
#   * `retry`          - any other error while parsing, downloading or writing
while len(data_links) > 0:
    link = data_links[-1]
    splits = link.split("/")

    try:
        file_name = splits[-1]
        # Positions 3-6 of the "/"-split URL are read as country, state, city
        # and scrape date. Assumes a <scheme>//<host>/<country>/<state>/<city>/<date>/...
        # layout - TODO confirm against the current site. A shorter URL fails
        # the unpacking and lands in `retry`.
        country, state, city, scrape_date = splits[3:7]
        data_category = file_name.split(".")[0]

        target = (
            f"data/delta/airbnb/{country}/{state}/{city}/{data_category}/{scrape_date}"
        )
        print(link)
        print(target)

        try:
            # Named pandas_df rather than df so the Spark DataFrame `df` loaded
            # from the countries table earlier in the notebook is not clobbered.
            pandas_df = pd.read_csv(link, low_memory=False)
            # Files with no rows are skipped; nothing is written for them.
            if len(pandas_df) > 0:
                spark_df = spark.createDataFrame(pandas_df)
                spark_df.write.format("delta").mode("overwrite").save(target)
        except UnicodeEncodeError:
            # NOTE(review): presumably raised for URLs containing non-ASCII
            # characters, which a plain retry would not fix - confirm.
            cannot_process.append(link)
            print(f"xxx cannot read link: {link}")
    except Exception as exc:
        retry.append(link)
        print(f"xxx cannot process link: {link}")
        # Surface the cause; without it every failure looks the same in the log.
        print(f"    reason: {exc!r}")

    # Handled (successfully or not): drop the link from the queue.
    data_links.pop()