In [1]:
import requests
import time
from pyspark.sql import Row, SparkSession
import boto3
import os
import sys
from datetime import datetime

In [2]:
# Extraction date (local time, YYYY-MM-DD); used further down as the S3 prefix.
date_of_extraction = f'{datetime.today():%Y-%m-%d}'

# Setting the environment variables:
# make PySpark (workers and driver) use the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Data Extraction

In [3]:
def fetch_data(current_page_number: int, timeout: float = 30.0) -> list:
    """Download brewery records from the Open Brewery DB API, page by page.

    Starting at ``current_page_number``, pages are requested one after another
    (asking for 200 records per page) until the API answers with an empty JSON
    array.

    Args:
        current_page_number: First page to request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A flat list holding the records of every non-empty page, in page
        order. Empty when the first requested page has no data.

    Raises:
        requests.exceptions.HTTPError: A page answered with a status other
            than 200.
        ValueError: A response body is not valid JSON, or is not a JSON array.
        requests.exceptions.RequestException: Connection failure or timeout.

    Errors are raised instead of printed: a swallowed failure would make this
    function return ``None`` and discard every page fetched so far.
    """
    records = []

    # One session for the whole crawl: the connection is reused between pages
    # and released when the block exits, even on error.
    with requests.Session() as session:
        # Iterative pagination keeps stack depth constant regardless of how
        # many pages the API exposes.
        while True:
            start_time = time.time()

            url = f'https://api.openbrewerydb.org/v1/breweries?page={current_page_number}&per_page=200'
            response = session.get(url, timeout=timeout)

            if response.status_code != 200:
                raise requests.exceptions.HTTPError(
                    f'Page {current_page_number} returned HTTP {response.status_code}',
                    response=response,
                )

            raw = response.json()
            if not isinstance(raw, list):
                # Guard against an error object being merged into the records.
                raise ValueError(
                    f'Page {current_page_number}: expected a JSON array, got {type(raw).__name__}'
                )

            end_time = time.time() - start_time
            print(f'Fetched data for page {current_page_number}. \nRunning time: {end_time}s.')

            # An empty page marks the end of the dataset.
            if len(raw) == 0:
                break

            records.extend(raw)
            current_page_number += 1

    print(f'There is no data for page {current_page_number}. Extracting process finalized.')
    print(f'Max number of pages retrieved: {current_page_number -1}')
    return records

In [4]:
# Crawl the API from its first page; raw_data feeds the DataFrame built below.
raw_data = fetch_data(current_page_number=1)

Fetched data for page 1. 
Running time: 1.1576762199401855s.
Fetched data for page 2. 
Running time: 0.9486334323883057s.
Fetched data for page 3. 
Running time: 0.8632240295410156s.
Fetched data for page 4. 
Running time: 0.9171068668365479s.
Fetched data for page 5. 
Running time: 0.8481121063232422s.
Fetched data for page 6. 
Running time: 0.8677489757537842s.
Fetched data for page 7. 
Running time: 0.8558189868927002s.
Fetched data for page 8. 
Running time: 0.8929159641265869s.
Fetched data for page 9. 
Running time: 0.9343464374542236s.
Fetched data for page 10. 
Running time: 0.7315497398376465s.
Fetched data for page 11. 
Running time: 0.9041745662689209s.
Fetched data for page 12. 
Running time: 0.8837871551513672s.
Fetched data for page 13. 
Running time: 0.8749206066131592s.
Fetched data for page 14. 
Running time: 0.8571064472198486s.
Fetched data for page 15. 
Running time: 0.9262397289276123s.
Fetched data for page 16. 
Running time: 0.8730511665344238s.
Fetched data for 

In [5]:
# Build (or reuse) the Spark session. The hadoop-aws and AWS SDK bundle
# packages are requested for the s3a:// write further down; credentials are
# resolved through the default AWS provider chain instead of being set here.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.520")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .getOrCreate()
)

# One Row per API record; no explicit schema is passed, so Spark infers it.
df = spark.createDataFrame([Row(**record) for record in raw_data])

# Inserting into S3 Bucket

In [6]:
# NOTE(review): `client` is not used by any visible cell; kept in case later
# cells rely on it.
client = boto3.client('s3')

# Write the extraction as a single Parquet part under a date-named prefix.
# "overwrite" replaces whatever already exists at that prefix.
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .parquet(f"s3a://openbrewerydb-bronze-layer/{date_of_extraction}")
)