## Importing Libraries

In [2]:
import os
import requests
import re
import time
import logging
import urllib3

from dotenv import load_dotenv

from bs4 import BeautifulSoup

from pyspark.sql import SparkSession

## Loading Environment Variables

In [3]:
# Load variables from a .env file (when one is found) into the process environment.
load_dotenv()

# API key sent as the "apikey" query parameter by the requests made below.
api_key = os.getenv("API_KEY")

# Fail fast: with a missing key `requests` would silently drop the parameter
# and every API call would fail (and be retried) for a non-obvious reason.
if not api_key:
    raise RuntimeError("API_KEY is not set; define it in the environment or in a .env file.")

## Create Spark Session

In [7]:
# Build (or reuse) the Spark session shared by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("esport_data_bronze")
    .config("spark.executor.memory", "64g")
    .getOrCreate()
)

24/01/06 17:31:31 WARN Utils: Your hostname, pitta resolves to a loopback address: 127.0.1.1; using 192.168.100.7 instead (on interface enp6s0)
24/01/06 17:31:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/06 17:31:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Get Recent Tournaments Data

In [None]:
# Silence urllib3 warnings (the request below is issued with verify=False).
urllib3.disable_warnings()

# API endpoint URL; the key itself comes from `api_key` loaded earlier.
api_endpoint = "http://api.esportsearnings.com/v0/LookupRecentTournaments"

# File used to persist the paging offset between runs.
offset_path = "offset.txt"

# Load the offset from a file, or start from 0 if it is missing or unreadable.
try:
    with open(offset_path, "r") as offset_file:
        offset = int(offset_file.read())
except FileNotFoundError:
    offset = 0
except ValueError:
    logging.warning("Could not parse %s; starting from offset 0", offset_path)
    offset = 0

# Remember where this run started so we can tell a fresh run from a resume.
start_offset = offset

# Records fetched during this run (a list of JSON objects).
records = []

# Initialize parameters
batch_size = 100  # nominal page size; kept for reference, the offset advances by the real page length
max_retries = 5  # maximum number of *consecutive* failures before giving up
retry_delay_seconds = 5
request_timeout_seconds = 30  # without a timeout a stalled connection blocks the cell forever
retries = 0  # consecutive-failure counter

while retries < max_retries:
    # Set up the request parameters
    params = {
        "apikey": api_key,
        "offset": offset,
    }

    failed = False
    try:
        # Make the API request
        response = requests.get(
            api_endpoint,
            params=params,
            verify=False,
            timeout=request_timeout_seconds,
        )

        if response.status_code == 200:
            # An empty body marks the end of the data set.
            if response.content == b'':
                print("No more data to retrieve")
                break
            data = response.json()
            # An empty JSON payload also means there is nothing left.
            if not data:
                print("No more data to retrieve")
                break
            if isinstance(data, list):
                records.extend(data)
                # Advance by what was actually received so a short page
                # cannot cause records to be skipped on the next request.
                offset += len(data)
                retries = 0  # a success ends the current failure streak
                print(f"Processed {offset} records")
            else:
                # Extending the list with a dict would silently add its keys.
                logging.error("Unexpected response payload type: %s", type(data).__name__)
                failed = True
        else:
            logging.error("API request failed with status code: %s", response.status_code)
            failed = True
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP errors and JSON decoding errors are retried.
        logging.error("An error occurred: %s", e)
        failed = True

    if failed:
        retries += 1
        if retries < max_retries:
            print(f"Retrying in {retry_delay_seconds} seconds...")
            time.sleep(retry_delay_seconds)

if retries >= max_retries:
    logging.error("Max retries reached. Exiting.")

if records:
    # Create a DataFrame from the retrieved data
    all_data = spark.createDataFrame(records)

    # A resumed run only holds the new pages, so it must append to the
    # existing output; a run that started from 0 replaces it.
    write_mode = "append" if start_offset > 0 else "overwrite"

    # Save the DataFrame to CSV
    all_data.coalesce(1).write.format("csv").option("header", "true").mode(write_mode).save('esports_tournaments.csv')

    # Save the DataFrame to Parquet (no "header" option: it has no meaning for Parquet)
    all_data.coalesce(1).write.format("parquet").mode(write_mode).save('esports_tournaments.parquet')
else:
    # createDataFrame cannot infer a schema from an empty list, so skip the write.
    logging.warning("No new records retrieved; nothing to write.")

# Persist the offset only after the data has been written, so a failed write
# does not cause the next run to skip the records fetched here.
with open(offset_path, "w") as offset_file:
    offset_file.write(str(offset))