## Importing Libraries

In [2]:
import os
import requests
import re
import time
import logging
import urllib3

from dotenv import load_dotenv

from bs4 import BeautifulSoup

from pyspark.sql import SparkSession

## Loading Environment Variables

In [3]:
# Load variables from a local .env file (if any) into the process
# environment before reading the API key from it.
load_dotenv()

# Resolves to None when API_KEY is not set in the environment.
api_key = os.environ.get("API_KEY")

## Create Spark Session

In [8]:
# Build (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("esport_data_bronze")
    .config("spark.executor.memory", "64g")
    .getOrCreate()
)

## Get Recent Tournaments Data

In [24]:
# Download recent tournaments page by page, persist the resume offset, and
# write the collected records to the bronze layer as CSV and Parquet.

# Silence urllib3 warnings (the requests below are made with verify=False)
urllib3.disable_warnings()

# API endpoint URL for the paged tournament lookup
api_endpoint = "http://api.esportsearnings.com/v0/LookupRecentTournaments"

# Load the offset from a file, or start from 0 if it doesn't exist
try:
    with open("../../data/bronze/offset/offset.txt", "r") as offset_file:
        offset = int(offset_file.read())
except FileNotFoundError:
    offset = 0

all_data = []

# Initialize parameters
batch_size = 100      # the offset advances by this much after every successful page
max_retries = 5       # consecutive failures tolerated for a single page
retries = 0           # failures seen for the page currently being requested
request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

while retries < max_retries:
    # Set up the request parameters
    params = {
        "apikey": api_key,
        "offset": offset,
    }

    try:
        # Make the API request
        response = requests.get(
            api_endpoint,
            params=params,
            verify=False,
            timeout=request_timeout,
        )

        if response.status_code == 200:
            # An empty body is how the API signals that there are no more pages
            if response.content == b'':
                print("No more data to retrieve")
                break
            data = response.json()
            # An empty JSON payload also means there is nothing left
            if not data:
                print("No more data to retrieve")
                break
            all_data.extend(data)  # Append the batch to the list
            offset += batch_size  # Increment the offset for the next batch
            print(f"Processed {offset} records")
            # The retry budget applies per page (as in the per-game lookup
            # further down), so a success clears earlier transient failures.
            retries = 0
            continue

        logging.error(f"API request failed with status code: {response.status_code}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")

    # Reached only when the request failed (bad status code or exception)
    retries += 1
    if retries < max_retries:
        print("Retrying in 5 seconds...")
        time.sleep(5)

if retries >= max_retries:
    logging.error("Max retries reached. Exiting.")

# Save the offset to a file for resuming later
with open("../../data/bronze/offset/offset.txt", "w") as offset_file:
    offset_file.write(str(offset))

# NOTE(review): the writes below use mode("overwrite"), so a run resumed from
# a non-zero offset replaces the earlier output with only the newly fetched
# pages - confirm that this is intended.
if all_data:
    # Create a DataFrame from the retrieved data
    all_data = spark.createDataFrame(all_data)

    # Save the DataFrame to CSV
    all_data.coalesce(1).write.format("csv").option("header", "true").mode("overwrite").save('../../data/bronze/esports_tournaments/esports_tournaments_csv.csv')

    # Save the DataFrame to Parquet ("header" is a CSV-only option, so it is not passed here)
    all_data.coalesce(1).write.format("parquet").mode("overwrite").save('../../data/bronze/esports_tournaments/esports_tournaments_parquet.parquet')
else:
    # createDataFrame cannot infer a schema from an empty list; skipping also
    # keeps any previously written output intact.
    logging.warning("No tournament data retrieved; skipping DataFrame creation and file output.")

Processed 100 records
Processed 200 records
Processed 300 records
Processed 400 records
Processed 500 records
Processed 600 records
Processed 700 records
Processed 800 records
Processed 900 records
Processed 1000 records
Processed 1100 records
Processed 1200 records
Processed 1300 records
Processed 1400 records
Processed 1500 records
Processed 1600 records
Processed 1700 records
Processed 1800 records
Processed 1900 records
Processed 2000 records
Processed 2100 records
Processed 2200 records
Processed 2300 records
Processed 2400 records
Processed 2500 records
Processed 2600 records
Processed 2700 records
Processed 2800 records
Processed 2900 records
Processed 3000 records
Processed 3100 records
Processed 3200 records
Processed 3300 records
Processed 3400 records
Processed 3500 records
Processed 3600 records
Processed 3700 records
Processed 3800 records
Processed 3900 records
Processed 4000 records
Processed 4100 records
Processed 4200 records
Processed 4300 records
Processed 4400 recor

ERROR:root:API request failed with status code: 502


Retrying in 5 seconds...
Processed 6900 records
Processed 7000 records
Processed 7100 records
Processed 7200 records
Processed 7300 records
Processed 7400 records
Processed 7500 records
Processed 7600 records
Processed 7700 records
Processed 7800 records
Processed 7900 records
Processed 8000 records
Processed 8100 records
Processed 8200 records
Processed 8300 records
Processed 8400 records
Processed 8500 records
Processed 8600 records
Processed 8700 records
Processed 8800 records
Processed 8900 records
Processed 9000 records
Processed 9100 records
Processed 9200 records
Processed 9300 records
Processed 9400 records
Processed 9500 records
Processed 9600 records
Processed 9700 records
Processed 9800 records
Processed 9900 records
Processed 10000 records
Processed 10100 records
Processed 10200 records
Processed 10300 records
Processed 10400 records
Processed 10500 records
Processed 10600 records
Processed 10700 records
Processed 10800 records
Processed 10900 records
Processed 11000 record

ERROR:root:API request failed with status code: 502


Retrying in 5 seconds...
Processed 29600 records
Processed 29700 records
Processed 29800 records
Processed 29900 records
Processed 30000 records
Processed 30100 records
Processed 30200 records
Processed 30300 records
Processed 30400 records
Processed 30500 records
Processed 30600 records
Processed 30700 records
Processed 30800 records
Processed 30900 records
Processed 31000 records
Processed 31100 records
Processed 31200 records
Processed 31300 records
Processed 31400 records
Processed 31500 records
Processed 31600 records
Processed 31700 records
Processed 31800 records
Processed 31900 records
Processed 32000 records
Processed 32100 records
Processed 32200 records
Processed 32300 records
Processed 32400 records
Processed 32500 records
Processed 32600 records
Processed 32700 records
Processed 32800 records
Processed 32900 records
Processed 33000 records
Processed 33100 records
Processed 33200 records
Processed 33300 records
Processed 33400 records
Processed 33500 records
Processed 33600

24/02/05 20:51:32 WARN TaskSetManager: Stage 0 contains a task of very large size (5703 KiB). The maximum recommended task size is 1000 KiB.
24/02/05 20:51:41 WARN TaskSetManager: Stage 1 contains a task of very large size (5703 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [23]:
# Manually reset the persisted resume offset used by the tournaments download.
# NOTE(review): the download cell falls back to 0 when the file is missing;
# confirm that restarting from 1 (rather than 0) is intended.
offset = 1
with open("../../data/bronze/offset/offset.txt", mode="w") as offset_file:
    offset_file.write(f"{offset}")

## Get Games Awarding Prize Money

In [None]:
# Look up every distinct GameId found in the tournaments data through the
# LookupGameById endpoint and store the results as CSV and Parquet.

# Silence urllib3 warnings (the requests below are made with verify=False)
urllib3.disable_warnings()

# Read the parquet file to obtain the game_id values
# NOTE(review): the tournaments cell writes its Parquet output to
# '../../data/bronze/esports_tournaments/esports_tournaments_parquet.parquet';
# confirm that the relative path below points at the intended data.
parquet_data = spark.read.parquet('esports_tournaments.parquet')

# Extract the distinct GameId column values into a plain Python list
game_ids = parquet_data.select('GameId').distinct().rdd.flatMap(lambda x: x).collect()

# API endpoint URL for the per-game lookup
api_endpoint = "http://api.esportsearnings.com/v0/LookupGameById"

# Initialize the list to store game data
game_data = []

# Initialize parameters
max_retries = 5       # attempts allowed per game ID
request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

for game_id in game_ids:

    # Set up the request parameters
    params = {
        "apikey": api_key,
        "gameid": game_id,
    }

    retries = 0

    while retries < max_retries:
        try:
            # Send a GET request to the API
            response = requests.get(
                api_endpoint,
                params=params,
                verify=False,
                timeout=request_timeout,
            )

            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # The tournaments download treats an empty 200 body as "no
                # data"; do the same here instead of failing in .json() and
                # retrying a response that will not change.
                if not response.content:
                    logging.warning(f"No data returned for game ID {game_id}; skipping")
                    break
                # Parse the JSON response
                data = response.json()
                # Add the GameId to the data
                data["GameId"] = game_id
                # Append the data to the list of data entries
                game_data.append(data)
                # Print the status
                print(f"Processed game ID {game_id}")
                break

            logging.error(f"Request for game ID {game_id} failed with status code {response.status_code}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # Handle connection/request exceptions; ValueError covers a body
            # that is not valid JSON on requests versions whose decode error
            # is not a RequestException.
            logging.error(f"Request error for game ID {game_id}: {e}")

        # Reached only when the attempt failed (bad status code or exception)
        retries += 1
        if retries < max_retries:
            logging.info(f"Retrying in 5 seconds (Retry {retries} of {max_retries})...")
            time.sleep(5)
    else:
        # The loop ended without a break: every attempt for this game failed
        logging.error(f"Giving up on game ID {game_id} after {max_retries} attempts")

if game_data:
    # Create a DataFrame from the collected game data
    game_data = spark.createDataFrame(game_data)

    # Save the DataFrame to CSV
    game_data.coalesce(1).write.format("csv").option("header", "true").mode("overwrite").save('games_awarding_prize_money.csv')

    # Save the DataFrame to Parquet ("header" is a CSV-only option, so it is not passed here)
    game_data.coalesce(1).write.format("parquet").mode("overwrite").save('games_awarding_prize_money.parquet')
else:
    # createDataFrame cannot infer a schema from an empty list
    logging.warning("No game data retrieved; skipping DataFrame creation and file output.")

## Get Games Genres Data

In [None]:
from bs4 import BeautifulSoup
import requests
import pyspark.sql.functions as F

# Scrape the "browse by genre" page and build one row per game with its
# genre, title and numeric game ID, then store the result as CSV.
url = 'https://www.esportsearnings.com/games/browse-by-genre'
response = requests.get(url, timeout=30)  # bounded wait instead of hanging on a stalled connection
response.raise_for_status()  # fail loudly on an HTTP error rather than parsing an error page
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Find all genre titles, game statistics, and game boxes
genre_titles = soup.find_all('span', class_='games_main_genre_title')
genre_stats = soup.find_all('span', class_='games_main_genre_stats')
game_boxes = soup.find_all('div', class_='games_main_game_box')
game_links = soup.find_all('a')

# Links to a game page have an href starting with /games/<numeric id>
game_href_pattern = re.compile(r'^/games/(\d+)')

# Extract text and statistics as lists
genre_titles = [genre_title.text for genre_title in genre_titles]
# The first run of digits in each stats span is used as that genre's game count
genre_num = [int(re.search(r'\d+', genre_stat.text).group()) for genre_stat in genre_stats]
game_titles = [game_box['title'] for game_box in game_boxes if 'title' in game_box.attrs]
# Anchors without an href yield None from .get(); fall back to '' so the
# pattern match does not raise TypeError on them.
game_ids = [
    int(match.group(1))
    for link in game_links
    if (match := game_href_pattern.match(link.get('href') or ''))
]

# Titles and IDs are paired purely by position below, so a length mismatch
# means rows would be silently mislabelled or dropped.
# NOTE(review): this assumes the matching anchors appear in the same order as
# the game boxes - confirm against the page markup.
if len(game_titles) != len(game_ids):
    logging.warning(
        f"Found {len(game_titles)} game titles but {len(game_ids)} game ids; "
        "genre rows may be misaligned"
    )

# Initialize an empty list to store dictionaries
data = []

# Walk the flat title/id lists genre by genre, taking num_games entries each
position = 0
for genre_title, num_games in zip(genre_titles, genre_num):
    game_titles_list = game_titles[position:position + num_games]
    game_ids_list = game_ids[position:position + num_games]

    # Create a dictionary for each game and add it to the data list
    for game_title, game_id in zip(game_titles_list, game_ids_list):
        data.append({'Genre': genre_title, 'Game Name': game_title, 'Game Id': game_id})

    position += num_games

# Create a DataFrame from the list of dictionaries
df = spark.createDataFrame(data)

# Save the DataFrame to CSV
df.coalesce(1).write.format("csv").option("header", "true").mode("overwrite").save('game_genres.csv')

# Save the DataFrame to Parquet
# df.coalesce(1).write.format("parquet").option("header", "true").mode("overwrite").save('game_genres.parquet')

In [None]:
# Print a tabular preview of the current `df` DataFrame for a quick visual check.
df.show()

In [None]:
# Dump whatever `game_ids` currently holds (set by the most recently run cell) for inspection.
print(game_ids)

In [6]:
# Shut down the Spark session once all of the work above is finished.
spark.stop()