## Importing Libraries

In [2]:
import os
import requests
import re
import time
import logging
import urllib3

from dotenv import load_dotenv

from bs4 import BeautifulSoup

from pyspark.sql import SparkSession

## Loading Environment Variables

In [3]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# API key sent as the `apikey` query parameter by the API cells below.
api_key = os.getenv("API_KEY")

# Fail fast: requests drops parameters whose value is None, so a missing key
# would otherwise only surface as a series of failed, retried API calls.
if not api_key:
    raise RuntimeError(
        "API_KEY is not set; define it in .env or in the environment before running this notebook"
    )

## Create Spark Session

In [4]:
# Build the Spark session shared by every cell below, or reuse the one that
# already exists in this kernel.
# NOTE(review): if this runs in local mode, executor memory is governed by the
# driver JVM and this setting may have no effect; confirm the intended deployment.
spark = (
    SparkSession.builder
    .appName("esport_data_bronze")
    .config("spark.executor.memory", "64g")
    .getOrCreate()
)

24/02/28 09:33:48 WARN Utils: Your hostname, pitta resolves to a loopback address: 127.0.1.1; using 192.168.100.7 instead (on interface enp6s0)
24/02/28 09:33:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/28 09:33:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/28 09:33:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Get Recent Tournaments Data

In [5]:
# Download recent tournaments page by page, resuming from the offset persisted
# by a previous run, and store the new records as CSV and Parquet.

# Silence the InsecureRequestWarning that verify=False would otherwise trigger.
urllib3.disable_warnings()

# LookupRecentTournaments endpoint. The scheme is plain http, so the API key
# travels unencrypted and verify=False only matters if the server redirects to https.
api_endpoint = "http://api.esportsearnings.com/v0/LookupRecentTournaments"

offset_path = "../../data/bronze/offset/offset.txt"
csv_path = '../../data/bronze/esports_tournaments/esports_tournaments_csv.csv'
parquet_path = '../../data/bronze/esports_tournaments/esports_tournaments_parquet.parquet'

# Resume from the saved offset; a missing or unreadable file restarts from 0.
try:
    with open(offset_path, "r") as offset_file:
        offset = int(offset_file.read().strip())
except (FileNotFoundError, ValueError):
    offset = 0

# Remember where this run started so the writer can tell a fresh run from a resume.
start_offset = offset

all_data = []

# Request parameters
max_retries = 5        # consecutive failures tolerated before giving up
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell
retries = 0

while retries < max_retries:
    params = {
        "apikey": api_key,
        "offset": offset,
    }

    try:
        response = requests.get(
            api_endpoint, params=params, verify=False, timeout=request_timeout
        )

        if response.status_code == 200:
            # An empty body marks the end of the data.
            if response.content == b'':
                print("No more data to retrieve")
                break

            data = response.json()
            if not data:
                print("No more data to retrieve")
                break

            # Anything other than a list of records cannot be appended or counted.
            if not isinstance(data, list):
                raise ValueError(f"unexpected payload type {type(data).__name__}")

            all_data.extend(data)
            # Advance by what was actually received so a short page does not
            # make the next run skip records.
            offset += len(data)
            # A successful page ends the current failure streak.
            retries = 0
            print(f"Processed {offset} records")
            continue

        logging.error(f"API request failed with status code: {response.status_code}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")

    # Shared failure path for bad status codes and exceptions.
    retries += 1
    if retries < max_retries:
        print("Retrying in 5 seconds...")
        time.sleep(5)

if retries == max_retries:
    logging.error("Max retries reached. Exiting.")

if all_data:
    # Create a DataFrame from the retrieved data
    all_data = spark.createDataFrame(all_data)

    # A resumed run only holds the new pages, so it must append; overwriting
    # would discard the records stored by earlier runs.
    write_mode = "append" if start_offset > 0 else "overwrite"

    # Save the DataFrame to CSV
    all_data.coalesce(1).write.format("csv").option("header", "true").mode(write_mode).save(csv_path)

    # Save the DataFrame to Parquet
    all_data.coalesce(1).write.format("parquet").mode(write_mode).save(parquet_path)
else:
    # Spark cannot infer a schema from an empty list; leave existing output as is.
    print("No new tournament records retrieved; existing output left unchanged")

# Persist the offset only after the data has been written, so a failed write
# does not cause the next run to skip the records fetched above.
os.makedirs(os.path.dirname(offset_path), exist_ok=True)
with open(offset_path, "w") as offset_file:
    offset_file.write(str(offset))


Processed 100 records
Processed 200 records
Processed 300 records
Processed 400 records
Processed 500 records
Processed 600 records


## Get Games Awarding Prize Money

In [6]:
# Look up every game referenced by the stored tournaments and save the results
# as CSV and Parquet.

# Silence the InsecureRequestWarning that verify=False would otherwise trigger.
urllib3.disable_warnings()

# Read the parquet file to obtain the game_id values
parquet_data = spark.read.parquet('../../data/bronze/esports_tournaments/esports_tournaments_parquet.parquet')

# Distinct GameId values, flattened from single-column rows into a plain list
game_ids = parquet_data.select('GameId').distinct().rdd.flatMap(lambda x: x).collect()

# LookupGameById endpoint; the game is selected with the `gameid` parameter
api_endpoint = "http://api.esportsearnings.com/v0/LookupGameById"

# Initialize the list to store game data
game_data = []

# Request parameters
max_retries = 5        # attempts per game before it is skipped
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

for game_id in game_ids:
    # A null GameId cannot be looked up.
    if game_id is None:
        continue

    params = {
        "apikey": api_key,
        "gameid": game_id,
    }

    retries = 0

    while retries < max_retries:
        try:
            response = requests.get(
                api_endpoint, params=params, verify=False, timeout=request_timeout
            )

            if response.status_code == 200:
                data = response.json()
                # The GameId is stored on the payload, so it has to be a dict.
                if not isinstance(data, dict):
                    raise ValueError(f"unexpected payload type {type(data).__name__}")
                data["GameId"] = game_id
                game_data.append(data)
                print(f"Processed game ID {game_id}")
                break

            logging.error(f"Request for game ID {game_id} failed with status code {response.status_code}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers undecodable JSON on requests versions where the
            # decode error is not a RequestException, plus the payload check above.
            logging.error(f"Request error for game ID {game_id}: {e}")

        # Shared failure path for bad status codes and exceptions.
        retries += 1
        if retries < max_retries:
            # Only visible when the logging level is INFO or lower.
            logging.info(f"Retrying in 5 seconds (Retry {retries} of {max_retries})...")
            time.sleep(5)
    else:
        # Reached only when every attempt failed (no break).
        logging.error(f"Giving up on game ID {game_id} after {max_retries} attempts")

if game_data:
    # Create a DataFrame from the collected game data
    game_data = spark.createDataFrame(game_data)

    # Save the DataFrame to CSV
    game_data.coalesce(1).write.format("csv").option("header", "true").mode("overwrite").save('../../data/bronze/esports_tournaments/games_awarding_prize_money_csv.csv')

    # Save the DataFrame to Parquet
    game_data.coalesce(1).write.format("parquet").mode("overwrite").save('../../data/bronze/esports_tournaments/games_awarding_prize_money_parquet.parquet')
else:
    # Spark cannot infer a schema from an empty list; leave existing output as is.
    logging.error("No game data retrieved; existing output left unchanged")

                                                                                

Processed game ID 541
Processed game ID 558
Processed game ID 191
Processed game ID 418
Processed game ID 730
Processed game ID 270
Processed game ID 222
Processed game ID 705
Processed game ID 442
Processed game ID 367
Processed game ID 243
Processed game ID 348
Processed game ID 415
Processed game ID 277
Processed game ID 847
Processed game ID 656
Processed game ID 502
Processed game ID 167
Processed game ID 831
Processed game ID 385
Processed game ID 155
Processed game ID 736
Processed game ID 857
Processed game ID 564
Processed game ID 588
Processed game ID 241
Processed game ID 602
Processed game ID 347
Processed game ID 237
Processed game ID 330
Processed game ID 724
Processed game ID 530
Processed game ID 198
Processed game ID 414
Processed game ID 823
Processed game ID 487
Processed game ID 486
Processed game ID 196
Processed game ID 427
Processed game ID 677
Processed game ID 858
Processed game ID 532
Processed game ID 635
Processed game ID 184
Processed game ID 853
Processed 

ERROR:root:Request for game ID 249 failed with status code 502


Processed game ID 249
Processed game ID 762
Processed game ID 819
Processed game ID 822
Processed game ID 795
Processed game ID 688
Processed game ID 381
Processed game ID 383
Processed game ID 220
Processed game ID 777
Processed game ID 379
Processed game ID 567
Processed game ID 535
Processed game ID 580
Processed game ID 473
Processed game ID 440
Processed game ID 653
Processed game ID 213
Processed game ID 573
Processed game ID 454
Processed game ID 192
Processed game ID 337
Processed game ID 841
Processed game ID 156
Processed game ID 628
Processed game ID 587
Processed game ID 636
Processed game ID 429
Processed game ID 715
Processed game ID 160
Processed game ID 618
Processed game ID 482
Processed game ID 382
Processed game ID 206
Processed game ID 478
Processed game ID 431
Processed game ID 362
Processed game ID 491
Processed game ID 655
Processed game ID 355
Processed game ID 200
Processed game ID 855
Processed game ID 826
Processed game ID 614
Processed game ID 373
Processed 

ERROR:root:Request for game ID 572 failed with status code 502


Processed game ID 572
Processed game ID 394
Processed game ID 751
Processed game ID 786
Processed game ID 169
Processed game ID 758
Processed game ID 273
Processed game ID 456
Processed game ID 739
Processed game ID 458
Processed game ID 340
Processed game ID 526
Processed game ID 469
Processed game ID 211


                                                                                

## Get Games Genres Data

In [8]:
from bs4 import BeautifulSoup
import requests
import pyspark.sql.functions as F

# Scrape the browse-by-genre page and store one row per game
# (Genre, Game Name, Game Id) as CSV and Parquet.

url = 'https://www.esportsearnings.com/games/browse-by-genre'

# Fail on a stalled connection or an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Find all genre titles, game statistics, game boxes and links
genre_title_tags = soup.find_all('span', class_='games_main_genre_title')
genre_stat_tags = soup.find_all('span', class_='games_main_genre_stats')
game_boxes = soup.find_all('div', class_='games_main_game_box')
game_links = soup.find_all('a')

# Compiled once; captures the numeric id from hrefs that start with /games/<id>
game_id_pattern = re.compile(r'^/games/(\d+)')

# Extract text and statistics as lists
genre_titles = [tag.text for tag in genre_title_tags]
# The first integer in each stats span is used as that genre's game count
genre_num = [int(re.search(r'\d+', tag.text).group()) for tag in genre_stat_tags]
game_titles = [game_box['title'] for game_box in game_boxes if 'title' in game_box.attrs]
# Anchors without an href return None from .get(); fall back to '' so the
# pattern match does not raise TypeError.
game_ids = [
    int(match.group(1))
    for link in game_links
    if (match := game_id_pattern.match(link.get('href') or ''))
]

# Titles and ids are paired purely by position below, so mismatched counts mean
# the genre/title/id columns may be misaligned.
# NOTE(review): ids come from every matching link on the page, not only links
# inside game boxes - confirm against the live page structure.
if not (sum(genre_num) == len(game_titles) == len(game_ids)):
    logging.warning(
        f"Genre page counts differ: stats={sum(genre_num)}, "
        f"titles={len(game_titles)}, ids={len(game_ids)}"
    )

# Initialize an empty list to store dictionaries
data = []

# Walk the flat title/id lists genre by genre, taking num_games entries each time
position = 0
for genre_title, num_games in zip(genre_titles, genre_num):
    game_titles_list = game_titles[position:position + num_games]
    game_ids_list = game_ids[position:position + num_games]

    # Create a dictionary for each game and add it to the data list
    for game_title, game_id in zip(game_titles_list, game_ids_list):
        data.append({'Genre': genre_title, 'Game Name': game_title, 'Game Id': game_id})

    position += num_games

# Spark cannot infer a schema from an empty list; report the real cause instead.
if not data:
    raise ValueError("No games were parsed from the genre page; the page layout may have changed")

# Create a DataFrame from the list of dictionaries
df = spark.createDataFrame(data)

# Save the DataFrame to CSV
df.coalesce(1).write.format("csv").option("header", "true").mode("overwrite").save('../../data/bronze/esports_tournaments/game_genres_csv.csv')

# Save the DataFrame to Parquet
df.coalesce(1).write.format("parquet").mode("overwrite").save('../../data/bronze/esports_tournaments/game_genres_parquet.parquet')

                                                                                

In [6]:
# Shut down the Spark session created at the top of the notebook; run this cell last.
spark.stop()