In [None]:
import logging
import re
import tomllib
from io import StringIO
from pathlib import Path

# import pendulum
import pandas as pd
from selectolax.parser import HTMLParser
from splinter import Browser
from splinter.driver.webdriver.chrome import WebDriver
from webdriver_manager.chrome import ChromeDriverManager

from classes import Gamer
from utils import browser_action, retrieve_game_nodes

### Set Up Logging

In [None]:
# Configure the root handler once; style="{" selects str.format-style fields.
logging.basicConfig(
    style="{",
    format="{levelname:8s} - {module} - {funcName}: {message}",
)

# Notebook-level logger, set to emit everything from DEBUG upwards.
logger_main = logging.getLogger(__name__)
logger_main.setLevel(logging.DEBUG)

### Create Input Data

In [None]:
# secrets.toml is read from the parent of the resolved working directory.
with (Path().resolve().parents[0] / "secrets.toml").open("rb") as f:
    secrets = tomllib.load(f)

# One Gamer per (name, gamer_tag) pair stored under the "gamer_ids" key.
gamers: list[Gamer] = [Gamer(*entry) for entry in secrets["gamer_ids"].items()]

for gamer in gamers:
    print(gamer, "\n")

## Extract
### Create Browser Instance

In [None]:
# Prep the automated Chrome browser. ChromeDriverManager().install() supplies
# the value forwarded to Splinter as `executable_path`.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser: WebDriver = Browser("chrome", headless=False, **executable_path)

In [None]:
# def scrape_game_history():
#     # 3 loops.
#     for gamer in gamers:
#         visit(url)

#         game_history = retrieve_game_history(browser)
#         if not game_history:
#             print("No game history")

#         # m games per n pages up until the crawler runs into an already seen game.
#         for game in game_history:
#             if is_cpu_game():
#                 continue
#             else:
#                 collect()

# def visit(url):
#     browser_action(browser.visit, url=url)

# def retrieve_game_history(browser):
#     parsed_html = BeautifulSoup(browser.html, "lxml")
#     return parsed_html.find_all("div", attrs={"class": "mlb22-games-box"})

### Visit and Retrieve HTML

In [None]:
# Visit the first gamer's page (its `url_gamer` attribute). The visit goes
# through utils.browser_action, which receives the bound method and the URL;
# presumably it wraps the call with extra handling — confirm in utils.
browser_action(browser.visit, url=gamers[0].url_gamer)

In [None]:
# Snapshot the browser's current HTML and parse it with selectolax.
game_history_html = browser.html
game_history_parser = HTMLParser(game_history_html)

### Retrieve Individual Game Nodes

In [None]:
# Fetch the per-game nodes matching the CSS selector. The helper also returns a
# parser, which rebinds `game_history_parser` — presumably refreshed from the
# browser; confirm in utils.retrieve_game_nodes.
game_nodes, game_history_parser = retrieve_game_nodes(
    parser=game_history_parser,
    css_selector="div.mlb22-games-box",
    browser=browser,
)
# Work on the first game only: re-parse its HTML as a standalone document.
game_html = game_nodes[0].html
game_parser = HTMLParser(game_html)

## Individual Game Processing
### Game Date

In [None]:
# The date text is read from the first <p> that is the second child of its
# parent within the game node.
# NOTE(review): css_first is expected to return None when nothing matches, in
# which case the .text() call below raises AttributeError — confirm.
date_p_tag = game_parser.css_first("p:nth-child(2)")
game_date = date_p_tag.text()
game_date

### View Game href

In [None]:
# Collect every <a> node of the game; the last one is taken to be the
# "View Game" link — TODO confirm against the page markup.
view_game_a_tag = game_parser.tags("a")
view_game_href = view_game_a_tag[-1].attributes["href"]
view_game_href

### Click the View Game Link

In [None]:
# Click the link whose href matches the one extracted from the game node.
browser_action(browser.links.find_by_href(view_game_href).click)

# Snapshot and parse the page the browser is on after the click.
game_stats_html = browser.html
game_stats_parser = HTMLParser(game_stats_html)

### Collect All Tables

In [None]:
# Parse the tables found in the game page into a list of DataFrames. The markup
# is wrapped in StringIO because passing literal HTML to pd.read_html is
# deprecated (pandas >= 2.1 emits a FutureWarning); a file-like object is
# accepted by every pandas version.
game_data = pd.read_html(StringIO(game_stats_parser.html))
len(game_data)

In [None]:
# One whitespace character followed by a parenthesised run of word characters,
# whitespace and hyphens, e.g. " (some text)".
parenthesis_pattern = re.compile(r"\s\([\w\s-]+\)")

# Digits immediately followed by a capital letter, captured as two groups.
num_cap_letter_pattern = re.compile(r"(\d+)([A-Z])")

# Digits, one space, then the literal word "Inning", captured as two groups.
num_space_inn_pattern = re.compile(r"(\d+) (Inning)")

# Per-inning summary line: the five labelled counters in a fixed order,
# separated by single spaces.
inning_stats_pattern = re.compile(
    " ".join(
        rf"{label}: \d+" for label in ("Runs", "Hits", "Walks", "Errors", "Pitches")
    )
)

In [None]:
# Raw HTML of the last "section-block" div, which holds the game log.
game_log_section = game_stats_parser.css("div.section-block")[-1]
game_log_section = game_log_section.html  # .text(strip=True)

# Split the section into the game log and the supplementary information that
# follows the "Game Log Legend" text.
game_log_section = game_log_section.split("Game Log Legend")

# Find the game difficulty in the supplementary part. Fall back to None (with a
# warning) instead of raising AttributeError when the sentence is missing, and
# trim the whitespace the greedy character class may have captured.
match = re.search(r"Hitting Difficulty is ([\w\s-]+)", game_log_section[-1])
if match:
    difficulty = match.group(1).strip()
else:
    difficulty = None
    logger_main.warning("Hitting difficulty not found in the game log section.")

# Assign the game log portion.
game_log = game_log_section[0]

# Remove the leading div and h3 tag text.
game_log = game_log.replace('<div class="section-block">\n<h3>Game Log</h3>\n', "")

# Remove any asterisk characters.
game_log = game_log.replace("*", "")

# Remove information contained in parentheses.
game_log = parenthesis_pattern.sub("", game_log)

# Split on the <br> tag text. Strip first, then drop empties, so that
# whitespace-only segments do not survive as "" entries.
game_log_split = [line.strip() for line in game_log.split("<br>")]
game_log_split = [line for line in game_log_split if line]

# Separate the per-inning summary lines from the play-by-play lines. The
# summaries must be collected BEFORE they are filtered out of game_log_split;
# collecting them afterwards always produced an empty list.
inning_stats = [line for line in game_log_split if inning_stats_pattern.match(line)]
game_log_split = [
    line for line in game_log_split if not inning_stats_pattern.match(line)
]

In [None]:
# Collapse the remaining game log lines into one space-separated string.
game_log = " ".join(game_log_split)

In [None]:
# Display the joined game log for inspection.
game_log

In [None]:
# One name per table, in the order they are paired with `game_data`: the
# linescore first, then hitting and pitching for the away and home sides.
boxscore_names = ["linescore"] + [
    f"{side}_{kind}"
    for side in ("away", "home")
    for kind in ("hitting", "pitching")
]

### Write to CSV

In [None]:
# Persist each table as "<name>.csv" (relative path, so in the current working
# directory), without the row index.
# NOTE: zip() stops at the shorter input, so tables beyond len(boxscore_names)
# are silently skipped and a shorter `game_data` leaves some names unwritten.
for name, boxscore in zip(boxscore_names, game_data):
    path = f"{name}.csv"
    boxscore.to_csv(path, index=False)

### Read from CSV

In [None]:
# Reload the tables from disk in boxscore_names order, rebinding `game_data`.
# After the CSV round trip the column labels are read back from the header row
# as strings such as "0".
game_data = [pd.read_csv(f"{name}.csv") for name in boxscore_names]

### Clean Tables
#### Linescore

In [None]:
def clean_linescore(linescore_raw: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw linescore table.

    The raw table carries its header in the first row and a leading column
    labelled ``0`` that is discarded. The first three remaining header cells
    are named ``team_name``, ``gamer_tag`` and ``result``; the header row is
    then promoted to column labels, the rows are indexed by ``gamer_tag`` and
    the ``R``, ``H`` and ``E`` columns are converted with ``pd.to_numeric``.

    Args:
        linescore_raw: Raw linescore. Column labels may be integers or strings
            (e.g. after a CSV round trip); both are accepted. The input frame
            is not modified.

    Returns:
        A new DataFrame indexed by gamer tag, with unnamed index and column
        axes.

    Raises:
        KeyError: If the ``0`` column or any of ``R``/``H``/``E`` is missing.
    """
    linescore = linescore_raw.copy()

    # Normalise the labels so the integer 0 and the string "0" both match.
    linescore.columns = linescore.columns.astype(str)
    linescore = linescore.drop(columns="0")

    # Name the three leading header cells, then promote the header row. The
    # row is converted to a plain list so its label (0) does not become the
    # name of the column axis.
    linescore.iloc[0, :3] = ["team_name", "gamer_tag", "result"]
    linescore.columns = linescore.iloc[0].to_list()
    linescore = linescore.drop(index=0).set_index("gamer_tag")

    cols_to_numeric = ["R", "H", "E"]
    linescore[cols_to_numeric] = linescore[cols_to_numeric].apply(pd.to_numeric)
    linescore.index.name = None

    return linescore

In [None]:
# The linescore is the first table (position 0 in boxscore_names).
linescore_raw = game_data[0]
linescore = clean_linescore(linescore_raw)
linescore

In [None]:
# Inspect the dtypes, e.g. to check the R, H and E columns after cleaning.
linescore.dtypes

#### Hitting Boxscores

In [None]:
# Second table: "away_hitting" per the boxscore_names order. Displayed
# uncleaned for inspection.
hitting_raw = game_data[1]
hitting_raw

#### Pitching Boxscores

In [None]:
# Third table: "away_pitching" per the boxscore_names order. Displayed
# uncleaned for inspection.
pitching_raw = game_data[2]
pitching_raw