In [None]:
import pickle
import re
from pathlib import Path
from pprint import pprint

# import pendulum
import pandas as pd
from selectolax.parser import HTMLParser

from classes import GameLog

### Read from Pickle and CSV

In [None]:
# Resolve the data directory relative to the notebook's working directory.
base_path = Path("../data").resolve()

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own extract step.
with (base_path / "extract_vars.pickle").open("rb") as pickle_file:
    extract_vars = pickle.load(pickle_file)

# The first pickled entry is fed to HTMLParser as markup; the second supplies
# the CSV file stems read in the next cell.
game_stats_parser_html, boxscore_names = extract_vars[0], extract_vars[1]
game_stats_parser = HTMLParser(game_stats_parser_html)

In [None]:
# Read "<name>.csv" from the data directory for every boxscore name, keeping
# the resulting DataFrames in the same order as the names.
boxscore_paths = (base_path / f"{name}.csv" for name in boxscore_names)
game_boxscores = list(map(pd.read_csv, boxscore_paths))

In [None]:
# Take the first loaded table and display it for inspection.
boxscore = game_boxscores[0]
boxscore

### Clean Tables
#### Linescore

In [None]:
def clean_linescore(linescore_raw: pd.DataFrame) -> pd.DataFrame:
    """Clean raw linescore.

    Works on a copy, so ``linescore_raw`` is left untouched. The steps are:

    1. Drop the column labelled ``"0"``.
    2. Overwrite the first three cells of the first row with the labels
       ``team``, ``gamer`` and ``result``.
    3. Promote that first row to the column header and remove it from the data.
    4. Move the ``gamer`` column to the front and cast all labels to ``str``.
    5. Label the two remaining rows ``away`` and ``home`` (in that order).
    6. Convert the ``R``, ``H`` and ``E`` columns to numeric dtypes.

    Args:
        linescore_raw: Linescore table whose first row holds the header cells.

    Returns:
        The cleaned linescore, indexed by ``away``/``home``, with unnamed row
        and column axes.

    Raises:
        KeyError: If the ``"0"``, ``gamer``, ``R``, ``H`` or ``E`` columns are
            missing.
        ValueError: If the table does not have exactly two data rows.
    """
    linescore = linescore_raw.copy()

    linescore = linescore.drop(columns="0")
    # A plain list is assigned positionally; this avoids depending on how a
    # Series value would (or would not) be aligned against the column labels.
    linescore.iloc[0, :3] = ["team", "gamer", "result"]
    linescore.columns = pd.Index(linescore.iloc[0])
    linescore = linescore.drop(index=0)
    linescore.insert(loc=0, column="gamer", value=linescore.pop("gamer"))
    # The promoted header row mixes strings and numbers; normalise to str.
    linescore.columns = linescore.columns.astype(str)

    linescore.index = pd.Index(["away", "home"])

    cols_to_numeric = ["R", "H", "E"]
    linescore[cols_to_numeric] = linescore[cols_to_numeric].apply(pd.to_numeric)

    # The header was built from row 0, so the columns axis inherited the name
    # ``0``; clear it along with the index name so neither axis shows a label.
    linescore.index.name = None
    linescore.columns.name = None

    return linescore

In [None]:
# The first loaded table is used as the raw linescore; clean it and display
# the result.
linescore_raw = game_boxscores[0]
linescore = clean_linescore(linescore_raw)
linescore

#### Batting Boxscores

In [None]:
# The second loaded table is used as the raw batting boxscore.
batting_raw = game_boxscores[1]
batting_raw

In [None]:
# Take the "Batter" column and discard its final row (presumably a totals
# row — confirm against the raw table) before displaying it.
batter_column = batting_raw["Batter"]
batters = batter_column.drop(batter_column.tail(1).index)
batters

### Batters Without Position Info
#### RegEx Patterns

In [None]:
# Patterns used to pull player names out of the boxscore cells.
name_re, period_re, ellipsis_re = map(
    re.compile,
    (
        # Optional "<lowercase letter>-" prefix (group 1), then everything up
        # to the last comma (group 2).
        r"([a-z]-)?(.+),",
        # A single literal period.
        r"\.{1}",
        # The text on either side of a literal "...".
        r"(.+)\.{3}(.+)",
    ),
)

#### Include periods in names that have them

In [None]:
# Rebuild the batter column from `batting_raw` instead of from `batters`
# itself, so this cell gives the same result however many times it is run.
# (Extracting from already-extracted names would find no comma and yield NaN.)
batter_cells = batting_raw["Batter"]
batter_cells = batter_cells.drop(batter_cells.tail(1).index)

# Group 1 of name_re is the optional "<letter>-" prefix; column 1 of the
# extract result is group 2, the name preceding the comma.
batters = batter_cells.str.extract(name_re.pattern)[1]
batters

#### Periods in names removed

In [None]:
# Remove every literal period from the extracted batter names.
batters_clean = batters.str.replace(period_re.pattern, "", regex=True)
batters_clean

### Only Batters With Periods In Their Names
#### With Periods

In [None]:
# Keep only the names that contain at least one literal period. `na=False`
# treats missing names (rows where the name pattern did not match) as
# "no period" so the boolean mask never contains NaN, which would make the
# indexing raise.
has_period = batters.str.contains(period_re.pattern, regex=True, na=False)
batters_to_replace = batters[has_period]
batters_to_replace

#### Without Periods

In [None]:
# Period-free versions of the names selected above, printed one per line.
batters_replacements = batters_to_replace.str.replace(
    period_re.pattern, "", regex=True
)
for replacement in batters_replacements:
    print(replacement)

#### Pitching Boxscores

In [None]:
# The third loaded table is used as the raw pitching boxscore; list the
# values of its "Pitcher" column.
pitching_raw = game_boxscores[2]
pitchers = pitching_raw["Pitcher"]
pitchers.to_list()

In [None]:
# Display the raw "Pitcher" column; no cleaning is applied to it yet.
pitchers = pitching_raw["Pitcher"]
pitchers
# TODO: unfinished experiments, kept for reference until pitcher cleaning is written.
# pitchers = pitchers.drop(pitchers.tail(1).index)
# pitchers[pitchers.str.contains(r"\.{1}")]
# TODO: write a regex to remove "..." (presumably the literal ellipsis in pitcher names; confirm).

#### Compile Relevant RegEx Patterns for Game Log Processing

In [None]:
# Compiled patterns used while cleaning and splitting the game log text.
patterns = dict(
    # Whitespace followed by a parenthesised note made of word characters,
    # whitespace, "%" or "-".
    parenthesis_re=re.compile(r"\s\([\w\s%-]+\)"),
    # A per-inning summary line.
    inning_stats_re=re.compile(
        r"Runs: \d+ Hits: \d+ Walks: \d+ Errors: \d+ Pitches: \d+"
    ),
    # The difficulty label following "Hitting Difficulty is ".
    game_difficulty_re=re.compile(
        r"Hitting Difficulty is (?P<difficulty>[\w\s-]+)"
    ),
)

#### Split Game Log

In [None]:
# Raw HTML of the last "section-block" div, i.e. the game log section.
raw_section_html = game_stats_parser.css("div.section-block")[-1].html

# Strip the leading div/h3 markup and every asterisk, then drop the
# parenthesised notes matched by parenthesis_re.
section_header = '<div class="section-block">\n<h3>Game Log</h3>\n'
cleaned_section = raw_section_html.replace(section_header, "").replace("*", "")
cleaned_section = patterns["parenthesis_re"].sub("", cleaned_section)

# Everything before "Game Log Legend" is the game log proper; the legend
# marker and whatever follows it are kept together as supplementary text.
game_log_section = cleaned_section.partition("Game Log Legend")
game_log, legend_marker, legend_tail = game_log_section
game_misc = legend_marker + legend_tail

#### Game Difficulty

In [None]:
# The difficulty label lives in the supplementary text after the game log;
# fall back to None when the label is absent.
difficulty_match = patterns["game_difficulty_re"].search(game_misc)
difficulty = difficulty_match.group("difficulty") if difficulty_match else None
difficulty

#### Prepare Game Log String

In [None]:
# Split on the <br> tag text and strip surrounding whitespace from each piece.
# The emptiness check runs AFTER stripping so that whitespace-only pieces
# (e.g. a lone newline between two <br> tags) are dropped too instead of
# surviving as empty strings.
game_log_split_full = [
    stripped
    for stripped in (piece.strip() for piece in game_log.split("<br>"))
    if stripped
]

# Separate the per-inning summary lines from the rest of the log lines.
game_log_split = [
    line for line in game_log_split_full if not patterns["inning_stats_re"].match(line)
]
inning_stats = [
    line for line in game_log_split_full if patterns["inning_stats_re"].match(line)
]

In [None]:
# Re-assemble the non-summary log lines into one space-separated string.
game_log = " ".join(game_log_split)
game_log

#### Game Log

In [None]:
# Re-select the last "section-block" div and pass its raw HTML, together with
# the `batters` Series, to the project's GameLog class.
game_log_section = game_stats_parser.css("div.section-block")[-1]
game_log = GameLog(batters, game_log_section.html)

In [None]:
# Inspect the GameLog attribute; presumably the batter names that contain
# periods — confirm in the `classes` module.
game_log.batters_to_replace

In [None]:
# Inspect the GameLog attribute; presumably the period-free replacements for
# those names — confirm in the `classes` module.
game_log.batters_replacements

In [None]:
# Line-level patterns consumed by process_inning_stats.
inning_number_re, team_re, inning_stats_re = (
    re.compile(pattern)
    for pattern in (
        # "Inning <n>:" at the start of a line; the group excludes the colon.
        r"^(?P<inning>Inning \d+):",
        # "<team> batting." at the start of a line.
        r"^(?P<team>.+) batting\.",
        # One "<stat>: <number>" pair, with an optional trailing space.
        r"(?P<stat>[\w ]+): (?P<value>\d+) ?",
    )
)

In [None]:
def process_inning_stats(game_log: GameLog) -> dict[str, dict[str, dict[str, int]]]:
    """Separate, by inning and team, the inning stat lines from the game.

    Walks ``game_log.log_split`` in order and classifies each line:

    * a line matching ``inning_number_re`` opens a new inning entry;
    * a line matching ``team_re`` opens an (initially empty) entry for that
      team inside the current inning;
    * any other line containing ``<stat>: <number>`` pairs (as matched by
      ``inning_stats_re``) supplies the stats for the current inning and team.

    Lines that contain no stat pairs are ignored, as are team or stat lines
    that appear before the first inning header (and stat lines before the
    first team line).

    Args:
        game_log: Object whose ``log_split`` attribute is an iterable of log
            line strings.

    Returns:
        A mapping ``{inning: {team: {stat: value}}}`` with integer values.
    """
    inning_stats: dict[str, dict[str, dict[str, int]]] = {}
    # Most recent inning / batting team seen; None until a header line appears.
    inning: str | None = None
    team: str | None = None

    for line in game_log.log_split:
        inning_number_match = inning_number_re.match(line)
        if inning_number_match:
            # The named group already excludes the trailing colon.
            inning = inning_number_match.group("inning")
            inning_stats[inning] = {}
            continue

        team_match = team_re.match(line)
        if team_match:
            if inning is not None:
                team = team_match.group("team")
                inning_stats[inning][team] = {}
            continue

        # finditer returns an iterator, which is always truthy, so the matches
        # must be materialised before testing whether the line held any stats.
        stats = {
            match.group("stat"): int(match.group("value"))
            for match in inning_stats_re.finditer(line)
        }
        if stats and inning is not None and team is not None:
            inning_stats[inning][team] = stats

    return inning_stats

In [None]:
# Build the per-inning, per-team stats mapping and pretty-print it.
inning_stats_by_team = process_inning_stats(game_log)
pprint(inning_stats_by_team)