In [None]:
import pickle
import re
from pathlib import Path
from pprint import pprint

# import pendulum
import pandas as pd
from pandas import DataFrame
from selectolax.parser import HTMLParser

from classes import BattingBoxscore, GameLog, HomeAwayData, Linescore, PitchingBoxscore

### Read from CSV

In [None]:
# Resolve the data directory relative to the current working directory.
base_path = Path("../data").resolve()

# Load the pickled variables from the data directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load pickle files that this project produced itself.
with open(base_path / "extract_vars.pickle", "rb") as f:
    extract_vars = pickle.load(f)

# Element 0 is the HTML source handed to HTMLParser below; element 1 holds the
# table names.
game_stats_parser_html = extract_vars[0]
table_names = extract_vars[1]
game_stats_parser = HTMLParser(game_stats_parser_html)

In [None]:
# Read "<name>.csv" from the data directory for every table name, keeping the
# order of table_names so tables can be addressed by position.
game_tables = [pd.read_csv(base_path / f"{name}.csv") for name in table_names]

## Linescore

In [None]:
# Wrap the first table (position 0) in the project's Linescore class.
linescore = Linescore(game_tables[0])

In [None]:
# Display the `linescore` attribute of the Linescore object.
linescore.linescore

## Boxscores
#### RegEx Patterns

In [None]:
# NOTE: the cells in this notebook pass only the `.pattern` string of each
# compiled regex to pandas' `.str` methods.

# Optional "<lowercase letter>-" prefix, then `name` greedily captures
# everything up to the last comma.
batter_names_re = re.compile(r"([a-z]-)?(?P<name>.+),")
# A single literal period (the `{1}` quantifier is redundant).
period_re = re.compile(r"\.{1}")
# `name` captures the text that precedes a literal three-period ellipsis.
abbreviated_re = re.compile(r"(?P<name>.+)(\.{3})")
# A literal three-period ellipsis on its own.
ellipsis_re = re.compile(r"\.{3}")
# A space followed by a parenthesised word, e.g. " (W)"; `stat` is the word
# inside the parentheses and `parenthesis` is the whole match.
pitching_stat_re = re.compile(r"(?P<parenthesis> \((?P<stat>\w+)\))")

### Batting Boxscores

In [None]:
# Pair tables 1 and 3 in a HomeAwayData container.
# NOTE(review): which argument is home and which is away is defined by
# HomeAwayData — confirm against the class.
batting_boxscore_raw = HomeAwayData(game_tables[1], game_tables[3])

In [None]:
def clean_batting_boxscore(boxscore_raw: DataFrame):
    """Return a cleaned copy of a raw batting boxscore.

    The "AVG" column is dropped, the "Batter" values of every row except the
    last are replaced by the `name` group of `batter_names_re`, and "Batter"
    becomes the (unnamed) index. The input frame is not modified.

    Args:
        boxscore_raw: Frame with at least "Batter" and "AVG" columns. The last
            row is left untouched (presumably a totals row — TODO confirm).

    Returns:
        The cleaned DataFrame indexed by batter name. Rows whose "Batter"
        value does not match the pattern get a missing value as their name.
    """
    cleaned = boxscore_raw.copy().drop(columns="AVG")

    # Positional column lookup so that `.iloc` can exclude the final row.
    batter_col = cleaned.columns.get_loc("Batter")
    raw_names = cleaned.iloc[:-1, batter_col]
    extracted = raw_names.str.extract(batter_names_re.pattern)
    cleaned.iloc[:-1, batter_col] = extracted["name"]

    cleaned = cleaned.set_index("Batter")
    cleaned.index.name = None

    return cleaned

In [None]:
# batting_boxscore = clean_batting_boxscore(batting_boxscore_raw.away)
# batting_boxscore

# Build the batting boxscore with the project's BattingBoxscore class (the
# commented-out lines above call the notebook-local function instead).
batting = BattingBoxscore(batting_boxscore_raw)
# Display the raw boxscore stored under the "home" key.
batting.boxscore_raw["home"]

#### Periods in names removed

In [None]:
def generate_replacement_data(players):
    """Generate the to_be_replaced and replacement names Series.

    Args:
        players: pandas Series of player names (strings).

    Returns:
        A 3-tuple of Series:
        - every name with all periods removed (same index as `players`),
        - only the names that contain at least one period, unchanged,
        - those same names with the periods removed (same index as the
          second Series, so the two can be zipped pairwise).
    """
    # Operate on the argument, not on a notebook-level global, so the function
    # works for any Series (home or away, batters or pitchers) and on a fresh
    # kernel.
    # na=False: missing names count as "no period" instead of producing a
    # NaN mask, which pandas refuses to index with.
    has_period = players.str.contains(period_re.pattern, regex=True, na=False)

    # All names with any periods removed.
    players_no_periods = players.str.replace(period_re.pattern, "", regex=True)
    # Names that have periods.
    players_to_replace = players[has_period]
    # Names that have periods, now with the periods removed. Reuses the result
    # above rather than running the regex replacement a second time.
    players_replacements = players_no_periods[has_period]

    return players_no_periods, players_to_replace, players_replacements

In [None]:
# Home batter names, taken from the index of batting.boxscore.home.
batters_home = pd.Series(batting.boxscore.home.index)
# Unpack the three Series returned by generate_replacement_data: all names
# without periods, the names containing periods, and their replacements.
(
    batters_no_periods,
    batters_to_replace,
    batters_replacements,
) = generate_replacement_data(batters_home)

### Only Batters With Periods In Their Names
#### With Periods

In [None]:
# Select the names containing a period directly from batters_home and display
# them. This rebinds the batters_to_replace name unpacked in an earlier cell.
batters_to_replace = batters_home[
    batters_home.str.contains(period_re.pattern, regex=True)
]
batters_to_replace

#### Without Periods

In [None]:
# Strip the periods from the selected names and display the result. This
# rebinds the batters_replacements name unpacked in an earlier cell.
batters_replacements = batters_to_replace.str.replace(period_re.pattern, "", regex=True)
batters_replacements

### Pitching Boxscores

In [None]:
# Pair tables 2 and 4 in a HomeAwayData container.
# NOTE(review): which argument is home and which is away is defined by
# HomeAwayData — confirm against the class.
pitching_boxscore_raw = HomeAwayData(game_tables[2], game_tables[4])

In [None]:
# Disabled scratch code: when enabled it overwrites the first-column value of
# rows 2, 4, 6 and 7 of the away pitching table with the strings below
# (presumably to simulate truncated names — TODO confirm or delete this cell).

# df = pitching_boxscore_raw.away

# df.iloc[2,0] = "WoodsRicha"
# df.iloc[4,0] = "Montes de "
# df.iloc[6,0] = "De Los San"
# df.iloc[7,0] = "Ponce de L"
# pitching_boxscore_raw.away = df

In [None]:
# Display the `away` attribute of the raw pitching container.
pitching_boxscore_raw.away

In [None]:
# Game-decision columns appended to every cleaned pitching boxscore, in the
# order they appear in the output.
pitching_categories = ["W", "L", "S", "BS", "H"]


def clean_pitching_boxscore(boxscore_raw: DataFrame):
    """Clean the raw pitching boxscore.

    Drops the "ERA" column, turns the parenthesised game decision found in
    the "Pitcher" text (e.g. " (W)") into one integer indicator column per
    entry of `pitching_categories`, writes the column sums of those
    indicators into the last row, strips the decision and any three-period
    ellipsis from the pitcher names, and makes "Pitcher" the (unnamed) index.
    The input frame is not modified.

    Args:
        boxscore_raw: Frame with at least "Pitcher" and "ERA" columns and at
            least one row. The last row receives the indicator totals
            (presumably it is the totals row — TODO confirm).

    Returns:
        A tuple `(boxscore, abbreviated_names)`: the cleaned DataFrame, and a
        Series holding the text before the ellipsis for every pitcher name
        that ended in "...", keyed by the original row labels.
    """
    boxscore = boxscore_raw.copy()
    boxscore = boxscore.drop(columns="ERA")

    # Generate extended pitching stats from the game decisions in parentheses.
    parentheses_groups = boxscore["Pitcher"].str.extract(pitching_stat_re.pattern)
    # dtype=int: pandas >= 2.0 returns bool indicator columns by default, and
    # writing the integer totals row into bool columns is an incompatible-dtype
    # assignment. Integer indicators keep every stat column numeric.
    extended_stats = pd.get_dummies(parentheses_groups["stat"], dtype=int)

    # Add in the game decisions that were not awarded in this game as columns of zeros.
    for cat in pitching_categories:
        if cat not in extended_stats.columns:
            extended_stats[cat] = 0
    # Reorder the columns so that the extended stats are in a uniform order.
    extended_stats = extended_stats[pitching_categories]

    # Tally the totals for the newly added pitch decision stats into the last row.
    extended_stats.iloc[-1] = extended_stats.sum()

    # Concatenate the extended stats onto the boxscore.
    boxscore = pd.concat([boxscore, extended_stats], axis=1)

    # Remove the parentheses game decisions.
    boxscore["Pitcher"] = boxscore["Pitcher"].str.replace(
        pitching_stat_re.pattern, "", regex=True
    )

    # Collect any names that end in an ellipsis, keeping only the text before it.
    abbreviated_names = boxscore["Pitcher"].str.extract(abbreviated_re.pattern)["name"]
    abbreviated_names = abbreviated_names[abbreviated_names.notna()]

    # Strip the ellipsis from the names kept in the boxscore itself.
    boxscore["Pitcher"] = boxscore["Pitcher"].str.replace(
        ellipsis_re.pattern, "", regex=True
    )

    boxscore = boxscore.set_index("Pitcher")
    boxscore.index.name = None

    return boxscore, abbreviated_names

In [None]:
# pitching_boxscore, abbreviated_names = clean_pitching_boxscore(
#     pitching_boxscore_raw.away
# )
# pitching_boxscore

# Build the pitching boxscore with the project's PitchingBoxscore class from a
# new HomeAwayData over tables 2 and 4 (the same tables that
# pitching_boxscore_raw was built from).
pitching = PitchingBoxscore(HomeAwayData(game_tables[2], game_tables[4]))
# Display the `home` attribute of the resulting boxscore.
pitching.boxscore.home

In [None]:
# Display the `home` attribute of the PitchingBoxscore's abbreviated names.
pitching.abbreviated_names.home

#### Game Log

In [None]:
# Take the last node matching "div.section-block" in the parsed page.
# NOTE(review): assumes the game log is always the final section — confirm.
game_log_section = game_stats_parser.css("div.section-block")[-1]
# Build the GameLog from the home batter names and that section's HTML.
game_log = GameLog(batters_home, game_log_section.html)

In [None]:
# Pretty-print the inning_stats entry stored under the "Inning 1" key.
pprint(game_log.inning_stats["Inning 1"])

In [None]:
# Display the `misc` attribute of the game log.
game_log.misc

In [None]:
# Start from the game log and rebind only the local name `log`, so the
# `game_log.log` attribute is never reassigned here.
log = game_log.log
# Pair each name to replace with its replacement, in order.
for to_replace, replacement in zip(
    game_log.batters_to_replace, game_log.batters_replacements
):
    # Print each pair so the substitutions can be checked by eye.
    print("old value:", to_replace)
    print("new value:", replacement)
    # NOTE(review): assumes `log` is a str, so replace() substitutes every
    # occurrence and returns a new value — confirm against GameLog.
    log = log.replace(to_replace, replacement)

#### To Do:
##### BattingBoxscore
- generate `batters_to_replace` (home and away)
- generate `batters_replacements` (home and away)
- create `batters_to_replace` property with read access
- create `batters_replacements` property with read access
- create function to get the `abbreviated_names`


##### PitchingBoxscore
- generate `pitchers_to_replace` (home and away)
- generate `pitchers_replacements` (home and away)
- create `pitchers_to_replace` property with read access
- create `pitchers_replacements` property with read access
- create function to get the `abbreviated_names`

##### GameLog
- create function that takes `to_replace` and `replacements` Series and does the replacement in the game log
- create function that takes in `abbreviated_names` and finds and returns the full names (to be used by the Boxscores)
- create function that generates the extended hitting stats from the game log

In [None]:
# First 350 items of the log as stored on the GameLog object.
game_log.log[:350]

In [None]:
# First 350 items of `log`, the copy the replacement loop rebinds.
log[:350]

In [None]:
# Display `log` in full, without slicing.
log