# Imports

In [1]:
# Core
import os
import sys
import ast
import re
import datetime as dt
from typing import List, Dict, Tuple, Any, Optional, Union
import logging
from dataclasses import dataclass, field

# Data analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
import seaborn as sns

# Fitting distributions
import scipy.stats
from fitter import Fitter

from distribution_fitter import DistributionFitter
from distribution_validator import DistributionValidator
from utils import *
import json

ImportError: attempted relative import with no known parent package

In [None]:
# Emit INFO-level records from the root logger so progress messages are visible.
logging.basicConfig(level=logging.INFO)

# Notebook-level logger: the decile-fitting cell further down calls
# `logger.info(...)`, and no other visible cell defines `logger`.
logger = logging.getLogger(__name__)

# Reading the data

In [None]:
# Helper function to read all the historic dataframes from a path
def read_files(path: str) -> pd.DataFrame:
    """Read every ``.csv`` file directly inside ``path`` and stack them.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        The row-wise concatenation of all CSV files, read in sorted
        file-name order so the result is reproducible across runs. The
        original per-file row index is kept (no ``ignore_index``).

    Raises:
        ValueError: If ``path`` contains no file whose name ends in ``.csv``.
    """
    # Match on the real extension: the previous regex ".csv" treated the dot
    # as a wildcard and was unanchored, so a name like "notacsv.txt" matched.
    csv_paths = sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.endswith(".csv")
    )
    if not csv_paths:
        raise ValueError(f"No .csv files found in {path!r}")
    return pd.concat([pd.read_csv(csv_path, low_memory=False) for csv_path in csv_paths])

In [None]:
mlb_df = read_files("../data/MLB/")

In [None]:
mlb_df.columns

In [None]:
mlb_df.dtypes

# Cleaning data

In [None]:
mlb_df.game_info__bigdataball_dataset.value_counts()

In [None]:
def with_season_type(game_type: str) -> str:
    """Collapse a raw dataset label into a season-type tag.

    Returns ``"regular-season"`` when ``game_type`` contains the
    case-sensitive substring ``"Season"``, and ``"playoffs"`` otherwise.
    """
    if re.search("Season", game_type) is None:
        return "playoffs"
    return "regular-season"

In [None]:
def format_date(date_str: str) -> dt.datetime:
    """Parse a slash-separated date string into a ``datetime``.

    The day-first layout (``%d/%m/%Y``) is tried first; only if that raises
    ``ValueError`` is the month-first layout (``%m/%d/%Y``) used. A string
    that is valid under both layouts is therefore read as day-first.

    Raises:
        ValueError: If the string matches neither layout.
    """
    try:
        return dt.datetime.strptime(date_str, "%d/%m/%Y")
    except ValueError:
        # Not a valid day-first date: fall back to month-first.
        return dt.datetime.strptime(date_str, "%m/%d/%Y")

In [None]:
# Build the cleaned frame from the raw one without mutating `mlb_df`:
#   * dates are parsed with `format_date`,
#   * each fantasy-points column is stringified, has "," swapped for "." and is cast to float,
#   * `game_info__game_type` is derived from the dataset label via `with_season_type`,
#   * `year` is taken from the parsed date.
cleaned_mlb_df = (
    mlb_df
    .assign(
        game_info__date=pd.to_datetime(mlb_df["game_info__date"].apply(format_date)),
        **{
            points_column: mlb_df[points_column].apply(
                lambda raw: float(str(raw).replace(",", "."))
            )
            for points_column in (
                "fantasy_points__draftkings",
                "fantasy_points__fanduel",
                "fantasy_points__yahoo",
            )
        },
        game_info__game_type=mlb_df["game_info__bigdataball_dataset"].apply(with_season_type),
    )
    .pipe(lambda df: df.assign(year=df["game_info__date"].dt.year))
)

# Fitting distributions

## By player

In [None]:
def get_frequencies(df: pd.DataFrame, groupby_keys: List[str]) -> pd.DataFrame:
    """Count rows per combination of ``groupby_keys``.

    Args:
        df: Frame to summarise.
        groupby_keys: Column names to group on.

    Returns:
        A frame with the grouping columns plus a ``frequency`` column holding
        the group size, ordered from most to least frequent.
    """
    return (
        df.groupby(groupby_keys)
        .size()
        .reset_index(name="frequency")
        .sort_values(by=["frequency"], ascending=False)
    )

In [None]:
def plot_distribution_by_variable(df: pd.DataFrame, variable: str, xlabel: str, ylabel: str) -> None:
    """Draw a histogram and a box/strip plot of one column side by side.

    Args:
        df: Frame holding the column to plot.
        variable: Name of the column plotted on the x axis of both panels.
        xlabel: X-axis label for both panels; also used in the figure title.
        ylabel: Y-axis label of the histogram; also used in the figure title.
    """
    fig, ax = plt.subplots(ncols=2, figsize=(20, 6))

    # Left panel: histogram with a KDE overlay.
    sns.histplot(data=df, x=variable, kde=True, ax=ax[0])
    ax[0].set_ylabel(ylabel)
    ax[0].set_xlabel(xlabel)

    # Right panel: box plot with the individual points drawn on top.
    sns.boxplot(data=df, x=variable, orient='h', ax=ax[1])
    sns.stripplot(data=df, x=variable, orient='h', color=".25", ax=ax[1])

    # The y axis of the right panel carries no information, so hide its ticks.
    ax[1].tick_params(axis='y', which='both', left=False, top=False, labelleft=False)
    ax[1].set_xlabel(xlabel)

    fig.suptitle(f'Distribution of: {xlabel.lower()} by {ylabel.lower()}')

    # Render the figure. The previous `plt.plot()` (no arguments) drew nothing.
    plt.show()

In [None]:
plot_distribution_by_variable(get_frequencies(cleaned_mlb_df, ["player_info__player_id"]), "frequency", "Player - Appearances", "Frequency")

In [None]:
plot_distribution_by_variable(get_frequencies(cleaned_mlb_df, ["player_info__player_name"]), "frequency", "Player - Appearances", "Frequency")

In [None]:
# Number of distinct games per calendar date, split by game type.
# Rows are de-duplicated on (game type, game id, date) first, so each game is counted once.
games_per_date = (
    cleaned_mlb_df[["game_info__game_type", "game_info__game_id", "game_info__date"]]
    .drop_duplicates()
    .groupby(["game_info__date", "game_info__game_type"])
    .size()
    .reset_index(name='frequency')
    .rename(columns={"game_info__date": "date", "game_info__game_type": "game_type"})
)

fig, ax = plt.subplots(figsize=(20, 6))

sns.lineplot(
    data=games_per_date,
    x="date",
    y="frequency",
    hue="game_type",
    ax=ax
)

# Label the figure so it can be read on its own.
ax.set_title("Games per date by game type")
ax.set_xlabel("Date")
ax.set_ylabel("Number of games")

# Render the figure. The previous `plt.plot()` (no arguments) drew nothing.
plt.show()

In [None]:
def plot_histogram(
    df: pd.DataFrame, 
    filter_column: str, 
    factor: Any, 
    value: str,
    kde: bool = False
) -> Figure:
    """Plot a 100-bin histogram of ``value`` for the rows where ``filter_column == factor``.

    Args:
        df: Source frame.
        filter_column: Column compared against ``factor`` to select rows.
        factor: Value that ``filter_column`` must equal.
        value: Column whose distribution is plotted; rows where it is missing are dropped.
        kde: Whether to overlay a kernel density estimate.

    Returns:
        The matplotlib figure containing the histogram.
    """
    fig, ax = plt.subplots(ncols=1, figsize=(16, 8))

    selected_rows = df[(df[filter_column] == factor) & (~df[value].isna())]

    # Draw on the axes created above explicitly instead of relying on
    # pyplot's implicit "current axes".
    sns.histplot(
        data=selected_rows,
        x=value,
        kde=kde,
        bins=100,
        ax=ax
    )
    ax.set_ylabel('Frequency')
    # Column names use "__" as a separator; show it as " - " on the axis.
    ax.set_xlabel(" - ".join(value.split("__")))

    fig.suptitle(f'Distribution of: {value} by factor: {factor}')

    return fig

In [None]:
# NOTE(review): this selects rows by display name, so rows belonging to distinct players who share a
# name would be pooled into one histogram. Presumably `player_info__player_id` is the unique key — TODO confirm
# and filter on the id if names are not unique in this dataset.
fig = plot_histogram(cleaned_mlb_df, "player_info__player_name", "Will Smith", "fantasy_points__draftkings", True)

## Position

In [None]:
get_frequencies(cleaned_mlb_df, ["position__fanduel"])

In [None]:
fig = plot_histogram(cleaned_mlb_df, "position__fanduel", "P", "fantasy_points__fanduel", True)

In [None]:
fig = plot_histogram(cleaned_mlb_df, "position__fanduel", "OF", "fantasy_points__fanduel", True)

# Distribution fitting process

## Fanduel for Pitchers

In [None]:
cleaned_mlb_df.head()

In [None]:
# Keep the rows whose DraftKings position is "P" and that have a FanDuel score.
# NOTE(review): this section models FanDuel points, yet the position filter uses
# `position__draftkings` rather than `position__fanduel` — confirm this is intended.
pitchers_df = cleaned_mlb_df.loc[
    cleaned_mlb_df["position__draftkings"].eq("P")
    & cleaned_mlb_df["fantasy_points__fanduel"].notna()
]

In [None]:
# White-grid seaborn theme for the figures drawn from here on.
sns.set(style="whitegrid")

# Two density-scaled step histograms of the pitchers' FanDuel points:
# the left one is split by game type, the right one pools every row.
fig, ax = plt.subplots(ncols=2, figsize=(20, 6))

for panel, hue_column, panel_title in (
    (ax[0], 'game_info__game_type', "Histogram of Points over game types"),
    (ax[1], None, 'Histogram of Points'),
):
    sns.histplot(
        data=pitchers_df,
        x='fantasy_points__fanduel',
        hue=hue_column,
        element='step',
        stat='density',
        common_norm=False,  # normalise each hue group separately so they stay comparable
        ax=panel,
    )
    panel.set_title(panel_title)

plt.show()

### Adding more variables to categorize players instead of assuming a single general distribution

In [None]:
# One row per pitcher: the number of observations, the running total of
# observations (pitchers ordered from fewest to most), the grand total, the
# running share of all observations in percent, and the 10-point-wide
# interval that share falls into (stored as a string such as "(0, 10]").
statistics_by_pitcher = (
    pitchers_df
    .groupby('player_info__player_id')
    .agg(number_of_observations=pd.NamedAgg("player_info__player_id", "count"))
    .sort_values(by=['number_of_observations'])
    .pipe(
        lambda counts: counts.assign(
            cumsum_observations=counts.number_of_observations.cumsum(),
            total_observations=counts.number_of_observations.sum(),
        )
    )
    .pipe(
        lambda totals: totals.assign(
            cumulative_proportion=(totals.cumsum_observations / totals.total_observations) * 100
        )
    )
    .reset_index()
    .pipe(
        lambda shares: shares.assign(
            decile_group=pd.cut(
                shares.cumulative_proportion,
                bins=list(range(0, 101, 10)),
            ).astype("str")
        )
    )
)

In [None]:
# Attach the per-pitcher statistics to every pitcher row. No `on`/`how` is
# given, so pandas joins on the columns the two frames share (inner join).
pitchers_df = pd.merge(left=pitchers_df, right=statistics_by_pitcher)

In [None]:
pitchers_df.head()

In [None]:
# FanDuel points by decile group, drawn twice: as KDE curves (top) and as
# density-scaled step histograms (bottom).
fig, ax = plt.subplots(nrows=2, figsize=(20, 12))

sns.kdeplot(
    data=pitchers_df, 
    x='fantasy_points__fanduel', 
    hue='decile_group', 
    common_norm=False,  # normalise each decile group separately
    ax=ax[0],
)

ax[0].set_title('Distribution of Fanduel points by Decile of appearances')

sns.histplot(
    data=pitchers_df, 
    x='fantasy_points__fanduel', 
    element='step', 
    stat='density', 
    hue='decile_group', 
    common_norm=False,  # normalise each decile group separately
    ax=ax[1]
)

ax[1].set_title('Histogram of Fanduel points by Decile of appearances')

plt.show()

### Fitting a general distribution for all the population of pitchers

#### Fitting

In [None]:
data = pitchers_df['fantasy_points__fanduel']

In [None]:
# Fitter for the pooled pitcher data. Candidates are everything returned by
# `get_distributions()` except the five names excluded below; the bin count
# comes from `sturges_bins(data)`.
pitchers_distribution_fitter = DistributionFitter(
    distributions=[
        candidate
        for candidate in get_distributions()
        if candidate not in ["levy_stable", "studentized_range", "erlang", "lognorm", "loguniform"]
    ],
    bins=sturges_bins(data),
)

In [None]:
pitchers_distribution_fitter.fit(data)

In [None]:
pitchers_distribution_fitter.summary(sort_by="ks_statistic", top_n=10)

#### Validation

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=pitchers_distribution_fitter, 
    distribution_name="johnsonsu", 
    sample_proportion=0.02, 
    suptitle="Goodness of Fit for overall data of pitchers - johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=pitchers_distribution_fitter, 
    distribution_name="norminvgauss", 
    sample_proportion=0.02, 
    suptitle="Goodness of Fit for overall data of pitchers - norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=pitchers_distribution_fitter, 
    distribution_name="genhyperbolic", 
    sample_proportion=0.02, 
    suptitle="Goodness of Fit for overall data of pitchers - genhyperbolic distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=pitchers_distribution_fitter, 
    distribution_name="exponnorm", 
    sample_proportion=0.02, 
    suptitle="Goodness of Fit for overall data of pitchers - exponnorm distribution"
)

### Fitting a distribution by decile

#### Fitting

In [None]:
deciles_groups = pitchers_df.decile_group.unique()

# The candidate list does not depend on the decile, so build it once
# instead of once per loop iteration.
candidate_distributions = [
    dist for dist in get_distributions()
    if dist not in ["levy_stable", "studentized_range", "erlang", "lognorm", "loguniform"]
]

# Maps each decile-group label (e.g. "(0, 10]") to its fitted DistributionFitter.
fitters = {}

for decile_group in deciles_groups:
    # Log before fitting so the message shows which group is in progress;
    # it used to be emitted only after the fit had already finished.
    logger.info(f"fitting {decile_group}")

    filtered_data = pitchers_df.loc[
        pitchers_df.decile_group == decile_group, 'fantasy_points__fanduel'
    ]

    fitter = DistributionFitter(
        # Hand each fitter its own copy of the list so fitters never share it.
        distributions=list(candidate_distributions),
        bins=sturges_bins(filtered_data)
    )

    fitter.fit(filtered_data)
    fitters[decile_group] = fitter

#### Validation

##### **Decile 10%**

In [None]:
fitters['(0, 10]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(0, 10]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 10% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(0, 10]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 10% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(0, 10]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 10% with exponnorm distribution"
)

##### **Decile 20%**

In [None]:
fitters['(10, 20]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(10, 20]'], 
    distribution_name="genhyperbolic", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 20% with genhyperbolic distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(10, 20]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 20% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(10, 20]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 20% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(10, 20]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 20% with exponnorm distribution"
)

##### **Decile 30%**

In [None]:
fitters['(20, 30]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(20, 30]'], 
    distribution_name="genhyperbolic", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 30% with genhyperbolic distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(20, 30]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 30% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(20, 30]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 30% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(20, 30]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 30% with exponnorm distribution"
)

##### **Decile 40%**

In [None]:
fitters['(30, 40]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(30, 40]'], 
    distribution_name="genhyperbolic", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 40% with genhyperbolic distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(30, 40]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 40% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(30, 40]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 40% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(30, 40]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 40% with exponnorm distribution"
)

##### **Decile 50%**

In [None]:
fitters['(40, 50]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(40, 50]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 50% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(40, 50]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 50% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(40, 50]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 50% with exponnorm distribution"
)

##### **Decile 60%**

In [None]:
fitters['(50, 60]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(50, 60]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 60% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(50, 60]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 60% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(50, 60]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 60% with exponnorm distribution"
)

##### **Decile 70%**

In [None]:
fitters['(60, 70]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(60, 70]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 70% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(60, 70]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 70% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(60, 70]'], 
    distribution_name="genhyperbolic", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 70% with genhyperbolic distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(60, 70]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 70% with exponnorm distribution"
)

##### **Decile 80%**

In [None]:
fitters['(70, 80]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(70, 80]'], 
    distribution_name="genhyperbolic", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 80% with genhyperbolic distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(70, 80]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 80% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(70, 80]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 80% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(70, 80]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 80% with exponnorm distribution"
)

##### **Decile 90%**

In [None]:
fitters['(80, 90]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(80, 90]'], 
    distribution_name="norminvgauss", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 90% with norminvgauss distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(80, 90]'], 
    distribution_name="johnsonsu", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 90% with johnsonsu distribution"
)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(80, 90]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 90% with exponnorm distribution"
)

##### **Decile 100%**

In [None]:
fitters['(90, 100]'].summary(sort_by="ks_statistic", top_n=10)

In [None]:
DistributionValidator().validate_goodness_of_fit(
    distribution_fitter=fitters['(90, 100]'], 
    distribution_name="exponnorm", 
    sample_proportion=0.1, 
    suptitle="Goodness of Fit for overall data of pitchers - Decile 100% with exponnorm distribution"
)

#### Pitchers dataframe for distributions

In [None]:
# One row per pitcher: rank each pitcher's rows by game date, newest first
# (ties broken by order of appearance), keep rank 1 and retain only the
# player id and its decile group.
pitchers_distributions_df = (
    pitchers_df
    .assign(
        row_number=(
            pitchers_df
            .groupby('player_info__player_id')['game_info__date']
            .rank(method='first', ascending=False)
            .astype(int)
        )
    )
    .query("row_number == 1")
    .loc[:, ["player_info__player_id", "decile_group"]]
)

In [None]:
# Distribution selected for each decile group.
decile_distribution_choices = {
    "(0, 10]": "exponnorm",
    "(10, 20]": "exponnorm",
    "(20, 30]": "exponnorm",
    "(30, 40]": "exponnorm",
    "(40, 50]": "exponnorm",
    "(50, 60]": "exponnorm",
    "(60, 70]": "genhyperbolic",
    "(70, 80]": "genhyperbolic",
    "(80, 90]": "norminvgauss",
    "(90, 100]": "exponnorm",
}

# One row per decile group. The parameters are fetched for the SAME
# distribution that the row names: previously the "genhyperbolic" and
# "norminvgauss" rows stored the parameters of "exponnorm".
deciles_distributions = pd.DataFrame(
    [
        {
            "decile_group": decile_group,
            "decile_distribution": distribution_name,
            "decile_parameters": str(
                fitters[decile_group].get_distribution_parameters(distribution_name)
            ),
        }
        for decile_group, distribution_name in decile_distribution_choices.items()
    ]
)

In [None]:
# Add the pooled ("general") exponnorm fit to every pitcher row, then join the
# per-decile distribution table. No `on`/`how` is given, so pandas joins on
# the columns the two frames share (inner join).
pitchers_distributions_df = pd.merge(
    left=pitchers_distributions_df.assign(
        general_distribution="exponnorm",
        general_parameters=str(pitchers_distribution_fitter.get_distribution_parameters("exponnorm")),
    ),
    right=deciles_distributions,
)

In [None]:
pitchers_distributions_df.size