In [None]:
"""
sets.ipynb

Jupyter Notebook to perform analysis on card sets.

Author: Jordan Bourdeau, Casey Forey
Date Created: 4/7/24
"""

In [None]:
# Imports
from importlib import reload
import json
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.ensemble import IsolationForest
from statsmodels.stats.proportion import proportions_ztest

from src import constants as c
from src.calculate import calculate_market_data as cmd
from src.calculate import calculate_set_data as csd
from src.load import load_card_data as lcd
from src.load import load_set_data as lsd
from src.load import load_utils
from src.plot import plot_set_data as psd
from src.plot import plot_utils

In [None]:
# Re-import the loader module so on-disk edits are picked up without restarting the kernel.
reload(load_utils)

# Build the path to the AllPrintings file inside the data directory, then load it
# through load_utils.load_json_data.
all_printings_filename = 'AllPrintings.json'
all_printings_filepath: str = os.path.join(
    c.DATA_DIRECTORY,
    all_printings_filename,
)
all_printings: dict = load_utils.load_json_data(all_printings_filepath)

In [None]:
# Re-import the set loader so on-disk edits are picked up without restarting the kernel.
reload(lsd)

sets = lsd.load_set_and_release_year()

# Show only the row(s) whose set code is 'LTR' as the cell output.
is_ltr_set = sets['set_code'] == 'LTR'
sets.loc[is_ltr_set]

In [None]:
# Hand the first rows of the set table to plot_dataframe_as_table together with
# the PNG path inside the image directory.
original_set_table_path = os.path.join(c.IMAGE_DIRECTORY, 'original_set_list_df.png')
plot_utils.plot_dataframe_as_table(sets.head(), original_set_table_path)

In [None]:
# Re-import the set loader so on-disk edits are picked up without restarting the kernel.
reload(lsd)

# Call save_format_set_ban_counts on the loaded AllPrintings data for the 'modern' format.
# NOTE(review): presumably this writes the per-set ban counts to disk — confirm in src.load.load_set_data.
lsd.save_format_set_ban_counts(all_printings, 'modern')

In [None]:
# Re-import the card loader so on-disk edits are picked up without restarting the kernel.
reload(lcd)

# Call load_first_card_printing_in_format for the 'modern' format on the AllPrintings data.
df = lcd.load_first_card_printing_in_format('modern', all_printings)
# Bare expression: shown as the cell output.
df

In [None]:
# Re-import the set loader so on-disk edits are picked up without restarting the kernel.
reload(lsd)

# Augmenting set data with tournament data and number of banned cards
set_card_usages_and_bans: pd.DataFrame = lsd.load_augmented_set_data(all_printings, 'modern')

In [None]:
# Hand the first rows of the augmented set data to plot_dataframe_as_table together
# with the PNG path inside the image directory.
augmented_set_table_path = os.path.join(c.IMAGE_DIRECTORY, 'augmented_set_df.png')
plot_utils.plot_dataframe_as_table(set_card_usages_and_bans.head(), augmented_set_table_path)

In [None]:
# Re-import the calculation and plotting helpers so on-disk edits are picked up
# without restarting the kernel.
reload(csd)
reload(plot_utils)

# Seed NumPy's global RNG so re-running this cell is repeatable.
# NOTE(review): presumably find_set_outliers draws from the global RNG — confirm.
np.random.seed(0)

# First release year counted as "after the fire design principle" in this analysis.
FIRE_DESIGN_START_YEAR: int = 2019

# Selecting numerical columns to train the model on
columns_for_model = ['total_count', 'num_banned', 'mean_price', 'median_price', 'std_price']
# Not passed to find_set_outliers (it receives the full frame plus the column names);
# retained in case other cells reference it.
data_for_model = set_card_usages_and_bans[columns_for_model]

outliers = csd.find_set_outliers(set_card_usages_and_bans, columns_for_model)

num_outliers: int = len(outliers)
print(f'Found {num_outliers} outliers')
outliers_after_fire_design: pd.DataFrame = outliers[outliers['release_year'] >= FIRE_DESIGN_START_YEAR]
if num_outliers > 0:
    print(f'{(len(outliers_after_fire_design) / num_outliers) * 100:.2f}% of outlier sets came after the fire design principle.')
else:
    # Dividing by len(outliers) would raise ZeroDivisionError when nothing was flagged.
    print('No outlier sets found; skipping the fire design percentage.')

# Drop the columns left out of the rendered outlier table before plotting it.
trimmed_outliers: pd.DataFrame = outliers.drop(columns=['set_name', 'median_price', 'release_month'])
plot_utils.plot_dataframe_as_table(trimmed_outliers, os.path.join(c.IMAGE_DIRECTORY, 'outliers.png'))

In [None]:
# Re-import the set plotting module so on-disk edits are picked up without restarting the kernel.
reload(psd)

# Pass the detected outlier sets and the 'modern' format name to plot_outlier_distribution.
psd.plot_outlier_distribution(outliers, 'modern')

In [None]:
# Re-import the set plotting module so on-disk edits are picked up without restarting the kernel.
reload(psd)

# Pass the augmented set data to plot_set_table.
psd.plot_set_table(set_card_usages_and_bans)

In [None]:
# One-sample proportion z-test on the outliers: is the share of outlier sets released
# in 2019 or later larger than the share of all sets released in 2019 or later?
# NOTE: the "after 2019" names and labels below include 2019 itself (the filter is >= 2019).

# Number of sets per release year.
num_sets_per_year_df: pd.DataFrame = (
    set_card_usages_and_bans
    .groupby(['release_year'])
    .size()
    .reset_index(name='count')
)

set_year_from_2019 = num_sets_per_year_df['release_year'] >= 2019
set_year_pre_2019 = num_sets_per_year_df['release_year'] < 2019
sets_after_2019: int = num_sets_per_year_df.loc[set_year_from_2019, 'count'].sum()
sets_before_2019: int = num_sets_per_year_df.loc[set_year_pre_2019, 'count'].sum()

print(f'Number of sets before 2019: {sets_before_2019}')
print(f'Number of sets after 2019: {sets_after_2019}')

# Number of outlier sets per release year.
outliers_grouped_by_year: pd.DataFrame = (
    outliers
    .groupby(['release_year'])
    .size()
    .reset_index(name='count')
)

outlier_year_from_2019 = outliers_grouped_by_year['release_year'] >= 2019
outlier_year_pre_2019 = outliers_grouped_by_year['release_year'] < 2019
outliers_after_2019: int = outliers_grouped_by_year.loc[outlier_year_from_2019, 'count'].sum()
outliers_before_2019: int = outliers_grouped_by_year.loc[outlier_year_pre_2019, 'count'].sum()

print(f'Number of outliers before 2019: {outliers_before_2019}')
print(f'Number of outliers after 2019: {outliers_after_2019}')

# Null-hypothesis proportion: share of all sets released in 2019 or later.
total_sets = sets_after_2019 + sets_before_2019
expected_proportion: float = sets_after_2019 / total_sets

# Observed proportion: share of outlier sets released in 2019 or later.
total_outliers = outliers_after_2019 + outliers_before_2019
actual_proportion: float = outliers_after_2019 / total_outliers

print(f'Expected Proportion of Outliers after 2019: {expected_proportion * 100:.2f}%')
print(f'Actual Proportion of Outliers after 2019: {actual_proportion * 100:.2f}%')

# Perform z-test (one-sided: observed proportion larger than the expected one)
count: int = outliers_after_2019
nobs: int = total_outliers
stat, pval = proportions_ztest(
    count,
    nobs,
    value=expected_proportion,
    alternative='larger',
)

# Output the results
print("Test Statistic (z-score):", stat)
print("P-value:", pval)

In [None]:
# Display the per-release-year set counts used by the z-test.
num_sets_per_year_df

In [None]:
# Alternate test without LCI point included
# One-sample proportion z-test on the outliers, repeated with one outlier (LCI) removed
# from the sample to check how much the result depends on that single point.
# NOTE: the "after 2019" names and labels below include 2019 itself (the filter is >= 2019).

# Number of sets per release year.
num_sets_per_year_df: pd.DataFrame = (
    set_card_usages_and_bans
    .groupby(['release_year'])
    .size()
    .reset_index(name='count')
)

sets_after_2019: int = num_sets_per_year_df[num_sets_per_year_df['release_year'] >= 2019]['count'].sum()
sets_before_2019: int = num_sets_per_year_df[num_sets_per_year_df['release_year'] < 2019]['count'].sum()

print(f'Number of sets before 2019: {sets_before_2019}')
print(f'Number of sets after 2019: {sets_after_2019}')

# Number of outlier sets per release year.
outliers_grouped_by_year: pd.DataFrame = (
    outliers
    .groupby(['release_year'])
    .size()
    .reset_index(name='count')
)

# Raw outlier counts; these still include the LCI point.
outliers_after_2019: int = outliers_grouped_by_year[outliers_grouped_by_year['release_year'] >= 2019]['count'].sum()
outliers_before_2019: int = outliers_grouped_by_year[outliers_grouped_by_year['release_year'] < 2019]['count'].sum()

print(f'Number of outliers before 2019: {outliers_before_2019}')
print(f'Number of outliers after 2019: {outliers_after_2019}')

# Number of outliers removed from the sample for this alternate test (the single LCI point).
# NOTE(review): the subtraction assumes LCI is among `outliers` with release_year >= 2019 — confirm.
num_excluded_outliers: int = 1

# Sample actually tested: the outliers with the LCI point removed.
count: int = outliers_after_2019 - num_excluded_outliers
nobs: int = outliers_after_2019 + outliers_before_2019 - num_excluded_outliers

# Null-hypothesis proportion: share of all sets released in 2019 or later.
expected_proportion: float = sets_after_2019 / (sets_after_2019 + sets_before_2019)
# Derived from count / nobs so the printed proportion describes the tested sample
# (LCI excluded) rather than the full outlier list.
actual_proportion: float = count / nobs

print(f'Expected Proportion of Outliers after 2019: {expected_proportion * 100:.2f}%')
print(f'Actual Proportion of Outliers after 2019: {actual_proportion * 100:.2f}%')

# Perform z-test (one-sided: observed proportion larger than the expected one)
stat, pval = proportions_ztest(count, nobs, value=expected_proportion, alternative='larger')

# Output the results
print("Test Statistic (z-score):", stat)
print("P-value:", pval)