In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.diagnostic as smd
import pickle
import seaborn as sns

In [None]:
# Seasons iterated over when reading final_stats: 2001 through 2023 inclusive.
years = [*range(2001, 2024)]
# Longer season range starting in 1996.
# NOTE(review): presumably the seasons covered by the contract data - confirm.
years_contracts = [*range(1996, 2024)]

In [None]:
# Load the pickled per-season statistics. Later cells index `final_stats` by
# season year (e.g. final_stats[2001]) and treat each entry as a DataFrame.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('./data/final_stats.pickle', 'rb') as statistics:
    final_stats = pickle.load(statistics)

In [None]:
# Summary statistics of minutes_played for the 2001 season (quick sanity check of the loaded data).
final_stats[2001]["minutes_played"].describe()

In [None]:
# Load the pickled, cleaned player-contract data.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# NOTE(review): `players_contracts` is not referenced by any of the cells shown after this one.
with open('./data/cleaned_data/players_contracts.pickle', 'rb') as play_contracts:
    players_contracts = pickle.load(play_contracts)

In [None]:
# Column layout of the contract-year table, grouped by theme.
stats_columns = [
    # identification and contract information
    "ID", "salary_perc", "last_year_of_contract", "age",
    # playing time
    "games_played_perc", "games_started_perc", "minutes_played", "avg_minutes_played",
    # performance metrics and team results
    "WS48", "PER", "team_successes",
    # individual awards
    "defensive", "most_improved", "most_valuable", "most_valuable_finals",
    "sixth_man", "all_league", "all_def",
    # season the row belongs to
    "season",
]

# Empty frame with that layout; the following cell fills it season by season.
last_year_of_contracts = pd.DataFrame(columns=stats_columns)

In [None]:
# Gather every player-season flagged as the last year of a contract.
# The final season in `years` is left out because the model-building cell
# further down looks up final_stats[season + 1] for each row.
#
# The per-season frames are collected in a list and concatenated once, which
# avoids re-copying the accumulated frame on every iteration and makes the
# cell idempotent (re-running it rebuilds the table instead of appending
# duplicates).
season_frames = []
for year in years[:-1]:
    contracts = final_stats[year].query("last_year_of_contract == True").copy()
    contracts["season"] = year
    # reset_index() turns the frame's index into a regular column.
    contracts = contracts.reset_index()
    season_frames.append(contracts)

if season_frames:
    combined = pd.concat(season_frames, ignore_index=True)
    # Keep the declared stats_columns first (any that are missing are added
    # as NaN), followed by whatever extra columns the season frames carry.
    extra_columns = [col for col in combined.columns if col not in stats_columns]
    last_year_of_contracts = combined.reindex(columns=stats_columns + extra_columns)
else:
    # No seasons to process: fall back to the empty, correctly-shaped frame.
    last_year_of_contracts = pd.DataFrame(columns=stats_columns)

In [None]:
# Keep only rows with more than 96 minutes played, then renumber the rows from 0.
last_year_of_contracts = (
    last_year_of_contracts
    .loc[lambda frame: frame["minutes_played"] > 96]
    .reset_index(drop=True)
)

In [None]:
# Display the filtered contract-year table for inspection.
last_year_of_contracts

In [None]:
# Variables of the regression data set: the dependent variable first,
# followed by the candidate regressors.
model_columns = [
    "salary_perc",
    "age",
    "games_played_perc",
    "games_started_perc",
    "minutes_played",
    "avg_minutes_played",
    "WS48",
    "PER",
    "team_successes",
    "player_successes",
]

# Empty frame with those columns; rows are added by the following cell.
model_data = pd.DataFrame(columns=model_columns)

In [None]:
# Build the regression data set: one row per contract-year player-season,
# pairing that season's statistics with the player's `salary_perc` from the
# FOLLOWING season (final_stats[season + 1]).
#
# Fixes relative to the previous version of this cell:
#   * WS48 and PER were read into swapped variables and stored positionally,
#     so each column held the other metric. Rows are now built as dicts keyed
#     by column name, which rules out positional mix-ups.
#   * A bare `except: pass` hid every error. Only the two expected "no salary
#     next season" cases are skipped now, and they are counted.
#   * The frame is built once from a list of records instead of row-by-row
#     `.loc` appends, so re-running the cell does not duplicate rows.

# Points awarded per individual honour when computing player_successes.
award_weights = {
    "defensive": 2,
    "most_improved": 1,
    "most_valuable": 5,
    "most_valuable_finals": 4,
    "sixth_man": 2,
    "all_league": 2,
    "all_def": 1,
}

records = []
skipped_rows = 0
for _, row in last_year_of_contracts.iterrows():
    row_id = row["ID"]
    season = row["season"]
    try:
        # Dependent variable: the player's salary_perc in the next season.
        salary_perc = final_stats[season + 1].query(f'ID == "{row_id}"')["salary_perc"].iloc[0]
    except (KeyError, IndexError):
        # KeyError: no statistics table for season + 1.
        # IndexError: the player has no row in the next season's table.
        skipped_rows += 1
        continue

    success_points = sum(row[award] * weight for award, weight in award_weights.items())

    records.append({
        "salary_perc": salary_perc,
        "age": row["age"],
        "games_played_perc": row["games_played_perc"],
        "games_started_perc": row["games_started_perc"],
        "minutes_played": row["minutes_played"],
        "avg_minutes_played": row["avg_minutes_played"],
        "WS48": row["WS48"],
        "PER": row["PER"],
        "team_successes": row["team_successes"],
        "player_successes": success_points,
    })

model_data = pd.DataFrame(records, columns=model_columns)
print(f"model_data: {len(model_data)} rows kept, {skipped_rows} skipped (no salary_perc found for the following season)")

In [None]:
# Display the assembled regression data set for inspection.
model_data

In [None]:
# Histogram of log(age), the transformation of age used in the regressions below.
x_val = model_data["age"].copy()
x_val_log = x_val.apply(lambda x: np.log(x))
plt.hist(x_val_log, color='grey')
plt.ylabel('Counts')
# Label the axis with the quantity actually plotted (the log, not the raw age).
plt.xlabel('log(age)')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Histogram of log2(games_played_perc), the transformation used in the regressions below.
x_val = model_data["games_played_perc"].copy()
x_val_log = x_val.apply(lambda x: np.log2(x))
plt.hist(x_val_log, color='grey')
plt.ylabel('Counts')
# Label the axis with the quantity actually plotted.
plt.xlabel('log2(games_played_perc)')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Histogram of sqrt(games_started_perc), the transformation used in the regressions below.
x_val = model_data["games_started_perc"].copy()
x_val_sqrt = x_val.apply(lambda x: np.sqrt(x))
plt.hist(x_val_sqrt, color='grey')
plt.ylabel('Counts')
# Label the axis with the quantity actually plotted.
plt.xlabel('sqrt(games_started_perc)')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Summary statistics of the WS48 column of the regression data set.
model_data["WS48"].describe()

In [None]:
# Histogram of the untransformed WS48 column.
x_val = model_data["WS48"].copy()
plt.hist(x_val, color='grey')
plt.ylabel('Counts')
plt.xlabel('WS48')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Summary statistics of the PER column of the regression data set.
model_data["PER"].describe()

In [None]:
# Histogram of the untransformed PER column.
x_val = model_data["PER"].copy()
plt.hist(x_val, color='grey')
plt.ylabel('Counts')
plt.xlabel('PER')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Histogram of team_successes cubed, the transformation used in the regressions below.
x_val = model_data["team_successes"].copy()
x_val_pow = x_val.apply(lambda x: np.power(x, 3))
plt.hist(x_val_pow, color='grey')
plt.ylabel('Counts')
# Label the axis with the quantity actually plotted.
plt.xlabel('team_successes ** 3')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Histogram of player_successes cubed, the transformation used in the regressions below.
x_val = model_data["player_successes"].copy()
x_val_pow = x_val.apply(lambda x: np.power(x, 3))
plt.hist(x_val_pow, color='grey')
plt.ylabel('Counts')
# Label the axis with the quantity actually plotted.
plt.xlabel('player_successes ** 3')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Histogram of the untransformed avg_minutes_played column.
# NOTE(review): this cell used to compute np.log((1 + x) / (1 - x)) into
# x_val_log, but the result was never plotted or used (avg_minutes_played is
# not a regressor in the models below), so the dead computation was dropped.
# Reinstate and plot it if the transformed distribution was the intent.
x_val = model_data["avg_minutes_played"].copy()
plt.hist(x_val, color='grey')
plt.ylabel('Counts')
plt.xlabel('avg_minutes_played')
# plt.show() renders the figure; the former bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# OLS specification with WS48 as the performance metric: log(salary_perc)
# regressed on the transformed age, playing-time and success variables.
reg = smf.ols(
    formula=(
        'np.log(salary_perc) ~ np.log(age) + np.log2(games_played_perc)'
        ' + np.sqrt(games_started_perc) + WS48'
        ' + np.power(team_successes, 3) + np.power(player_successes, 3)'
    ),
    data=model_data,
)
results = reg.fit()
# Print the full statsmodels summary table of the fit.
print(f'results.summary(): \n{results.summary()}\n')

In [None]:
# Same OLS specification with PER replacing WS48 as the performance metric.
# `reg` and `results` are overwritten, so the cells that follow operate on this fit.
reg = smf.ols(
    formula=(
        'np.log(salary_perc) ~ np.log(age) + np.log2(games_played_perc)'
        ' + np.sqrt(games_started_perc) + PER'
        ' + np.power(team_successes, 3) + np.power(player_successes, 3)'
    ),
    data=model_data,
)
results = reg.fit()
# Print the full statsmodels summary table of the fit.
print(f'results.summary(): \n{results.summary()}\n')

In [None]:
# Refit the `results` model currently in scope (the PER specification when the
# notebook is run top to bottom) through the array-based sm.OLS interface,
# reusing its endog/exog matrices. The RESET test cell below runs on this refit.
# NOTE(review): presumably done because linear_reset needs a model built from
# plain arrays rather than from a formula - confirm.
results_np = sm.OLS(results.model.endog, results.model.exog).fit()

In [None]:
# Ramsey RESET specification test on the array-based refit, requesting the
# F-test variant (use_f=True).
reset_test = smd.linear_reset(results_np, use_f=True)
reset_test

In [None]:
# White's heteroskedasticity test on the residuals of `results`,
# using the model's own design matrix.
white_test = smd.het_white(results.resid, results.model.exog)

# Labels paired, in order, with the values returned by the test.
labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']

# Print the labelled outcome of White's test.
print({label: value for label, value in zip(labels, white_test)})

In [None]:
# Breusch-Pagan heteroskedasticity test on the residuals of `results`,
# using the model's own design matrix.
bp_test = smd.het_breuschpagan(results.resid, results.model.exog)

# Labels paired, in order, with the values returned by the test.
labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']

# Print the labelled outcome of the Breusch-Pagan test.
print({label: value for label, value in zip(labels, bp_test)})

In [None]:
# Breusch-Godfrey test for residual autocorrelation, applied to the fitted `results`.
bg_test = smd.acorr_breusch_godfrey(results)

# Labels paired, in order, with the values returned by the test.
labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']

# Print the labelled outcome of the Breusch-Godfrey test.
print({label: value for label, value in zip(labels, bg_test)})