In [14]:
import pickle
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [15]:
# Every season year from 2001 through 2023 (the end of the range is exclusive).
years = [*range(2001, 2024)]

In [16]:
# Read the pickled statistics object from disk; the cells below index it by season year.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
with open('./data/cleaned_data/final_stats.pickle', mode='rb') as stats_file:
    final_stats = pickle.loads(stats_file.read())

In [17]:
# Columns kept for players who are in the last year of their contract.
stats_columns = [
    "ID",
    "salary_cap_perc",
    "last_year_of_contract",
    "age",
    "games_played_perc",
    "games_started_perc",
    "minutes_played",
    "avg_minutes_played",
    "WS48",
    "PER",
    "team_successes",
    "defensive",
    "most_improved",
    "most_valuable",
    "most_valuable_finals",
    "sixth_man",
    "all_league",
    "all_def",
    "season",
]

# Empty frame carrying only the column layout; the per-season loop fills it.
last_year_of_contracts = pd.DataFrame(columns=stats_columns)

In [18]:
# Collect the players who are in the last year of their contract, one frame per season,
# each tagged with the season it came from. The final season (2023) is left out because
# there is no following season to read the next salary from.
expiring_by_season = []
for year in years[:-1]:
    contracts = (
        final_stats[year]
        .query("last_year_of_contract == True")
        .assign(season=year)   # returns a new frame, so final_stats is not modified
        .reset_index()         # turn the index into an ordinary column
    )
    expiring_by_season.append(contracts)

# Concatenate once, instead of re-copying the growing frame on every iteration.
# Starting from a fresh column-only frame (rather than the current value of
# last_year_of_contracts) keeps the column order and makes this cell safe to re-run
# without duplicating rows.
last_year_of_contracts = pd.concat(
    [pd.DataFrame(columns=stats_columns), *expiring_by_season],
    ignore_index=True,
)

In [19]:
# get rif of Nans and players that did not contribute a lot to their teams (possible end of careers, injury etc)
last_year_of_contracts = last_year_of_contracts[(last_year_of_contracts["minutes_played"] > 96)].reset_index(drop=True)
last_year_of_contracts = last_year_of_contracts[(last_year_of_contracts["avg_minutes_played"] > 5)].reset_index(drop=True)
last_year_of_contracts = last_year_of_contracts[(last_year_of_contracts["games_played_perc"] > 0.1)].reset_index(drop=True)
last_year_of_contracts = last_year_of_contracts.dropna()

In [20]:
# Column layout of the table used for analysis and prediction.
model_columns = [
    "ID",
    "season",
    "salary_cap_perc",
    "age",
    "games_played_perc",
    "games_started_perc",
    "minutes_played",
    "avg_minutes_played",
    "WS48",
    "PER",
    "team_successes",
    "player_successes",
]

# Empty frame with that layout; rows are added by the loading loop.
data = pd.DataFrame(columns=model_columns)

In [21]:
# Build one modelling row per expiring contract. The features come from the contract's
# final season; the target (salary_cap_perc) is read from the same player's row in the
# FOLLOWING season. games_played_perc is capped at 1 to clean up anomalous values.

# Points awarded per individual honour when computing player_successes.
award_weights = {
    "defensive": 2,
    "most_improved": 1,
    "most_valuable": 5,
    "most_valuable_finals": 4,
    "sixth_man": 2,
    "all_league": 2,
    "all_def": 1,
}

records = []
skipped_rows = 0
for _, row in last_year_of_contracts.iterrows():
    row_id = row["ID"]
    season = row["season"]

    # Look up the player's salary share in the next season. `@row_id` passes the value
    # to query() as a variable, so IDs containing quotes cannot break the expression.
    # Only the expected "no data" cases are skipped; any other error is raised instead
    # of being silently swallowed.
    try:
        next_salary = final_stats[season + 1].query("ID == @row_id")["salary_cap_perc"]
    except KeyError:
        # No statistics for the following season (or no salary column in them).
        skipped_rows += 1
        continue
    if next_salary.empty:
        # The player has no row in the following season.
        skipped_rows += 1
        continue

    records.append({
        "ID": row_id,
        "season": season,
        "salary_cap_perc": next_salary.iloc[0],
        "age": row["age"],
        "games_played_perc": min(row["games_played_perc"], 1),
        "games_started_perc": row["games_started_perc"],
        "minutes_played": row["minutes_played"],
        "avg_minutes_played": row["avg_minutes_played"],
        "WS48": row["WS48"],
        "PER": row["PER"],
        "team_successes": row["team_successes"],
        "player_successes": sum(row[award] * weight for award, weight in award_weights.items()),
    })

# Create the frame in one go: faster than growing it row by row, gives the columns
# proper inferred dtypes, and makes the cell safe to re-run without duplicating rows.
data = pd.DataFrame(records, columns=model_columns)

if skipped_rows:
    print(f"Skipped {skipped_rows} contract rows with no salary data in the following season.")

In [22]:
# Persist the assembled modelling table; the row index is not written to the file.
data.to_csv(
    "./data/final_data/data.csv",
    encoding='utf-8',
    index=False,
)

In [23]:
# Target: salary_cap_perc. Features: every other column of `data`, which at this point
# still includes ID and season.
# NOTE(review): confirm the identifier columns are removed before fitting a model.
y = data["salary_cap_perc"]
X = data.drop("salary_cap_perc", axis=1)

In [24]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [25]:
# Add the intercept column to both design matrices.
# has_constant="add" forces the column to be added. With the default ("skip"),
# add_constant returns its input unchanged whenever an existing column is already
# constant within that split, which could leave X_train and X_test with different
# columns.
X_train = sm.add_constant(X_train, has_constant="add")
X_test = sm.add_constant(X_test, has_constant="add")

In [26]:
# Write each split to its own CSV file (UTF-8, row index omitted).
for split_frame, csv_path in (
    (X_train, './data/final_data/X_train.csv'),
    (X_test, './data/final_data/X_test.csv'),
    (y_train, './data/final_data/y_train.csv'),
    (y_test, './data/final_data/y_test.csv'),
):
    split_frame.to_csv(csv_path, encoding='utf-8', index=False)