In [1]:
import pickle
import pandas as pd

In [2]:
# Seasons covered by the dataset: 2001 through 2023 inclusive
# (the upper bound of range() is exclusive).
years = [*range(2001, 2024)]

In [3]:
# Load the pre-computed statistics object; later cells index it by season
# year (final_stats[year]).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this file must come from a trusted source.
stats_path = './data/final_stats.pickle'
with open(stats_path, 'rb') as stats_file:
    final_stats = pickle.load(stats_file)

In [4]:
# Columns expected for every contract-year record, grouped by meaning.
stats_columns = [
    # identity and contract
    "ID",
    "salary_cap_perc",
    "last_year_of_contract",
    "age",
    # playing time
    "games_played_perc",
    "games_started_perc",
    "minutes_played",
    "avg_minutes_played",
    # performance metrics
    "WS48",
    "PER",
    "team_successes",
    # individual honours
    "defensive",
    "most_improved",
    "most_valuable",
    "most_valuable_finals",
    "sixth_man",
    "all_league",
    "all_def",
    # season the record belongs to
    "season",
]

# Empty frame with the expected columns; it is filled by the next cell.
last_year_of_contracts = pd.DataFrame(columns=stats_columns)

In [5]:
# Get data for players in the last year of their contract. Don't include the
# final season (2023) because there is no next-season data for those players.
#
# The per-season frames are collected in a list and concatenated once. This
# avoids the quadratic cost of concatenating inside the loop, keeps the source
# dtypes (nothing is merged with an empty object-dtype frame), and makes the
# cell idempotent: re-running it rebuilds the table instead of appending
# duplicate rows to the previous result.
season_frames = []
for year in years[:-1]:
    season_contracts = final_stats[year].query("last_year_of_contract == True").copy()
    season_contracts["season"] = year
    # reset_index() (without drop) turns the frame's index into a regular column.
    season_frames.append(season_contracts.reset_index())

if season_frames:
    combined = pd.concat(season_frames, ignore_index=True)
    # Keep the expected columns first, in their declared order, followed by any
    # additional columns the source frames carry. Expected columns missing from
    # the source are created and filled with NaN.
    extra_columns = [column for column in combined.columns if column not in stats_columns]
    last_year_of_contracts = combined.reindex(columns=stats_columns + extra_columns)
else:
    # No seasons to process: fall back to an empty frame with the expected columns.
    last_year_of_contracts = pd.DataFrame(columns=stats_columns)

In [6]:
# Get rid of NaNs and of players that did not contribute a lot to their teams
# (possible end of career, injury, etc.). The filters are applied one after
# another, each on the result of the previous one.
last_year_of_contracts = (
    last_year_of_contracts
    .loc[lambda frame: frame["minutes_played"] > 96]
    .loc[lambda frame: frame["avg_minutes_played"] > 5]
    .loc[lambda frame: frame["games_played_perc"] > 0.1]
    .reset_index(drop=True)
    # Rows with any missing value are dropped last; the index is not reset
    # again afterwards, so it may contain gaps.
    .dropna()
)

In [7]:
# Create the (initially empty) frame used for analysis and prediction.
model_columns = [
    "ID",
    "salary_cap_perc",
    "age",
    "games_played_perc",
    "games_started_perc",
    "minutes_played",
    "avg_minutes_played",
    "WS48",
    "PER",
    "team_successes",
    "player_successes",
]
data = pd.DataFrame(columns=model_columns)

In [8]:
# Load data and clean games_played_perc of anomalies.
#
# Each contract-year row is paired with the player's salary_cap_perc from the
# FOLLOWING season. Rows are collected in a list and the frame is built once,
# which is faster than growing it row by row and makes the cell idempotent
# (re-running it no longer appends duplicate rows).

# Points awarded per individual honour when computing player_successes.
award_weights = {
    "defensive": 2,
    "most_improved": 1,
    "most_valuable": 5,
    "most_valuable_finals": 4,
    "sixth_man": 2,
    "all_league": 2,
    "all_def": 1,
}

records = []
skipped_rows = 0
for _, row in last_year_of_contracts.iterrows():
    row_id = row["ID"]
    season = row["season"]

    # Only the next-season lookup is allowed to fail quietly:
    #   * KeyError   - final_stats has no entry for season + 1, or that entry
    #                  has no salary_cap_perc column;
    #   * IndexError - the player has no record in the following season.
    # Anything else (for example a missing statistics column in `row`) is a
    # real problem and is no longer swallowed by a bare `except`.
    try:
        next_salary_cap_perc = (
            final_stats[season + 1]
            .query(f'ID == "{row_id}"')["salary_cap_perc"]
            .iloc[0]
        )
    except (KeyError, IndexError):
        skipped_rows += 1
        continue

    records.append({
        "ID": row_id,
        "salary_cap_perc": next_salary_cap_perc,
        "age": row["age"],
        # Values above 1 are anomalies; clamp them to 1.
        "games_played_perc": min(row["games_played_perc"], 1),
        "games_started_perc": row["games_started_perc"],
        "minutes_played": row["minutes_played"],
        "avg_minutes_played": row["avg_minutes_played"],
        "WS48": row["WS48"],
        "PER": row["PER"],
        "team_successes": row["team_successes"],
        "player_successes": sum(row[award] * weight for award, weight in award_weights.items()),
    })

data = pd.DataFrame(records, columns=model_columns)
print(f"Built {len(data)} rows; skipped {skipped_rows} rows without a next-season salary record.")

In [9]:
# Write the modelling table to CSV, omitting the index column.
data.to_csv(
    "./data/data.csv",
    index=False,
    encoding='utf-8',
)