In [1]:
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import OneHotEncoder

In [None]:
# Directory prefix for the data files used by this notebook. Paths are built by
# plain string concatenation (data_dir + "name"), so the trailing slash is required.
data_dir = "data/"

## Import Players

In [4]:
# Read the player table, discard the "Unnamed: 0" column (presumably a saved
# index — TODO confirm) and convert to a NumPy array so that later cells can
# address fields by column position.
players = (
    pd.read_csv(data_dir + "new_players.csv")
    .drop(columns="Unnamed: 0")
    .to_numpy()
)

In [5]:
# Rewrite historical / alternate team abbreviations (column 3 of each row) to a
# single code per team, in place. "TOT" and "VAN" are pooled under the
# catch-all code "OTH". Abbreviations not listed here are left untouched.
team_aliases = {
    "BRK": "BKN",
    "NJN": "BKN",
    "CHH": "CHA",
    "CHO": "CHA",
    "NOH": "NOP",
    "NOK": "NOP",
    "PHO": "PHX",
    "SEA": "OKC",
    "TOT": "OTH",
    "VAN": "OTH",
}

for player in players:
    # Each `player` is a view into `players`, so the assignment updates the array.
    canonical = team_aliases.get(player[3])
    if canonical is not None:
        player[3] = canonical
    

In [6]:
# Spot-check one row by position after the abbreviation rewrite.
# NOTE: the index is hardcoded, so which player is shown depends on the CSV row order.
players[12602]

array(['Alan Williams', 'PF', 26, 'BKN', 5, 0, 5.2, 1.6, 2.6, 0.615, 0.0,
       0.2, 0.0, 1.6, 2.4, 0.667, 0.615, 0.4, 0.8, 0.5, 0.8, 3.0, 3.8,
       0.6, 0.2, 0.0, 0.2, 0.4, 3.6, 18], dtype=object)

In [7]:
# Build the lookup team abbreviation -> TEAM_ID from the game-details file.
# Only the two columns that are needed are read.
team_ids_frame = pd.read_csv(
    data_dir + "new_details.csv",
    usecols=["TEAM_ID", "TEAM_ABBREVIATION"],
).drop_duplicates()

# Replace old names
team_ids_frame["TEAM_ABBREVIATION"] = team_ids_frame["TEAM_ABBREVIATION"].replace(
    {"NJN": "BKN", "NOH": "NOP", "NOK": "NOP", "SEA": "OKC"}
)

team_to_id = dict(zip(team_ids_frame["TEAM_ABBREVIATION"], team_ids_frame["TEAM_ID"]))

# Catch-all team with a placeholder id. It is added to the dict directly rather
# than via `.loc[len(frame)]`: after drop_duplicates() the frame keeps its
# original row labels, so `len(frame)` may coincide with an existing label and
# overwrite a real team instead of appending a new row.
team_to_id["OTH"] = 1111111111

In [8]:
# Lookup player name -> PLAYER_ID. If a name occurs more than once in the file,
# the last occurrence wins.
# NOTE(review): this CSV is read from the working directory, not from data_dir
# like the other inputs — confirm that is intended.
player_ids_frame = pd.read_csv("player_to_ids.csv").drop(columns="Unnamed: 0")
player_to_id = dict(
    zip(player_ids_frame["PLAYER_NAME"], player_ids_frame["PLAYER_ID"])
)

In [9]:
# Example: print one player's id looked up by name, then display the first
# row of `players` as the cell output.
print( player_to_id["Romeo Langford"] )
players[0]

1629641


array(['Mahmoud Abdul-Rauf', 'PG', 28, 'SAC', 31, 0, 17.1, 3.3, 8.8,
       0.377, 0.2, 1.0, 0.161, 3.2, 7.8, 0.405, 0.386, 0.5, 0.5, 1.0, 0.2,
       1.0, 1.2, 1.9, 0.5, 0.0, 0.6, 1.0, 7.3, 97], dtype=object)

In [10]:
# Column positions within a `players` row.
#   data_cols: column 2 followed by columns 4..29, copied as-is into each record
#   cat_cols:  columns 1 and 3, the fields handed to the one-hot encoder
data_cols = np.concatenate(([2], np.arange(4, 30)))
cat_cols  = np.array([1, 3])

## Create one-hot encoding for "Team Abbreviation" and "Position"

In [11]:
# Create Encoders
# handle_unknown='ignore': a category that was not present when the encoder was
# fitted is encoded as all zeros at transform time instead of raising an error.
cat_encoder = OneHotEncoder(handle_unknown='ignore')

In [12]:
# Obtain all categorical data to pre-process
# (every row of `players`, restricted to the columns listed in cat_cols)
categories = players[:, cat_cols]

# Fit encoder to categories
# The encoder learns whatever values `players` holds in those columns at this
# point. As the last expression, the fitted encoder is also the cell's output.
cat_encoder.fit( categories )

## Create data dictionary
#### - Structure: `{year: {team_id: {player_id: data}}}`

In [13]:
# Nested lookup: player_dict[year][team_id][player_id] -> list holding the
# row's data_cols values followed by the one-hot encoding of its cat_cols.
player_dict = {}

# Encode the categorical columns of all rows in one call instead of once per
# row; row i of `encodings` belongs to row i of `players`.
encodings = cat_encoder.transform( players[:, cat_cols] ).toarray()

for row, encoding in zip(players, encodings):
    # The key is column 29 plus one; a result of exactly 100 wraps around to 0.
    year = row[29] + 1
    if year == 100:
        year = 0
    name = row[0]
    team = row[3]

    # Rows whose team abbreviation or player name has no known id are skipped.
    if team not in team_to_id or name not in player_to_id:
        continue

    team_id = team_to_id[team]
    name_id = player_to_id[name]

    # Obtain raw data
    data = row[data_cols]

    # Merge category encoding with raw data
    new_row = np.concatenate( (data, encoding) )

    # setdefault creates the year / team level the first time it is seen and
    # reuses it afterwards. The result therefore no longer depends on the rows
    # being grouped by year: the earlier bookkeeping reset its team list on
    # every new year, so a year that reappeared later could have an existing
    # team entry replaced by an empty dict, or raise KeyError.
    player_dict.setdefault(year, {}).setdefault(team_id, {})[name_id] = new_row.tolist()

# `years` and `teams` are still defined for any later cell that reads them:
# the years in first-seen order, and the team ids of the most recently added year.
years = list(player_dict)
teams = list(player_dict[years[-1]]) if years else []

In [14]:
# Walk down the nesting one level at a time: all year keys, the team ids stored
# under year 22, the player ids stored under team 1610612747 in that year, and
# finally the record of player 2544.
# The three keys are hardcoded, so each must exist in player_dict or the
# lookup raises KeyError.
print( player_dict.keys() )
print( player_dict[22].keys() )
print( player_dict[22][1610612747].keys() )
print( player_dict[22][1610612747][2544] )

dict_keys([98, 99, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
dict_keys([1610612761, 1610612763, 1610612748, 1610612751, 1111111111, 1610612740, 1610612762, 1610612749, 1610612739, 1610612754, 1610612747, 1610612753, 1610612752, 1610612745, 1610612764, 1610612756, 1610612758, 1610612765, 1610612766, 1610612741, 1610612737, 1610612743, 1610612755, 1610612759, 1610612746, 1610612760, 1610612750, 1610612742, 1610612744, 1610612757, 1610612738])
dict_keys([2546, 2772, 201571, 203145, 202340, 1630602, 201954, 203076, 1629635, 201961, 1629117, 1629659, 2730, 1630643, 2544, 1626169, 1629203, 1630222, 201599, 1630644, 1628370, 1630559, 200765, 202738, 201566])
[37, 56, 56, 37.2, 11.4, 21.8, 0.524, 2.9, 8.0, 0.359, 8.6, 13.8, 0.62, 0.59, 4.5, 6.0, 0.756, 1.1, 7.1, 8.2, 6.2, 1.3, 1.1, 3.5, 2.2, 30.3, 21, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

## Export Dictionary

In [15]:
# Write the nested player dictionary to data_dir as indented JSON.
# JSON object keys are always strings, so non-string dictionary keys are
# written out in their string form.
with open(data_dir + "player_dictionary.json", "w") as file:
    json.dump(player_dict, file, indent=2)

In [16]:
# Write the team abbreviation -> id lookup to data_dir as indented JSON.
with open(data_dir + "team_to_id.json", "w") as file:
    json.dump(team_to_id, file, indent=2)

## Create games dictionary

In [17]:
# Reload both lookups from the JSON files, replacing the in-memory objects of
# the same names.
# NOTE: json.load returns every object key as a str, so a dictionary that was
# written with non-string keys must be indexed with their string form from
# here on (e.g. "22" rather than 22).
with open(data_dir + "player_dictionary.json") as file:
    player_dict = json.load( file )

with open(data_dir + "team_to_id.json") as file:
    team_to_id = json.load( file )

In [228]:
# Load the games table and the per-game details table. Each is kept both as a
# DataFrame (*_pd) and as a NumPy array for positional access.
# [:, 1:] removes the first CSV column from the array
# (presumably a saved index column — TODO confirm).
games_pd   = pd.read_csv(data_dir + "new_games.csv")
games   = games_pd.to_numpy()[:, 1:]

details_pd = pd.read_csv(data_dir + "new_details.csv")
details = details_pd.to_numpy()[:, 1:]

In [229]:
# games_dict[game_id] holds one list per participating team (keyed by the home
# and away team values from the games row) plus the scalar facts of the game.
games_dict = {}

for game in games:
    home_team, away_team = game[1], game[2]

    # Insertion order is kept as: the two (initially empty) per-team lists
    # first, then the scalar fields.
    # NOTE(review): "season" is column 3 plus one with no 100 -> 0 wrap-around,
    # unlike the year key used for player_dict — confirm both use the same encoding.
    games_dict[game[0]] = {
        home_team: [],
        away_team: [],
        "points_home": game[4],
        "points_away": game[5],
        "win_home": game[6],
        "season": game[3] + 1,
        "team_home": home_team,
        "team_away": away_team,
    }

for detail in details:
    # A details row whose game id is not in games_dict raises KeyError here.
    game_entry = games_dict[detail[0]]

    # Append column 4 of the details row (presumably the player id — TODO
    # confirm) to the list of the team named in column 1. Rows whose team is
    # not a key of that game's entry are ignored.
    if detail[1] in game_entry:
        game_entry[detail[1]].append(detail[4])

## Export Games Dictionary

In [230]:
# Write the per-game dictionary to data_dir as indented JSON.
# JSON object keys are always strings, so non-string dictionary keys are
# written out in their string form.
with open(data_dir + "games_dict.json", "w") as file:
    json.dump(games_dict, file, indent=2)