# Import Starting Data

In [1]:
import pandas as pd #1
import os           #3
import numpy as np

In [2]:
from copy import deepcopy
from statistics import stdev
import matplotlib.pyplot as plt

In [3]:
given_data_folder = 'base'

file_names = os.listdir(given_data_folder)
file_names.sort()
file_names

['MMasseyOrdinals.csv',
 'MRegularSeasonDetailedResults.csv',
 'SampleSubmissionWarmup.csv',
 'WRegularSeasonDetailedResults.csv']

In [4]:
reg_season_file_name = 'WRegularSeasonDetailedResults.csv'

In [5]:
file_path = given_data_folder + '/' + reg_season_file_name 
reg_season_df  = pd.read_csv(file_path)
print(len(reg_season_df))

reg_season_df.head(5)

70007


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,11,3103,63,3237,49,H,0,23,54,...,13,6,10,11,27,11,23,7,6,19
1,2010,11,3104,73,3399,68,N,0,26,62,...,21,14,27,14,26,7,20,4,2,27
2,2010,11,3110,71,3224,59,A,0,29,62,...,14,19,23,17,23,8,15,6,0,15
3,2010,11,3111,63,3267,58,A,0,27,52,...,26,16,25,22,22,15,11,14,5,14
4,2010,11,3119,74,3447,70,H,1,30,74,...,17,11,21,21,32,12,14,4,2,14


# Cut both down to just the last X years

In [6]:
recent_season  = 2022

seasons = [2021, 2019, 2018, 2017]

In [7]:
recent_season_df = reg_season_df[reg_season_df.Season == recent_season]

for season in seasons:
    recent_season_df = pd.concat([recent_season_df, 
                                  reg_season_df[reg_season_df.Season == (season)]], ignore_index = True)
    recent_season_df.reset_index()
    
print(len(recent_season_df))
recent_season_df.head(5)

24275


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2022,8,3102,79,3176,62,H,0,30,69,...,17,13,16,6,25,5,22,2,5,22
1,2022,8,3104,109,3149,32,H,0,39,79,...,17,5,5,7,25,7,30,4,2,24
2,2022,8,3112,87,3169,44,H,0,34,65,...,9,4,7,6,13,10,15,5,0,17
3,2022,8,3113,71,3294,41,H,0,25,53,...,13,10,15,5,19,7,30,9,3,13
4,2022,8,3123,84,3454,75,H,1,31,70,...,14,28,32,10,28,17,13,4,0,22


# Turn Both DFs into lists of columns

In [8]:
# Creates list of the column names from the df
columns = recent_season_df.columns.tolist()

# Creates list of the columns
recent_season_df_columns = []
for expected_column in columns:
    recent_season_df_columns.append( recent_season_df[expected_column].tolist() )

print("rows:", len(recent_season_df_columns[0]), "  columns:", len(recent_season_df_columns))

rows: 24275   columns: 34


# Combine team IDs and season years

In [9]:
# 0: Season, 2: WTeamID, 4: LTeamID
for i in range(len(recent_season_df_columns[0])):
    # WTeamID
    recent_season_df_columns[2][i] = "{}_{}".format(recent_season_df_columns[2][i], recent_season_df_columns[0][i])
    # LTeamID
    recent_season_df_columns[4][i] = "{}_{}".format(recent_season_df_columns[4][i], recent_season_df_columns[0][i])


### separate_fga_and_fgp

In [10]:
# get just the games from the specified season
# ---------------------------------------------
#given_df = df_columns

# change df to list of columns
# -----------------------------
#columns = given_df.columns.tolist()

# Creates list of the columns
#orig_columns = []
#for expected_column in columns:
    #temp = given_df[expected_column].tolist()
    #orig_columns.append(temp)

orig_columns = deepcopy(recent_season_df_columns)

#  0: Season      1: DayNum      2: WTeamID     3: WScore      4: LTeamID     5: LScore      6: WLoc      
#  7: NumOT       8: WFGM        9: WFGA       10: WFGM3      11: WFGA3      12: WFTM       13: WFTA      
# 14: WOR        15: WDR        16: WAst       17: WTO        18: WStl       19: WBlk       20: WPF       
# 21: LFGM       22: LFGA       23: LFGM3      24: LFGA3      25: LFTM       26: LFTA       27: LOR       
# 28: LDR        29: LAst       30: LTO        31: LStl       32: LBlk       33: LPF 


# cut down to just the relevent statistic columns
# ------------------------------------------------
number_columns = [deepcopy(orig_columns[3])] + deepcopy(orig_columns[ 8:21]) + [deepcopy(orig_columns[5])] + deepcopy(orig_columns[21:])

#  0: WScore      1: WFGM        2: WFGA        3: WFGM3       4: WFGA3       5: WFTM        6: WFTA      
#  7: WOR         8: WDR         9: WAst       10: WTO        11: WStl       12: WBlk       13: WPF
# 14: LScore     15: LFGM       16: LFGA       17: LFGM3      18: LFGA3      19: LFTM       20: LFTA
# 21: LOR        22: LDR        23: LAst       24: LTO        25: LStl       26: LBlk       27: LPF 


# customize the stats to my new preferences
# ------------------------------------------

# change FGM and FGA to not include FGM3 and FGA3
for i in range(len(number_columns[1])):
    number_columns[ 1][i] = number_columns[ 1][i]-number_columns[ 3][i] #  1: WFGM -  3: WFGM3
    number_columns[ 2][i] = number_columns[ 2][i]-number_columns[ 4][i] #  2: WFGA -  4: WFGA3

    number_columns[15][i] = number_columns[15][i]-number_columns[17][i] # 15: LFGM - 17: LFGM3
    number_columns[16][i] = number_columns[16][i]-number_columns[18][i] # 16: LFGA - 18: LFGA3

# change from FGM to FG%
for i in range(len(number_columns[1])):
    number_columns[ 1][i] = number_columns[ 1][i]/number_columns[ 2][i] #  1: WFGM2 /  2: WFGA2
    
    if number_columns[ 4][i] == 0: number_columns[ 3][i] = 1.0
    else: number_columns[ 3][i] = number_columns[ 3][i]/number_columns[ 4][i] #  3: WFGM3 /  4: WFGA3
    if number_columns[ 6][i] == 0: number_columns[ 5][i] = 1.0
    else: number_columns[ 5][i] = number_columns[ 5][i]/number_columns[ 6][i] #  5: WFTM  /  6: WFTA

        
    number_columns[15][i] = number_columns[15][i]/number_columns[16][i] # 15: LFGM2 / 16: LFGA2
    
    if number_columns[18][i] == 0: number_columns[17][i] = 1.0
    else: number_columns[17][i] = number_columns[17][i]/number_columns[18][i] # 17: LFGM3 / 18: LFGA3
    if number_columns[20][i] == 0: number_columns[19][i] = 1.0
    else: number_columns[19][i] = number_columns[19][i]/number_columns[20][i] # 19: LFTM  / 20: LFTA

# get rid of points
number_columns.pop(14) # 14: LScore
number_columns.pop( 0) #  0: WScore


#  0: WFG%2       1: WFGA2       2: WFG%3       3: WFGA3       4: WFT%        5: WFTA      
#  6: WOR         7: WDR         8: WAst        9: WTO        10: WStl       11: WBlk       12: WPF
# 13: LFG%2      14: LFGA2      15: LFG%3      16: LFGA3      17: LFT%       18: LFTA
# 19: LOR        20: LDR        21: LAst       22: LTO        23: LStl       24: LBlk       25: LPF 

headers_list = [ "FG%2",  "FGA",  "FG%3",  "FGA3",  "FT%",  "FTA",  "OR",  "DR",  "Ast",  "TO",  "Stl",  "Blk",  "PF", 
                "xFG%2", "xFGA", "xFG%3", "xFGA3", "xFT%", "xFTA", "xOR", "xDR", "xAst", "xTO", "xStl", "xBlk", "xPF"]

# add back team IDSs
team_IDs = []
team_IDs.append(orig_columns[2]) # WTeam ID
team_IDs.append(orig_columns[4]) # LTeam ID


#number_columns, team_IDs, headers_list, season, print_report)

In [11]:
print(len(number_columns))
print(len(number_columns[0]))

26
24275


In [12]:
print(len(recent_season_df_columns))
print(len(recent_season_df_columns[0]))

34
24275


### put colums back into original shape

In [13]:
#number_columns
col_ids = [ 0, 1,  2, 3,  4, 5,
           13,14, 15,16, 17,18]

orig_ids = [ 8, 9, 10,11, 12,13,
            21,22, 23,24, 25,26]

current = 0
fgp_columns = []
for i in range(len(recent_season_df_columns)):
    if i in orig_ids:
        fgp_columns.append( number_columns[col_ids[current]].copy() )
        current += 1
    else:
        fgp_columns.append( recent_season_df_columns[i].copy() )

In [14]:
print(len(fgp_columns))
print(len(fgp_columns[0]))

34
24275


In [15]:
recent_season_df_columns = fgp_columns

# Create Inputs and Solutions 

In [16]:
'''
normalized_indicis = [  3,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 
                        5, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
''';

In [17]:
df_columns = deepcopy(recent_season_df_columns)

### Normalizing each input stat to 0-1 and getting variance
### -------------------------------------------------------------------

In [18]:
# Normalizing each input stat to 0-1
# -----------------------------------

# create columns of all actual data
stats_columns = [df_columns[3].copy()] + deepcopy(df_columns[  8:21 ])
additional    = [df_columns[5].copy()] + deepcopy(df_columns[ 21:   ])

for i in range(len(stats_columns)):
    stats_columns[i] += additional[i]


# getting averages/standard devation
column_averages = []
column_std = []
for column in stats_columns:
    column_std.append(stdev(column))
    column_averages.append( sum(column)/len(column) )


# get valid max candidates
within_range = []
for i in range(len(stats_columns)):
    within_range_column = []
    for value in stats_columns[i]:
        if value < (column_averages[i] + (column_std[i]*2.5)) and value > (column_averages[i] - (column_std[i]*2.5)):
            within_range_column.append(value)
    within_range.append(within_range_column)


# now create max columns list for use
max_columns = []
for i in range(len(within_range)):
    max_columns.append( max(within_range[i]) )
max_columns += max_columns


# normalize all values
normalized_indicis = [3,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
                      5, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
for i in range(len(normalized_indicis)):
    column_max  = max_columns[i]
    column      = normalized_indicis[i]
    for i in range(len( df_columns[column] )):
        df_columns[column][i] = (df_columns[column][i]/column_max)


# create list of normalized variance values
variance = []
for i in range(len(column_std)):
    variance.append( (column_std[i]/max_columns[i])**2 )
variance += variance


# return max columns and variance lists
output_max_columns  = max_columns.copy()
output_variance     = variance.copy()


In [19]:
'''
# change df to list of rows
# --------------------------

# Make a list of rows too
df_rows = []
for i in range(len(df_columns[0])):
    new_row = []
    for j in range(len(df_columns)):
        new_row.append(df_columns[j][i])
    df_rows.append(new_row)



# game solutions and team dictionary
# -----------------------------------

# inputs and solutions
inputs      = []
solutions   = []


# team1 ID, team2 ID, team1 win? (1 or 0)
game_solutions = []


# key = teamID
# value = [count, team]
team_dictionary = {}


for i in range(len(df_rows)):
    row = df_rows[i]

    WTeam_stats = [row[3]] + row[ 8:21].copy()
    LTeam_stats = [row[5]] + row[21:  ].copy()


    # if both teams are in the team dictionary, create input and solution rows
    # -------------------------------------------------------------------------
    if row[2] in team_dictionary and row[4] in team_dictionary:


        new_input_row_1 = team_dictionary[row[2]][1].copy() + team_dictionary[row[4]][2].copy()
        new_input_row_2 = team_dictionary[row[4]][1].copy() + team_dictionary[row[2]][2].copy()

        orig_length = len(new_input_row_1)
        for j in range(len(new_input_row_1)):
            new_input_row_1.append(-new_input_row_2[j])
            new_input_row_2.append(-new_input_row_1[j])


        WTeam_inputs = team_dictionary[row[2]][1].copy() + team_dictionary[row[2]][2].copy()
        LTeam_inputs = team_dictionary[row[4]][1].copy() + team_dictionary[row[4]][2].copy()

        # for solutions
        new_solutions_row_1 = []
        new_solutions_row_2 = []

        # for updating dictionaries
        W_games = team_dictionary[row[2]][0]
        L_games = team_dictionary[row[4]][0]

        # 26 long
        for j in range(len(WTeam_inputs)):
            # team FOR is positive
            if j < (len(WTeam_inputs)//2): # half

                # input and solution rows
                # ------------------------
                new_solutions_row_1.append(WTeam_stats[j])
                new_solutions_row_2.append(LTeam_stats[j])

                # Update team dictionaries
                # -------------------------
                # WTeam
                team_dictionary[row[2]][1][j] = ((W_games * team_dictionary[row[2]][1][j]) + WTeam_stats[j])/(W_games+1)
                team_dictionary[row[2]][2][j] = ((W_games * team_dictionary[row[2]][2][j]) + LTeam_stats[j])/(W_games+1)

                # LTeam
                team_dictionary[row[4]][1][j] = ((L_games * team_dictionary[row[4]][1][j]) + LTeam_stats[j])/(L_games+1)
                team_dictionary[row[4]][2][j] = ((L_games * team_dictionary[row[4]][2][j]) + WTeam_stats[j])/(L_games+1)

            # team AGAINST is positive
            else:
                # input and solution rows
                # ------------------------
                new_solutions_row_1.append(-LTeam_stats[j-(len(WTeam_inputs)//2)])
                new_solutions_row_2.append(-WTeam_stats[j-(len(WTeam_inputs)//2)])


        # update game count of both teams
        team_dictionary[row[2]][0] += 1
        team_dictionary[row[4]][0] += 1


        # append new inputs and solutions to overall list
        # ------------------------------------------------
        inputs.append(new_input_row_1)
        inputs.append(new_input_row_2)

        solutions.append(new_solutions_row_1)
        solutions.append(new_solutions_row_2)




    # if both teams are not in the dictionary already then just add/update them
    # --------------------------------------------------------------------------
    # create LTeam entry, update WTeam entry
    elif row[2] in team_dictionary:
        # LTeam
        team_dictionary[row[4]] = [1, WTeam_stats, LTeam_stats]

        # WTeam
        W_games = team_dictionary[row[2]][0]
        for j in range(len(WTeam_stats)):
            team_dictionary[row[2]][1][j] = ((W_games * team_dictionary[row[2]][1][j]) + WTeam_stats[j])/(W_games+1)
            team_dictionary[row[2]][2][j] = ((W_games * team_dictionary[row[2]][2][j]) + LTeam_stats[j])/(W_games+1)
        team_dictionary[row[2]][0] += 1


    # create WTeam entry, update LTeam entry
    elif row[4] in team_dictionary:
        # WTeam
        team_dictionary[row[2]] = [1, LTeam_stats, WTeam_stats]

        # LTeam
        L_games = team_dictionary[row[4]][0]
        for j in range(len(LTeam_stats)):
            team_dictionary[row[4]][1][j] = ((L_games * team_dictionary[row[4]][1][j]) + LTeam_stats[j])/(L_games+1)
            team_dictionary[row[4]][2][j] = ((L_games * team_dictionary[row[4]][2][j]) + WTeam_stats[j])/(L_games+1)
        team_dictionary[row[4]][0] += 1


    # create entry for both WTeam and LTeam
    else:
        team_dictionary[row[2]] = [1, WTeam_stats, LTeam_stats]
        team_dictionary[row[4]] = [1, LTeam_stats, WTeam_stats]

''';

In [20]:
def create_input_rows(W_data, L_data):

    # 4 parts
    W_for = []
    W_aga = []
    L_for = []
    L_aga = []
    for j in range(len(W_data[1][0])):

        W_for_val = 0
        W_aga_val = 0
        W_games = W_data[0]
        for k in range(W_games):
            W_for_val += W_data[1][k][j]
            W_aga_val += W_data[2][k][j]
        W_for.append(W_for_val/W_games)
        W_aga.append(W_aga_val/W_games)


        L_for_val = 0
        L_aga_val = 0
        L_games = L_data[0]
        for k in range(L_games):
            L_for_val += L_data[1][k][j]
            L_aga_val += L_data[2][k][j]
        L_for.append(L_for_val/L_games)
        L_aga.append(L_aga_val/L_games)


    new_input_row_1 = W_for.copy() + L_aga.copy()
    new_input_row_2 = L_for.copy() + W_aga.copy()     
    #new_input_row_1 = W_data[1].copy() + L_data[2].copy()
    #new_input_row_2 = L_data[1].copy() + W_data[2].copy()

    orig_length = len(new_input_row_1)
    for j in range(len(new_input_row_1)):
        new_input_row_1.append(-new_input_row_2[j])
        new_input_row_2.append(-new_input_row_1[j])


    # not sure what this does tbh
    WTeam_inputs = W_for.copy() + W_aga.copy()
    LTeam_inputs = L_for.copy() + L_aga.copy()
    #WTeam_inputs = W_data[1].copy() + W_data[2].copy()
    #LTeam_inputs = L_data[1].copy() + L_data[2].copy()
    
    
    return new_input_row_1, new_input_row_2, WTeam_inputs, LTeam_inputs

In [21]:
# recent games kept in history 
# -----------------------------
num_kept = 5


# change df to list of rows
# --------------------------

# Make a list of rows too
df_rows = []
for i in range(len(df_columns[0])):
    new_row = []
    for j in range(len(df_columns)):
        new_row.append(df_columns[j][i])
    df_rows.append(new_row)



# game solutions and team dictionary
# -----------------------------------

# inputs and solutions
inputs      = []
solutions   = []


# team1 ID, team2 ID, team1 win? (1 or 0)
game_solutions = []


# key = teamID
# value = [count, team]
team_dictionary = {}


for i in range(len(df_rows)):
    row = df_rows[i]
    
    W_id = row[2]
    L_id = row[4]

    WTeam_stats = [row[3]] + row[ 8:21].copy()
    LTeam_stats = [row[5]] + row[21:  ].copy()


    # if both teams are in the team dictionary, create input and solution rows
    # -------------------------------------------------------------------------
    if W_id in team_dictionary and row[4] in team_dictionary:

        
        # create input rows
        # ------------------
        W_data = team_dictionary[W_id]
        L_data = team_dictionary[L_id]
        
        new_input_row_1, new_input_row_2, WTeam_inputs, LTeam_inputs = create_input_rows(W_data, L_data)
        
        ''' 
        # 4 parts
        W_for = []
        W_aga = []
        L_for = []
        L_aga = []
        for j in range(len(team_dictionary[W_id][1][0])):
            
            W_for_val = 0
            W_aga_val = 0
            W_games = team_dictionary[W_id][0]
            for k in range(W_games):
                W_for_val += team_dictionary[W_id][1][k][j]
                W_aga_val += team_dictionary[W_id][2][k][j]
            W_for.append(W_for_val/W_games)
            W_aga.append(W_aga_val/W_games)
                
                
            L_for_val = 0
            L_aga_val = 0
            L_games = team_dictionary[L_id][0]
            for k in range(L_games):
                L_for_val += team_dictionary[L_id][1][k][j]
                L_aga_val += team_dictionary[L_id][2][k][j]
            L_for.append(L_for_val/L_games)
            L_aga.append(L_aga_val/L_games)
        
        
        new_input_row_1 = W_for.copy() + L_aga.copy()
        new_input_row_2 = L_for.copy() + W_aga.copy()  
            
        #new_input_row_1 = team_dictionary[W_id][1].copy() + team_dictionary[L_id][2].copy()
        #new_input_row_2 = team_dictionary[L_id][1].copy() + team_dictionary[W_id][2].copy()

        orig_length = len(new_input_row_1)
        for j in range(len(new_input_row_1)):
            new_input_row_1.append(-new_input_row_2[j])
            new_input_row_2.append(-new_input_row_1[j])
        
        
        # not sure what this does tbh
        WTeam_inputs = W_for.copy() + W_aga.copy()
        LTeam_inputs = L_for.copy() + L_aga.copy()

        #WTeam_inputs = team_dictionary[W_id][1].copy() + team_dictionary[W_id][2].copy()
        #LTeam_inputs = team_dictionary[L_id][1].copy() + team_dictionary[L_id][2].copy()
        '''
        
        
        # for solutions
        new_solutions_row_1 = []
        new_solutions_row_2 = []

        # for updating dictionaries
        W_games = team_dictionary[W_id][0]
        L_games = team_dictionary[L_id][0]

        # 26 long
        for j in range(len(WTeam_inputs)):
            # team FOR is positive
            if j < (len(WTeam_inputs)//2): # half

                # input and solution rows
                # ------------------------
                new_solutions_row_1.append(WTeam_stats[j])
                new_solutions_row_2.append(LTeam_stats[j])

                # Update team dictionaries
                # -------------------------
                # WTeam
                W_games = team_dictionary[W_id][0]
                if W_games >= num_kept:
                    team_dictionary[W_id][1].pop(0)
                    team_dictionary[W_id][1].append(WTeam_stats)

                    team_dictionary[W_id][2].pop(0)
                    team_dictionary[W_id][2].append(LTeam_stats)

                else:
                    team_dictionary[W_id][1].append(WTeam_stats)
                    team_dictionary[W_id][2].append(LTeam_stats)
                    team_dictionary[W_id][0] += 1
            
                # WTeam
                #team_dictionary[W_id][1][j] = ((W_games * team_dictionary[W_id][1][j]) + WTeam_stats[j])/(W_games+1)
                #team_dictionary[W_id][2][j] = ((W_games * team_dictionary[W_id][2][j]) + LTeam_stats[j])/(W_games+1)

                
                # LTeam
                L_games = team_dictionary[L_id][0]
                if L_games >= num_kept:
                    team_dictionary[L_id][1].pop(0)
                    team_dictionary[L_id][1].append(LTeam_stats)

                    team_dictionary[L_id][2].pop(0)
                    team_dictionary[L_id][2].append(WTeam_stats)

                else:
                    team_dictionary[L_id][1].append(LTeam_stats)
                    team_dictionary[L_id][2].append(WTeam_stats)
                    team_dictionary[L_id][0] += 1
                
                # LTeam
                #team_dictionary[L_id][1][j] = ((L_games * team_dictionary[L_id][1][j]) + LTeam_stats[j])/(L_games+1)
                #team_dictionary[L_id][2][j] = ((L_games * team_dictionary[L_id][2][j]) + WTeam_stats[j])/(L_games+1)

                
            # team AGAINST is positive
            else:
                # input and solution rows
                # ------------------------
                new_solutions_row_1.append(-LTeam_stats[j-(len(WTeam_inputs)//2)])
                new_solutions_row_2.append(-WTeam_stats[j-(len(WTeam_inputs)//2)])


        # update game count of both teams
        #team_dictionary[W_id][0] += 1
        #team_dictionary[L_id][0] += 1


        # append new inputs and solutions to overall list
        # ------------------------------------------------
        inputs.append(new_input_row_1)
        inputs.append(new_input_row_2)

        solutions.append(new_solutions_row_1)
        solutions.append(new_solutions_row_2)




    # if both teams are not in the dictionary already then just add/update them
    # --------------------------------------------------------------------------
    # create LTeam entry, update WTeam entry
    elif row[2] in team_dictionary:
        # LTeam
        #team_dictionary[L_id] = [1, WTeam_stats, LTeam_stats]
        team_dictionary[L_id] = [1, [WTeam_stats], [LTeam_stats]]

        # WTeam
        W_games = team_dictionary[W_id][0]
        if W_games >= num_kept:
            team_dictionary[W_id][1].pop(0)
            team_dictionary[W_id][1].append(WTeam_stats)
            
            team_dictionary[W_id][2].pop(0)
            team_dictionary[W_id][2].append(LTeam_stats)
            
        else:
            team_dictionary[W_id][1].append(WTeam_stats)
            team_dictionary[W_id][2].append(LTeam_stats)
            team_dictionary[W_id][0] += 1
        
        
        #for j in range(len(WTeam_stats)):
            #team_dictionary[W_id][1][j] = ((W_games * team_dictionary[W_id][1][j]) + WTeam_stats[j])/(W_games+1)
            #team_dictionary[W_id][2][j] = ((W_games * team_dictionary[W_id][2][j]) + LTeam_stats[j])/(W_games+1)
        #team_dictionary[W_id][0] += 1


    # create WTeam entry, update LTeam entry
    elif row[4] in team_dictionary:
        # WTeam
        team_dictionary[W_id] = [1, [LTeam_stats], [WTeam_stats]]

        # LTeam
        L_games = team_dictionary[L_id][0]
        if L_games >= num_kept:
            team_dictionary[L_id][1].pop(0)
            team_dictionary[L_id][1].append(LTeam_stats)
            
            team_dictionary[L_id][2].pop(0)
            team_dictionary[L_id][2].append(WTeam_stats)
            
        else:
            team_dictionary[L_id][1].append(LTeam_stats)
            team_dictionary[L_id][2].append(WTeam_stats)
            team_dictionary[L_id][0] += 1
            
        
        #for j in range(len(LTeam_stats)):
            #team_dictionary[L_id][1][j] = ((L_games * team_dictionary[L_id][1][j]) + LTeam_stats[j])/(L_games+1)
            #team_dictionary[L_id][2][j] = ((L_games * team_dictionary[L_id][2][j]) + WTeam_stats[j])/(L_games+1)
        #team_dictionary[L_id][0] += 1


    # create entry for both WTeam and LTeam
    else:
        team_dictionary[W_id] = [1, [WTeam_stats], [LTeam_stats]]
        team_dictionary[L_id] = [1, [LTeam_stats], [WTeam_stats]]


print(len(inputs), len(inputs[0]))

46386 56


In [22]:
headers_list = ["Pts", "FG%2", "FGA2", "FG%3", "FGA3", "FT%", "FTA", "OR", 
                "DR", "Ast", "TO", "Stl", "Blk", "PF",
                "xPts", "xFG%2", "xFGA2", "xFG%3", "xFGA3", "xFT%", "xFTA", "xOR",
                "xDR", "xAst", "xTO", "xStl", "xBlk", "xPF"]
solutions0 = inputs[1]
#solutions0 = solutions[0]


lines = ["","","",""]
for i in range(len(headers_list)):
    if i < len(headers_list)//2:
        lines[0] += "{:>5}  ".format(headers_list[i])
        lines[1] += "{:>5.2}  ".format(solutions0[i])
    else:
        lines[2] += "{:>5}  ".format(headers_list[i])
        lines[3] += "{:>5.2}  ".format(solutions0[i])
        
for line in lines:
    print(line)
        

  Pts   FG%2   FGA2   FG%3   FGA3    FT%    FTA     OR     DR    Ast     TO    Stl    Blk     PF  
 0.47   0.56   0.65   0.22   0.43    1.0   0.29   0.32   0.53   0.25   0.74   0.31   0.11   0.62  
 xPts  xFG%2  xFGA2  xFG%3  xFGA3   xFT%   xFTA    xOR    xDR   xAst    xTO   xStl   xBlk    xPF  
  0.8   0.75   0.55   0.49   0.77    0.8   0.74    0.5   0.61   0.67   0.74   0.94   0.56    0.9  


# save training data to a csv

In [23]:
# inputs and solutions
headers_output = [
    "Pts", "FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", 
    "DR", "Ast", "TO", "Stl", "Blk", "PF",
    "Pts_a", "FGM_a", "FGA_a", "FGM3_a", "FGA3_a", "FTM_a", "FTA_a", "OR_a", 
    "DR_a", "Ast_a", "TO_a", "Stl_a", "Blk_a", "PF_a", 
    
    "xPts", "xFGM", "xFGA", "xFGM3", "xFGA3", "xFTM", "xFTA", "xOR", 
    "xDR", "xAst", "xTO", "xStl", "xBlk", "xPF",
    "xPts_a", "xFGM_a", "xFGA_a", "xFGM3_a", "xFGA3_a", "xFTM_a", "xFTA_a", "xOR_a", 
    "xDR_a", "xAst_a", "xTO_a", "xStl_a", "xBlk_a", "xPF_a",

    "solution"]

print(len(headers_output))

57


In [24]:
# to get new solutions, use the points (0,15)
new_solutions = []
for row in solutions:
    if abs(row[0]) > abs(row[15]): new_solutions.append([ 1])
    else:                          new_solutions.append([-1])
        
print(len(new_solutions[0]), len(new_solutions))

1 46386


In [25]:
# convert the inputs to float32 for less storage space
new_inputs = []
for i in range(len(inputs)):
    new_input_row = []
    for j in range(len(inputs[i])):
        new_input_row.append(np.float32(inputs[i][j]))
    new_inputs.append(new_input_row)
        
#new_inputs = inputs

print(len(new_inputs[0]), len(new_inputs))

56 46386


In [26]:
# put into the same list
output_rows = []
for i in range(len(new_inputs)):
    new_row = new_inputs[i]
    #print(len(new_row))
    #print(new_row)
    new_row.append(new_solutions[i][0])

    output_rows.append(new_row)
    
print(len(output_rows[0]), len(output_rows))

57 46386


In [27]:
# create the final dataframe
training_df = pd.DataFrame(output_rows, columns=headers_output)

print(training_df.shape)
training_df.head()

(46386, 57)


Unnamed: 0,Pts,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,...,xFTM_a,xFTA_a,xOR_a,xDR_a,xAst_a,xTO_a,xStl_a,xBlk_a,xPF_a,solution
0,0.571429,0.549107,0.666667,0.252747,0.371429,0.769231,0.764706,0.5,0.605263,0.291667,...,-0.8,-0.735294,-0.5,-0.605263,-0.666667,-0.740741,-0.9375,-0.555556,-0.896552,1
1,0.469388,0.563187,0.65,0.219048,0.428571,1.0,0.294118,0.318182,0.526316,0.25,...,-0.5625,-0.470588,-0.636364,-0.710526,-1.041667,-0.37037,-0.6875,-0.222222,-0.517241,-1
2,0.642857,0.557823,0.7,0.394286,0.714286,0.619048,0.617647,0.772727,0.578947,0.416667,...,-0.636364,-0.323529,-0.181818,-0.684211,-0.5,-0.777778,-0.375,-0.222222,-0.655172,1
3,0.632653,0.545518,0.85,0.328571,0.571429,0.705882,0.5,0.772727,0.684211,0.416667,...,-0.612903,-0.911765,-0.818182,-0.657895,-0.75,-0.222222,-0.5625,-1.222222,-0.517241,-1
4,0.622449,0.686384,0.533333,0.352041,0.4,0.647059,1.0,0.454545,0.815789,0.375,...,-0.55,-0.588235,-0.136364,-0.368421,-0.375,-0.37037,-0.5,-0.0,-0.896552,1


In [28]:
# to csv
file_name = "prepped/W_training_data.csv"
training_df.to_csv(file_name, index=False)

# using final team dictionary entries, create input data for final solutions

In [29]:
# import the sample answer sheet
sample_submission_file_name = 'SampleSubmissionWarmup.csv'

In [30]:
file_path = given_data_folder + '/' + sample_submission_file_name 
sample_submission_df  = pd.read_csv(file_path)
print(len(sample_submission_df))

sample_submission_df.head(5)

614319


Unnamed: 0,ID,Pred
0,2017_1101_1102,0.5
1,2017_1101_1103,0.5
2,2017_1101_1104,0.5
3,2017_1101_1105,0.5
4,2017_1101_1106,0.5


In [31]:
# get rows from the sample submission
matchup_column = list(sample_submission_df['ID'])

print(len(matchup_column))

614319


# troubleshooting

In [32]:
not_in = 0
not_in_dict = {}
for i in range(len(matchup_column)):
    values = matchup_column[i].split('_')
    year   = values[0]
    
    WTeam_ID = "{}_{}".format(values[1], year)
    LTeam_ID = "{}_{}".format(values[2], year)
    
    WTeam = int(values[1])
    LTeam = int(values[2])
    
    if WTeam_ID not in team_dictionary or LTeam_ID not in team_dictionary:
        if int(WTeam) > 3100: 
            #print(WTeam, year)
            not_in += 1
            if WTeam not in not_in_dict: not_in_dict[WTeam] = [year, 0]
            else:                        not_in_dict[WTeam][1] += 1
            
        if int(LTeam) > 3100:
            #print(LTeam, year)
            not_in += 1
            if LTeam not in not_in_dict: not_in_dict[LTeam] = [year, 0]
            else:                        not_in_dict[LTeam][1] += 1
            
print(not_in)
#print(not_in_dict)

1374


In [33]:
for key in not_in_dict:
    if not_in_dict[key][1] > 1:
        print(key, not_in_dict[key][1])

3169 343
3197 343


In [34]:
team_dictionary['3169_2021'] = team_dictionary['3169_2019']
team_dictionary['3197_2021'] = team_dictionary['3197_2019']

In [35]:
final_inputs = []
for i in range(len(matchup_column)):
    # get the ids
    # ------------
    values = matchup_column[i].split('_')
    year   = values[0]
    
    WTeam_ID = "{}_{}".format(values[1], year)
    LTeam_ID = "{}_{}".format(values[2], year)
    
    if WTeam_ID in team_dictionary and LTeam_ID in team_dictionary:
        
        # create input rows
        # ------------------
        W_data = team_dictionary[WTeam_ID]
        L_data = team_dictionary[LTeam_ID]
        
        new_input_row_1, new_input_row_2, WTeam_inputs, LTeam_inputs = create_input_rows(W_data, L_data)
        
        '''
        # get the input rows
        # -------------------
        new_input_row_1 = team_dictionary[WTeam_ID][1].copy() + team_dictionary[LTeam_ID][2].copy()
        new_input_row_2 = team_dictionary[LTeam_ID][1].copy() + team_dictionary[WTeam_ID][2].copy()

        orig_length = len(new_input_row_1)
        for j in range(len(new_input_row_1)):
            new_input_row_1.append(-new_input_row_2[j])
            new_input_row_2.append(-new_input_row_1[j])
        '''

        # add in team IDS
        final_inputs.append([WTeam_ID, LTeam_ID] + new_input_row_1)
        final_inputs.append([LTeam_ID, WTeam_ID] + new_input_row_2)
    
print(len(final_inputs[0]), len(final_inputs))

58 610814


In [36]:
# inputs and solutions
headers = [
    "team_1_id", "team_2_id",
    
    "Pts", "FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", 
    "DR", "Ast", "TO", "Stl", "Blk", "PF",
    "Pts_a", "FGM_a", "FGA_a", "FGM3_a", "FGA3_a", "FTM_a", "FTA_a", "OR_a", 
    "DR_a", "Ast_a", "TO_a", "Stl_a", "Blk_a", "PF_a", 
    
    "xPts", "xFGM", "xFGA", "xFGM3", "xFGA3", "xFTM", "xFTA", "xOR", 
    "xDR", "xAst", "xTO", "xStl", "xBlk", "xPF",
    "xPts_a", "xFGM_a", "xFGA_a", "xFGM3_a", "xFGA3_a", "xFTM_a", "xFTA_a", "xOR_a", 
    "xDR_a", "xAst_a", "xTO_a", "xStl_a", "xBlk_a", "xPF_a"]

print(len(headers))

58


In [37]:
# create the final dataframe
testing_df = pd.DataFrame(final_inputs, columns=headers)

print(testing_df.shape)
testing_df.head()

(610814, 58)


Unnamed: 0,team_1_id,team_2_id,Pts,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,...,xFGA3_a,xFTM_a,xFTA_a,xOR_a,xDR_a,xAst_a,xTO_a,xStl_a,xBlk_a,xPF_a
0,3101_2017,3102_2017,0.908163,0.732143,0.533333,0.667411,0.914286,0.75,0.705882,0.590909,...,-0.514286,-0.866667,-0.441176,-0.318182,-0.684211,-0.333333,-0.666667,-0.25,-0.333333,-0.689655
1,3102_2017,3101_2017,0.408163,0.274554,0.533333,0.547619,0.342857,0.727273,0.647059,0.409091,...,-0.342857,-0.642857,-0.411765,-0.409091,-0.631579,-0.291667,-0.37037,-0.4375,-0.666667,-0.586207
2,3101_2017,3103_2017,0.908163,0.732143,0.533333,0.667411,0.914286,0.75,0.705882,0.590909,...,-0.514286,-0.866667,-0.441176,-0.318182,-0.684211,-0.333333,-0.666667,-0.25,-0.333333,-0.689655
3,3103_2017,3101_2017,0.377551,0.475893,0.666667,0.102679,0.457143,0.571429,0.411765,0.363636,...,-0.457143,-0.5,-0.588235,-0.681818,-0.947368,-0.791667,-0.555556,-0.3125,-0.0,-0.586207
4,3101_2017,3104_2017,0.908163,0.732143,0.533333,0.667411,0.914286,0.75,0.705882,0.590909,...,-0.514286,-0.866667,-0.441176,-0.318182,-0.684211,-0.333333,-0.666667,-0.25,-0.333333,-0.689655


In [38]:
# to csv
file_name = "prepped/W_testing.csv"
testing_df.to_csv(file_name, index=False)