# Import Starting Data

In [1]:
import pandas as pd #1
import os           #3
import numpy as np

In [2]:
from copy import deepcopy
from statistics import stdev
import matplotlib.pyplot as plt

In [3]:
# Folder that holds the raw input CSV files.
given_data_folder = 'base'

# Collect the folder's entries in sorted order and display them.
file_names = sorted(os.listdir(given_data_folder))
file_names

['MMasseyOrdinals.csv',
 'MNCAATourneyDetailedResults.csv',
 'MRegularSeasonDetailedResults.csv',
 'SampleSubmission2023.csv',
 'SampleSubmissionWarmup.csv',
 'WRegularSeasonDetailedResults.csv']

In [4]:
# Name of the regular-season detailed results file inside `given_data_folder`.
reg_season_file_name = 'MRegularSeasonDetailedResults.csv'

In [5]:
# Build the path to the regular-season results file and load it.
file_path = f"{given_data_folder}/{reg_season_file_name}"
reg_season_df = pd.read_csv(file_path)

# Print the row count, then preview the first five rows.
print(reg_season_df.shape[0])
reg_season_df.head(5)

106834


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


### Rankings file (only needed for the men's data — there is no ranks file for the women's data)

In [6]:
# Name of the Massey ordinals (rankings) file inside `given_data_folder`.
ranks_file_name = 'MMasseyOrdinals.csv'

In [7]:
# Build the path to the rankings file and load it.
file_path = f"{given_data_folder}/{ranks_file_name}"
ranks_df = pd.read_csv(file_path)

# Print the row count, then preview the first five rows.
print(ranks_df.shape[0])
ranks_df.head(5)

4877976


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2003,35,SEL,1102,159
1,2003,35,SEL,1103,229
2,2003,35,SEL,1104,12
3,2003,35,SEL,1105,314
4,2003,35,SEL,1106,260


<hr>

# Initial Data Prep

In [8]:
# Project-local helper module. The helper functions called in the cells below without a
# module prefix (get_seasons, df_to_list_of_columns, combine_season_and_team_ids,
# separate_fga_and_fgp, create_ranks_dictionary, add_team_ranks_to_data, normalizing_stats,
# create_team_dict_and_input_rows, prepare_inputs_for_csv, submission_input_data) are not
# defined in this notebook, so they presumably come from this wildcard import.
# NOTE(review): an explicit import list would make that dependency visible — confirm the
# module exports nothing else this notebook relies on before changing it.
from embeddings_data_prep import *

<ul>
    <li> Cut both down to just the last X years </li>
    <li> Turn Both DFs into lists of columns    </li>
    <li>  Combine team IDs and season years     </li>
    
</ul>

In [9]:
#reg_season_df
#ranks_df

In [10]:
# Seasons to keep from the loaded data (passed to get_seasons in the next cell).
# The commented-out lists are alternative selections.
#seasons = [2017, 2018, 2019, 2021, 2022]
#seasons = [2023]
seasons = [2021, 2022, 2023]
# Flag for the men's data: enables the ranks steps guarded by `if mens:` in later cells
# and is passed as `mens=` to several helper functions.
mens = True

In [11]:
# Restrict the regular-season frame to `seasons` and convert it to a list of columns
# (per the markdown list above; behaviour of both helpers lives in embeddings_data_prep).
# The last saved run reported 14002 rows and 34 columns here.
recent_season_df = get_seasons(reg_season_df, seasons)
season_columns   = df_to_list_of_columns(recent_season_df, print_report=True)

# Combine the season column with the team-id columns. Going by the frame preview, index 0 is
# Season and indexes 2 and 4 are WTeamID and LTeamID — assumes the helper keeps the frame's
# column order; TODO confirm.
season_columns = combine_season_and_team_ids(season_columns, 0, [2,4])

# there is no ranks df for womens
if mens:
    # Same preparation for the rankings frame (last saved run: 757090 rows, 5 columns).
    recent_ranks_df = get_seasons(ranks_df, seasons)
    ranks_columns   = df_to_list_of_columns(recent_ranks_df, print_report=True)

    # Index 0 is Season and index 3 is TeamID in the rankings preview — same assumption as above.
    ranks_columns = combine_season_and_team_ids(ranks_columns, 0, [3])

rows: 14002   columns: 34
rows: 757090   columns: 5


### separate_fga_and_fgp

In [12]:
# Run separate_fga_and_fgp on the season columns. Judging by its name it derives field-goal
# attempts and percentages — TODO confirm in embeddings_data_prep. The saved report shows the
# result has the same 34 x 14002 size as the original columns.
fgp_columns = separate_fga_and_fgp(season_columns, print_report=True)

# Alias (not a copy): later cells refer to the same object under this name.
recent_season_df_columns = fgp_columns

number columns   | rows: 26, columns: 14002
original columns | rows: 34, columns: 14002
fgp columns      | rows: 34, columns: 14002


### (Men's only) Create a ranks dictionary and add team ranks to the data

In [13]:
# Men's data only: build the ranks dictionary from the prepared rankings columns.
# The saved sample output shows a key of the form '<TeamID>_<Season>' mapped to a list holding
# a number followed by [system name, rank] pairs; the meaning of the leading number is not
# visible here — TODO confirm in create_ranks_dictionary.
if mens:
    ranks_dict = create_ranks_dictionary(ranks_columns, print_report=True)

1113_2021
[42.0, [['AP', 25], ['BNZ', 46], ['BWE', 67], ['DES', 25], ['DII', 20], ['DOK', 58], ['EBP', 35], ['HAS', 49], ['INC', 26], ['JNG', 51], ['LEF', 54], ['MAS', 59], ['MOR', 81], ['PGH', 32], ['POM', 29], ['SAG', 41], ['SMS', 68], ['TRK', 27], ['TRP', 40], ['USA', 17], ['WIL', 32]]]


In [14]:
# Men's data only: add team ranks to the season data.
# The return value is discarded, so the helper presumably modifies `recent_season_df_columns`
# in place (and, through the alias, `fgp_columns`).
# NOTE(review): if so, re-running this cell without re-running the earlier ones may add the
# ranks twice — confirm against add_team_ranks_to_data.
if mens:
    add_team_ranks_to_data(recent_season_df_columns, ranks_dict, print_report=False)

<hr>

# Create Inputs and Solutions 

### Normalizing each input stat to 0-1 and getting variance

In [15]:
# Work on an independent deep copy so the steps below do not touch `recent_season_df_columns`.
df_columns = deepcopy(recent_season_df_columns)

In [16]:
# Normalize each input stat to 0-1 and get the variance (per the section heading); the helper
# returns the updated columns plus two extra values kept as `output_max_columns` and
# `output_variance`, which no visible cell uses afterwards.
# NOTE(review): `df_columns` is both input and output, so re-running this cell alone would
# feed already-processed data back in — re-run the deepcopy cell first.
df_columns, output_max_columns, output_variance = normalizing_stats(df_columns, mens=mens)

### create team dictionary, inputs, and solutions

In [17]:
# Build the team dictionary together with the model inputs and their solutions.
# `num_kept=5` is forwarded as-is; its meaning is defined in embeddings_data_prep — TODO confirm.
# The commented-out `existing_dict` argument is left disabled, so no previously built
# dictionary is passed in.
# The last saved run printed "25156 88" followed by one sample row of stat values.
team_dictionary, inputs, solutions = create_team_dict_and_input_rows(df_columns, 
                                                                     num_kept=5, 
                                                                     mens=mens, 
                                                                     print_report=True,
                                                                     #existing_dict = team_dictionary
                                                                    )

25156 88
  Pts   FG%2   FGA2   FG%3   FGA3    FT%    FTA     OR     DR    Ast     TO    Stl    Blk     PF    Str  
 0.54   0.53   0.66   0.28   0.82   0.75   0.36    0.8   0.56   0.31   0.78   0.71  0.071    0.0   0.79  
recW%   xPts  xFG%2  xFGA2  xFG%3  xFGA3   xFT%   xFTA    xOR    xDR   xAst    xTO   xStl   xBlk    xPF   xStr  
 0.59   0.49   0.79   0.57   0.74    0.5   0.93   0.76    0.6   0.42   0.46   0.43    0.0   0.45   0.59  -0.28  


<hr>

# Save Training Data to a CSV

In [18]:
# Turn the inputs and solutions into rows plus matching column headers for the CSV export.
# The last saved run reported 89 headers and 25156 rows of 89 columns.
output_rows, headers_output = prepare_inputs_for_csv(inputs, solutions, mens=mens, print_report=True)

headers length: 89 

new_solutions   | rows: 25156, columns: 1
new_inputs      | rows: 25156, columns: 89
output_rows     | rows: 25156, columns: 89


In [19]:
# Assemble the final training table from the prepared rows and their headers.
training_df = pd.DataFrame(data=output_rows, columns=headers_output)

# Show the (rows, columns) shape and preview the first five rows.
print(training_df.shape)
training_df.head(5)

(25156, 89)


Unnamed: 0,team1_id,team2_id,Pts,FG%2,FGA2,FG%3,FGA3,FT%,FTA,OR,...,xOR_a,xDR_a,xAst_a,xTO_a,xStl_a,xBlk_a,xPF_a,xStr_a,xrecW%,solution
0,11332021.0,13242021.0,0.590426,0.592805,0.85,0.554705,0.560606,0.586207,0.5625,0.866667,...,0.733333,0.090909,-0.714286,-0.0,-0.916667,-2.220446e-16,-0.0,-0.0,-0.231378,1
1,13242021.0,11332021.0,0.542553,0.533241,0.66,0.282976,0.818182,0.748863,0.359375,0.8,...,0.133333,0.060606,0.571429,0.75,0.333333,-0.2857143,-0.0,-0.5,-0.231378,-1
2,12282021.0,13252021.0,1.164894,0.943766,0.86,0.848012,0.742424,0.75431,0.75,1.1,...,-0.2,0.090909,0.47619,-0.15,0.25,0.7142857,-0.0,-1.0,-0.882903,1
3,13252021.0,12282021.0,0.893617,0.885057,0.59,0.726891,0.80303,0.633677,0.84375,0.7,...,-0.266667,-0.121212,0.095238,0.1,-0.333333,0.4285714,-0.0,-1.0,-0.882903,1
4,12992021.0,11522021.0,0.702128,0.578101,0.83,0.56381,0.666667,0.719212,0.4375,0.4,...,0.666667,0.30303,0.333333,0.2,0.416667,-0.5714286,-0.0,-0.0,-0.188285,1


In [20]:
# Write the training table to CSV without the index column.
# The file prefix follows the `mens` flag ('M' / 'W', matching the input file naming) so a
# women's run does not overwrite the men's file; with mens=True the path is unchanged:
# "prepped/M_training_data_embedding_.csv".
gender_prefix = "M" if mens else "W"
file_name = f"prepped/{gender_prefix}_training_data_embedding_.csv"

# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
training_df.to_csv(file_name, index=False)

<hr>

# Create Input Data for the Final Submission from the Final Team Dictionary Entries

In [21]:
# import the sample answer sheet
# (the commented-out line selects the warm-up submission file from the data folder instead)
#sample_submission_file_name = 'SampleSubmissionWarmup.csv'
sample_submission_file_name = 'SampleSubmission2023.csv'

In [22]:
# Build the path to the sample submission file and load it.
file_path = f"{given_data_folder}/{sample_submission_file_name}"
sample_submission_df = pd.read_csv(file_path)

# Print the row count, then preview the first five rows.
print(sample_submission_df.shape[0])
sample_submission_df.head(5)

130683


Unnamed: 0,ID,Pred
0,2023_1101_1102,0.5
1,2023_1101_1103,0.5
2,2023_1101_1104,0.5
3,2023_1101_1105,0.5
4,2023_1101_1106,0.5


In [23]:
# Patch for missing data in the women's dictionary: when both the 2019 and 2021 seasons
# are loaded, the 2021 keys below are pointed at the corresponding 2019 entries
# (the same object is shared, not copied).
both_seasons_loaded = all(year in seasons for year in (2021, 2019))
if not mens and both_seasons_loaded:
    fallback_entries = {
        '3169_2021': '3169_2019',
        '3197_2021': '3197_2019',
    }
    for missing_key, source_key in fallback_entries.items():
        team_dictionary[missing_key] = team_dictionary[source_key]

In [24]:
# Build the model input rows (and their headers) for every matchup in the sample submission,
# using the entries of `team_dictionary`.
# NOTE(review): the last saved run reported 130683 matchups but 131406 rows in `final_inputs`
# (84 columns) — confirm how input rows map to submission rows before joining predictions
# back onto the submission file.
final_inputs, headers = submission_input_data(sample_submission_df, team_dictionary, mens=mens, print_report=True)

headers length: 84 

matchup_column  | rows: 130683, columns: 1
final_inputs    | rows: 131406, columns: 84


In [25]:
# Assemble the final testing table from the submission inputs and their headers.
testing_df = pd.DataFrame(data=final_inputs, columns=headers)

# Show the (rows, columns) shape and preview the first five rows.
print(testing_df.shape)
testing_df.head(5)

(131406, 84)


Unnamed: 0,team1_id,team2_id,Pts,FG%2,FGA2,FG%3,FGA3,FT%,FTA,OR,...,xFGA3_a,xFTA_a,xOR_a,xDR_a,xAst_a,xTO_a,xStl_a,xBlk_a,xPF_a,xrecW%
0,1101_2023,1102_2023,0.805319,0.716581,0.842,0.60805,0.542424,0.758318,0.63125,0.513333,...,0.224242,-0.975,-0.813333,-0.187879,0.047619,-0.835,0.308333,-0.228571,-0.842857,-0.2
1,1102_2023,1101_2023,0.680851,0.837502,0.54,0.647953,0.669697,0.716463,0.421875,0.273333,...,-0.375758,1.00625,0.793333,-0.366667,-0.661905,0.325,0.108333,0.057143,0.703571,-0.4
2,1101_2023,1103_2023,0.805319,0.716581,0.842,0.60805,0.542424,0.758318,0.63125,0.513333,...,0.363636,-0.025,-0.286667,-0.112121,0.609524,-0.125,0.416667,-0.285714,-0.407143,-0.7
3,1103_2023,1101_2023,0.796809,0.750199,0.652,0.714995,0.760606,0.755332,0.5375,0.6,...,-0.79697,0.24375,-0.413333,-0.124242,-0.6,0.415,-0.433333,0.957143,0.089286,-0.4
4,1101_2023,1104_2023,0.805319,0.716581,0.842,0.60805,0.542424,0.758318,0.63125,0.513333,...,-0.045455,-0.596875,0.473333,-0.190909,-0.071429,-0.755,0.391667,-0.157143,-0.671429,-0.8


In [26]:
# Write the testing table to CSV without the index column.
# The file prefix follows the `mens` flag ('M' / 'W', matching the input file naming) so a
# women's run does not overwrite the men's file; with mens=True the path is unchanged:
# "prepped/M_testing_embedding_2023_.csv".
gender_prefix = "M" if mens else "W"
file_name = f"prepped/{gender_prefix}_testing_embedding_2023_.csv"

# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
testing_df.to_csv(file_name, index=False)