In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw competition tables once. The functions defined below read these
# names as module-level globals rather than taking them as arguments.
massey = pd.read_csv('./MasseyOrdinals.csv')
season = pd.read_csv('./DataFiles/RegularSeasonDetailedResults.csv')
teams = pd.read_csv('./DataFiles/Teams.csv')
coaches = pd.read_csv('./DataFiles/TeamCoaches.csv')
tourney = pd.read_csv('./DataFiles/NCAATourneyCompactResults.csv')
seeds = pd.read_csv('./DataFiles/NCAATourneySeeds.csv')

In [3]:
def get_records(year):
    """Return the regular-season win/loss record of every team that played in `year`.

    Reads the module-level `season` table.

    Returns a DataFrame with one row per team and the columns
    ``TeamID``, ``Wins``, ``Losses`` and ``Percentage`` (wins / games, rounded
    to three decimals), ordered by ``TeamID``. A season with no games yields
    an empty frame with those columns.
    """
    df_season = season[season['Season'] == year]
    # Count each side once instead of recounting inside a per-team loop.
    win_counts = df_season['WTeamID'].value_counts()
    loss_counts = df_season['LTeamID'].value_counts()
    # Undefeated or winless teams appear on only one side, so take the union
    # of both and fill the missing side with zero.
    team_ids = win_counts.index.union(loss_counts.index)
    record = pd.DataFrame(
        {
            'TeamID': team_ids.values,
            'Wins': win_counts.reindex(team_ids, fill_value=0).values,
            'Losses': loss_counts.reindex(team_ids, fill_value=0).values,
        },
        columns=['TeamID', 'Wins', 'Losses'],
    )
    # Percentage is handy for sorting; it is likely dropped in an actual model.
    record['Percentage'] = (record['Wins'] / (record['Wins'] + record['Losses'])).round(3)
    return record

In [4]:
def get_rankings(year):
    """Return each team's mean final ordinal ranking for `year`.

    Reads the module-level `massey` table. Only rows from the latest
    ``RankingDayNum`` of the season are used, since those are closest to the
    tournament. Ranking systems that have fewer rows on that day than the
    largest system are discarded, because a system that does not rank every
    team would skew the mean.

    Returns a DataFrame sorted by ``TeamID`` with the columns ``TeamID``,
    ``MeanRank`` (rounded to four decimals) and ``AbsMeanRank`` (1 for the
    best mean rank, counting up). Teams tied on ``MeanRank`` are ordered by
    ``TeamID``.
    """
    df_season = massey[massey['Season'] == year]
    final_rank = df_season[df_season['RankingDayNum'] == df_season['RankingDayNum'].max()]
    # Keep only the systems with the maximum row count. Filtering with a mask
    # builds a new frame, which avoids in-place edits on a slice (the source
    # of pandas' SettingWithCopyWarning).
    system_sizes = final_rank['SystemName'].value_counts()
    complete_systems = system_sizes[system_sizes == system_sizes.max()].index
    final_rank = final_rank[final_rank['SystemName'].isin(complete_systems)]
    # One grouped mean replaces a per-team filter-and-average loop.
    mean_rank = final_rank.groupby('TeamID')['OrdinalRank'].mean().round(4)
    df_rankings = mean_rank.rename('MeanRank').reset_index()
    # A stable sort makes the tie order deterministic.
    df_sort = df_rankings.sort_values('MeanRank', kind='mergesort')
    # Position in the sorted list; likely dropped in a model but useful for comparison.
    df_sort['AbsMeanRank'] = np.arange(1, len(df_sort) + 1)
    return df_sort.sort_values("TeamID")

In [5]:
def get_team_stat(teamID, year):
    """Return the per-game stats of `teamID` for `year`, from the team's point of view.

    Reads the module-level `season` table. Every ``W``-prefixed column (after
    ``WLoc`` is dropped) defines a stat name; for each game the team's own
    value is stored under the bare stat name and the other side's value under
    ``Opp`` + stat name, regardless of who won.

    Returns a DataFrame indexed by the original row labels of `season`
    (games won first, then games lost) with the team columns followed by the
    ``Opp...`` columns. Numeric dtypes of the source columns are preserved.
    """
    # The tournament is played at neutral sites, so game location is ignored.
    df_season = season[season['Season'] == year].drop('WLoc', axis=1)
    statcol = [c[1:] for c in df_season.columns if c.startswith('W')]
    oppcol = ['Opp' + s for s in statcol]

    def _from_perspective(games, own, other):
        # Relabel the `own`-prefixed columns as the team's stats and the
        # `other`-prefixed columns as the opponent's stats.
        mapping = {own + s: s for s in statcol}
        mapping.update({other + s: 'Opp' + s for s in statcol})
        return games.rename(columns=mapping)[statcol + oppcol]

    won = df_season[df_season['WTeamID'] == teamID]
    lost = df_season[df_season['LTeamID'] == teamID]
    # Build the result in two vectorised pieces instead of growing an empty
    # frame one row at a time.
    return pd.concat([_from_perspective(won, 'W', 'L'), _from_perspective(lost, 'L', 'W')])

In [6]:
def get_stat_avg(df_teamseas, teamID):
    """Collapse a per-game stat frame into a single row of column means.

    `df_teamseas` must contain ``TeamID`` and ``OppTeamID`` columns; they are
    removed before averaging because identifiers should not be averaged.
    Every remaining column is averaged and rounded to three decimals.

    Returns a one-row DataFrame whose index label is `teamID`.
    """
    stat_frame = df_teamseas.drop(columns=["TeamID", "OppTeamID"])
    column_means = {name: round(stat_frame[name].mean(), 3) for name in stat_frame.columns}
    summary = pd.DataFrame(columns=stat_frame.columns)
    summary.loc[teamID] = column_means
    return summary

In [7]:
def get_opp_avg(oppteamID, baseteamID, year):
    """Return `oppteamID`'s season averages for `year`, excluding games against `baseteamID`.

    The head-to-head game (or games) is left out so it does not bias the
    averages that `baseteamID` is later compared against.

    Returns the one-row frame produced by `get_stat_avg`, indexed by
    `oppteamID`, with renamed columns: the opponent's own stats become
    ``OppAvg<stat>`` and the stats it allowed (the ``Opp<stat>`` columns)
    become ``OppDAvg<stat>``.
    """
    opp_games = get_team_stat(oppteamID, year)
    # Keep every game that was not played against the base team.
    other_games = opp_games[opp_games['OppTeamID'] != baseteamID]
    df_oppavg = get_stat_avg(other_games, oppteamID)
    df_oppavg.columns = [
        col.replace("Opp", "OppDAvg") if 'Opp' in col else "OppAvg" + col
        for col in df_oppavg.columns
    ]
    return df_oppavg

In [8]:
def against_opp_avg(teamID, year):
    """Compare each of `teamID`'s games in `year` with the opponent's usual numbers.

    For every game and every stat column (the ``TeamID`` columns are skipped):

    * ``AgstOppDefAvg<stat>`` is the team's value minus what that opponent
      allowed on average in its other games (``OppDAvg<stat>``).
    * ``AgstOppOffAvgOpp<stat>`` is the opponent's value in this game minus
      the opponent's own average in its other games (``OppAvg<stat>``).

    Most stats are offensive, so "defensive/offensive average" is a generic
    label rather than an exact description. All differences are rounded to
    three decimals.

    Returns a DataFrame with one row per game, sharing the index of
    `get_team_stat(teamID, year)`. A team that consistently beats what its
    opponents usually allow is, in all likelihood, the better team.
    """
    team_df = get_team_stat(teamID, year)
    stat_cols = [col for col in team_df.columns if 'TeamID' not in col]

    def _label(col):
        return ("AgstOppOffAvg" if 'Opp' in col else "AgstOppDefAvg") + col

    aoa_col = [_label(col) for col in stat_cols]

    # An opponent's averages (excluding games against this team) are the same
    # for every meeting, so compute them once per opponent rather than per game.
    opp_avg_cache = {}
    rows = []
    for i in team_df.index:
        oppteam = team_df.loc[i, 'OppTeamID']
        if oppteam not in opp_avg_cache:
            opp_avg_cache[oppteam] = get_opp_avg(oppteam, teamID, year).loc[oppteam]
        opp_avg = opp_avg_cache[oppteam]
        row = {}
        for col in stat_cols:
            if 'Opp' in col:
                # How the opponent did compared with its own usual output.
                baseline = opp_avg["OppAvg" + col.replace("Opp", "")]
            else:
                # How the team did compared with what the opponent usually allows.
                baseline = opp_avg["OppDAvg" + col]
            row[_label(col)] = round(team_df.loc[i, col] - baseline, 3)
        rows.append(row)
    # Build the frame once instead of appending a row per game.
    return pd.DataFrame(rows, index=team_df.index, columns=aoa_col)

In [9]:
def get_stats(teamID, year):
    """Return a one-row frame of season averages for `teamID` in `year`.

    Joins the per-game stats from `get_team_stat` with the per-game
    differences from `against_opp_avg` on their shared index, then collapses
    everything into averages with `get_stat_avg` (indexed by `teamID`).
    """
    per_game = get_team_stat(teamID, year)
    versus_opp_avg = against_opp_avg(teamID, year)
    combined = per_game.join(versus_opp_avg)
    return get_stat_avg(combined, teamID)

In [10]:
def get_season_stats(allteams, year):
    """Compile the averaged stats of every team listed in `allteams` for `year`.

    `allteams` is a DataFrame with a ``TeamID`` column; `get_stats` is called
    once per row, in order.

    This can take a considerable amount of time, because every team has to
    look at every opponent it played. A progress line is therefore printed
    whenever the number of teams left is a multiple of four, and the compiled
    results are meant to be saved to CSV rather than recomputed.

    Returns a DataFrame with a ``TeamID`` column followed by the stat
    columns, one row per team. An empty `allteams` yields an empty frame with
    only the ``TeamID`` column.
    """
    team_ids = list(allteams['TeamID'])
    if not team_ids:
        return pd.DataFrame(columns=['TeamID'])
    total = len(team_ids)
    frames = []
    for position, team in enumerate(team_ids):
        frames.append(get_stats(team, year))
        remaining = total - position
        # No message for the very first team, matching the original cadence.
        if position > 0 and remaining % 4 == 0:
            print(str(remaining) + " teams left to compile")
    # Concatenate once at the end instead of re-copying the frame per team.
    df_stats = pd.concat(frames)
    return df_stats.reset_index().rename(columns={"index": "TeamID"})

In [11]:
def get_coach(teamID, year):
    """Return the name of the coach who finished season `year` with `teamID`.

    Reads the module-level `coaches` table. Roster turnover makes a team's
    year-to-year history misleading, whereas a coach's history can be a major
    factor in the tournament, so other functions key on the coach's name.

    When a team had several coaches in one season, the one with the largest
    ``LastDayNum`` is returned. Comparing within the team's own rows (rather
    than against a table-wide maximum) also covers seasons whose data stops
    before the final day, so no year needs special treatment.

    Raises IndexError if the table has no coach for that team and season.
    """
    team_coach = coaches[(coaches['Season'] == year) & (coaches['TeamID'] == teamID)]
    if len(team_coach.index) == 0:
        raise IndexError(
            "no coach listed for team {} in season {}".format(teamID, year)
        )
    last_stint = team_coach['LastDayNum'].values.argmax()
    return team_coach['CoachName'].values[last_stint]

In [12]:
def get_coach_exp(teamID, year):
    """Summarise the experience of `teamID`'s coach going into the `year` tournament.

    Reads the module-level `coaches` table.

    * ``coach_exp`` counts the coach's rows in `coaches` with
      ``Season <= year``.
    * The tournament columns sum `get_tourney_results` over every row the
      coach has with ``Season < year``, whichever team he was coaching then.

    Returns a one-row DataFrame indexed by `teamID` so it can be combined
    with the output of the other per-team functions.
    """
    stage_cols = ['made_tournament', 'first_round', 'second_round', 'sweet_sixteen',
                  'elite_eight', 'final_four', 'championship_game', 'champions']
    coach = get_coach(teamID, year)
    rows_per_coach = coaches.loc[coaches['Season'] <= year, 'CoachName'].value_counts()
    earlier_seasons = coaches[coaches['Season'] < year]
    coach_hist = earlier_seasons[earlier_seasons['CoachName'] == coach]
    # Start from an empty frame so a coach with no history still has every column.
    yearly_results = [pd.DataFrame(columns=stage_cols)]
    for past_team, past_season in zip(coach_hist['TeamID'], coach_hist['Season']):
        yearly_results.append(get_tourney_results(past_team, past_season))
    coach_trny = pd.concat(yearly_results)
    totals = {'coach_exp': rows_per_coach[coach]}
    totals.update({col: coach_trny[col].sum() for col in coach_trny.columns})
    df_coach_trnyexp = pd.DataFrame(columns=['coach_exp'] + stage_cols)
    df_coach_trnyexp.loc[teamID] = totals
    return df_coach_trnyexp

In [13]:
def get_tourney_results(teamID, year):
    """Return 0/1 flags for how far `teamID` went in the `year` tournament.

    Reads the module-level `tourney` table and relies on its fixed day
    numbers: day 154 is the championship game, days above 150 are the Final
    Four onward, days above 140 are the Sweet Sixteen onward, and days up to
    135 are the play-in ("First Four") games.

    A team that appears as a winner in a stage reached the following stage;
    a team that only appears as a loser reached that stage and stopped. A
    team that lost a play-in game made the tournament without reaching the
    first round, and a team in none of the lists did not make it at all.

    Returns a one-row DataFrame indexed by `year` with the columns
    ``made_tournament``, ``first_round``, ``second_round``,
    ``sweet_sixteen``, ``elite_eight``, ``final_four``,
    ``championship_game`` and ``champions``.
    """
    stage_cols = ['made_tournament', 'first_round', 'second_round', 'sweet_sixteen',
                  'elite_eight', 'final_four', 'championship_game', 'champions']
    tourney_yr = tourney[tourney['Season'] == year]

    def _listed(games, column):
        # True when the team appears in `column` of the given games.
        return teamID in games[column].values

    title_game = tourney_yr[tourney_yr['DayNum'] == 154]
    final_four = tourney_yr[tourney_yr['DayNum'] > 150]
    sweet_sixteen = tourney_yr[tourney_yr['DayNum'] > 140]
    first_four = tourney_yr[tourney_yr['DayNum'] <= 135]
    full_tourney = tourney_yr[tourney_yr['DayNum'] > 135]

    won_title = _listed(title_game, 'WTeamID')
    won_ff, lost_ff = _listed(final_four, 'WTeamID'), _listed(final_four, 'LTeamID')
    won_late, lost_late = _listed(sweet_sixteen, 'WTeamID'), _listed(sweet_sixteen, 'LTeamID')
    won_main = _listed(full_tourney, 'WTeamID')
    lost_any = _listed(tourney_yr, 'LTeamID')
    lost_play_in = _listed(first_four, 'LTeamID')

    results = {
        # Any win in the main bracket or any loss at all means the team was in the field.
        'made_tournament': int(won_main or lost_any),
        # Losing a play-in game means never reaching the first round.
        'first_round': int(won_main or (lost_any and not lost_play_in)),
        'second_round': int(won_main),
        'sweet_sixteen': int(won_late or lost_late),
        'elite_eight': int(won_late),
        'final_four': int(won_ff or lost_ff),
        'championship_game': int(won_ff),
        'champions': int(won_title),
    }
    df_results = pd.DataFrame(columns=stage_cols)
    df_results.loc[year] = results
    return df_results

In [14]:
def get_coaches(allteams, year):
    """Collect the coach-experience row of every team in `allteams` for `year`.

    `allteams` is a DataFrame with a ``TeamID`` column; `get_coach_exp` is
    called once per row, in order.

    Returns a DataFrame with a ``TeamID`` column followed by the coach
    columns, one row per team. An empty `allteams` yields an empty frame with
    only the ``TeamID`` column.
    """
    team_ids = list(allteams['TeamID'])
    if not team_ids:
        return pd.DataFrame(columns=['TeamID'])
    # Concatenate once instead of re-copying the growing frame for each team.
    df_coaches = pd.concat([get_coach_exp(team, year) for team in team_ids])
    return df_coaches.reset_index().rename(columns={'index': 'TeamID'})

In [15]:
def get_seeds(year):
    """Return the numeric tournament seed of every seeded team in `year`.

    Reads the module-level `seeds` table. The seed number is taken from
    characters 1-2 of the ``Seed`` string (the first character and anything
    after the two digits are ignored) and converted to an integer.

    Returns a DataFrame with the columns ``TeamID`` and ``Seed``, one row per
    seeded team.
    """
    tourney_seeds = seeds[seeds['Season'] == year]
    # Slice and convert the whole column at once instead of growing a frame row by row.
    seed_numbers = tourney_seeds['Seed'].str[1:3].astype(int)
    return pd.DataFrame(
        {'TeamID': tourney_seeds['TeamID'].values, 'Seed': seed_numbers.values},
        columns=['TeamID', 'Seed'],
    )

In [16]:
def assemble_rankings(year):
    """Return every team active in Division I in `year` with its record and mean ranking.

    Reads the module-level `teams` table, keeps the rows whose
    ``FirstD1Season``..``LastD1Season`` span covers `year`, drops those two
    columns, and merges in `get_records(year)` and `get_rankings(year)` on
    ``TeamID``.

    The ranking has to be computed on the full set of teams before cutting
    down to the tournament field. The record could be added before or after
    that cut; as written it comes before. The heavier statistics are computed
    later on the tournament teams only, to save time.
    """
    active_in_year = (teams['FirstD1Season'] <= year) & (teams['LastD1Season'] >= year)
    roster = teams.loc[active_in_year].drop(columns=['FirstD1Season', 'LastD1Season'])
    with_record = roster.merge(get_records(year), on='TeamID')
    return with_record.merge(get_rankings(year), on='TeamID')

In [17]:
def assemble_teams(df_rankings, year):
    """Cut `df_rankings` down to the teams seeded in the `year` tournament and add their seeds.

    Reads the module-level `seeds` table. Seeding exists only for teams that
    made the tournament (otherwise nulls would have to be generated), which
    is why it is attached here rather than earlier.

    Returns the rows of `df_rankings` whose ``TeamID`` is seeded in `year`,
    merged with `get_seeds(year)` on ``TeamID``.
    """
    tourney_team_ids = seeds.loc[seeds['Season'] == year, 'TeamID']
    # A vectorised membership test replaces the row-by-row list lookup.
    in_field = df_rankings['TeamID'].isin(tourney_team_ids)
    return df_rankings[in_field].merge(get_seeds(year), on="TeamID")

In [18]:
def assemble_stats(df_teams, year):
    """Attach coach experience and season statistics to `df_teams` for `year`.

    `df_teams` is expected to hold only the tournament teams, which keeps
    the expensive per-team calculations short. The coach and stat frames are
    merged in on ``TeamID``, in that order.

    Returns the merged frame, intended as the eventual model input.
    """
    coach_features = get_coaches(df_teams, year)
    season_features = get_season_stats(df_teams, year)
    with_coaches = df_teams.merge(coach_features, on='TeamID')
    return with_coaches.merge(season_features, on='TeamID')

In [19]:
# Build the 2017 table: rankings for all teams -> tournament teams with seeds
# -> coach experience and season statistics.
model_df2017 = assemble_stats(assemble_teams(assemble_rankings(2017), 2017), 2017)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


64 teams left to compile
60 teams left to compile
56 teams left to compile
52 teams left to compile
48 teams left to compile
44 teams left to compile
40 teams left to compile
36 teams left to compile
32 teams left to compile
28 teams left to compile
24 teams left to compile
20 teams left to compile
16 teams left to compile
12 teams left to compile
8 teams left to compile
4 teams left to compile


In [20]:
# Display the assembled 2017 table as a sanity check.
model_df2017

Unnamed: 0,TeamID,TeamName,Wins,Losses,Percentage,MeanRank,AbsMeanRank,Seed,coach_exp,made_tournament,...,AgstOppOffAvgOppFGA3,AgstOppOffAvgOppFTM,AgstOppOffAvgOppFTA,AgstOppOffAvgOppOR,AgstOppOffAvgOppDR,AgstOppOffAvgOppAst,AgstOppOffAvgOppTO,AgstOppOffAvgOppStl,AgstOppOffAvgOppBlk,AgstOppOffAvgOppPF
0,1112,Arizona,30,4,0.882,12.0286,10,2,13,9,...,0.128,-2.660,-3.536,-0.922,-5.587,-2.019,-0.681,-1.337,-1.074,1.683
1,1116,Arkansas,25,9,0.735,35.4857,30,8,15,7,...,1.108,1.145,1.697,1.555,-1.936,-0.563,-0.030,-1.317,-0.740,-0.072
2,1124,Baylor,24,7,0.774,14.4857,14,3,15,6,...,-0.992,-2.788,-4.474,-0.813,-6.069,-1.322,-1.827,-1.582,-0.427,-0.460
3,1137,Bucknell,26,8,0.765,76.5571,74,13,2,0,...,-3.524,-0.761,0.013,0.638,-0.835,-1.400,0.225,0.609,0.167,0.660
4,1139,Butler,23,8,0.742,21.8857,23,4,6,2,...,-0.784,-0.454,-0.365,-1.267,-3.106,-4.179,0.724,-1.419,-0.937,0.621
5,1153,Cincinnati,29,5,0.853,19.0571,20,6,14,8,...,-0.841,-2.769,-4.525,-0.226,-3.171,-2.621,1.017,-2.418,-0.575,-1.207
6,1166,Creighton,24,9,0.727,25.8857,26,6,16,6,...,1.495,-3.317,-3.547,1.051,0.233,-0.918,1.471,0.274,-0.532,-1.056
7,1173,Dayton,23,7,0.767,38.6286,33,7,6,3,...,1.154,-1.107,-1.471,-0.483,-0.607,-0.713,1.763,-1.665,-0.240,0.991
8,1181,Duke,27,8,0.771,8.6429,6,2,33,31,...,-3.860,-1.849,-2.375,-0.194,-3.120,-2.420,-0.270,-1.486,-1.559,2.063
9,1190,ETSU,25,7,0.781,64.9429,63,13,2,0,...,-1.299,1.222,1.541,-0.879,-1.807,-3.065,2.897,0.629,0.612,1.620


In [21]:
# Save the 2017 table. The row index is written too (no index=False), so the
# file gains an unnamed first column that a later clean-up cell removes.
model_df2017.to_csv('./GeneratedDatasets/generated2017.csv')

In [22]:
def compile_stats(year):
    """Build the feature table for `year` and write it to the generated-datasets folder.

    The file is written with its row index, so it gains an unnamed first
    column; a later cell in this notebook strips that column from every file.
    Returns nothing.
    """
    out_path = './GeneratedDatasets/generated' + str(year) + '.csv'
    tourney_teams = assemble_teams(assemble_rankings(year), year)
    features = assemble_stats(tourney_teams, year)
    features.to_csv(out_path)

In [23]:
# Generate and save one CSV per season from 2003 through 2016 (the range end
# is exclusive; 2017 was saved separately above).
for yr in range(2003, 2017):
    compile_stats(yr)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


64 teams left to compile
60 teams left to compile
56 teams left to compile
52 teams left to compile
48 teams left to compile
44 teams left to compile
40 teams left to compile
36 teams left to compile
32 teams left to compile
28 teams left to compile
24 teams left to compile
20 teams left to compile
16 teams left to compile
12 teams left to compile
8 teams left to compile
4 teams left to compile
64 teams left to compile
60 teams left to compile
56 teams left to compile
52 teams left to compile
48 teams left to compile
44 teams left to compile
40 teams left to compile
36 teams left to compile
32 teams left to compile
28 teams left to compile
24 teams left to compile
20 teams left to compile
16 teams left to compile
12 teams left to compile
8 teams left to compile
4 teams left to compile
64 teams left to compile
60 teams left to compile
56 teams left to compile
52 teams left to compile
48 teams left to compile
44 teams left to compile
40 teams left to compile
36 teams left to compile
32 t

In [24]:
# Newly updated data for 2018: swap in the updated files. All previous data
# should remain accurate.
# NOTE: this rebinds the module-level massey/season/coaches/seeds tables that
# the functions above read, so every later call uses these files. `teams` and
# `tourney` are not reloaded here.
massey = pd.read_csv('./MasseyOrdinals_thruSeason2018_Day128.csv')
season = pd.read_csv('./Stage2UpdatedDataFiles/RegularSeasonDetailedResults.csv')
coaches = pd.read_csv('./Stage2UpdatedDataFiles/TeamCoaches.csv')
seeds = pd.read_csv('./Stage2UpdatedDataFiles/NCAATourneySeeds.csv')

In [26]:
# Generate and save the 2018 table from the tables reloaded in the previous cell.
compile_stats(2018)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


64 teams left to compile
60 teams left to compile
56 teams left to compile
52 teams left to compile
48 teams left to compile
44 teams left to compile
40 teams left to compile
36 teams left to compile
32 teams left to compile
28 teams left to compile
24 teams left to compile
20 teams left to compile
16 teams left to compile
12 teams left to compile
8 teams left to compile
4 teams left to compile


In [29]:
# Read the saved 2018 file back in to check what was written.
model_df2018 = pd.read_csv('./GeneratedDatasets/generated2018.csv')

In [30]:
# Inspect the reloaded file. Calling drop() with no labels raises in pandas,
# so show the first rows instead; the stray index column is visible here.
model_df2018.head(10)

Unnamed: 0.1,Unnamed: 0,TeamID,TeamName,Wins,Losses,Percentage,MeanRank,AbsMeanRank,Seed,coach_exp,...,AgstOppOffAvgOppFGA3,AgstOppOffAvgOppFTM,AgstOppOffAvgOppFTA,AgstOppOffAvgOppOR,AgstOppOffAvgOppDR,AgstOppOffAvgOppAst,AgstOppOffAvgOppTO,AgstOppOffAvgOppStl,AgstOppOffAvgOppBlk,AgstOppOffAvgOppPF
0,0,1104,Alabama,19,15,0.559,54.8361,54,9,3,...,0.110,-0.504,0.414,0.683,-1.201,-2.441,0.552,0.544,-0.570,2.165
1,1,1112,Arizona,27,7,0.794,23.2459,23,4,14,...,-1.162,-1.244,-2.164,-1.519,-5.460,-1.259,-1.341,0.125,-1.248,1.558
2,2,1113,Arizona St,20,11,0.645,42.1311,42,11,5,...,3.128,-0.282,-0.431,1.332,0.073,1.719,1.997,-0.569,-0.260,3.339
3,3,1116,Arkansas,23,11,0.676,30.3443,28,7,16,...,1.794,2.536,3.442,0.175,-1.120,-0.352,1.456,-1.378,-0.524,1.130
4,4,1120,Auburn,25,7,0.781,12.0984,11,4,14,...,1.319,1.693,1.829,-0.402,-0.943,-0.717,2.584,0.277,-0.711,2.411
5,5,1137,Bucknell,25,9,0.735,98.7377,95,14,3,...,-2.277,1.928,3.075,1.210,0.278,-1.057,-0.355,0.572,0.289,3.547
6,6,1138,Buffalo,25,8,0.758,72.3279,71,13,3,...,-1.593,4.007,4.893,0.240,0.044,-1.730,1.096,-0.517,0.435,0.785
7,7,1139,Butler,20,13,0.606,31.9016,30,10,2,...,-1.588,0.367,0.128,-2.058,-1.048,-2.315,1.690,-1.198,-0.293,-0.779
8,8,1153,Cincinnati,30,4,0.882,5.9508,5,2,15,...,-1.764,-3.749,-4.876,-0.132,-3.762,-3.708,2.104,-1.339,-0.491,0.490
9,9,1155,Clemson,23,9,0.719,19.5410,18,5,16,...,-0.186,-3.215,-4.095,-0.968,-1.567,-3.268,-0.317,-0.517,-0.245,0.643


In [31]:
# Strip the stray index column ('Unnamed: 0') that to_csv wrote into each
# generated file, then save the file back without an index. errors='ignore'
# makes this cell safe to re-run once the column is already gone.
for yr in range(2003, 2019):
    csv_path = './GeneratedDatasets/generated' + str(yr) + '.csv'
    model_yr = pd.read_csv(csv_path)
    model_yr = model_yr.drop(columns='Unnamed: 0', errors='ignore')
    model_yr.to_csv(csv_path, index=False)

In [32]:
# Reload the 2018 file after the clean-up to confirm the extra column is gone.
model_df2018 = pd.read_csv('./GeneratedDatasets/generated2018.csv')

In [33]:
# Display the cleaned 2018 table.
model_df2018

Unnamed: 0,TeamID,TeamName,Wins,Losses,Percentage,MeanRank,AbsMeanRank,Seed,coach_exp,made_tournament,...,AgstOppOffAvgOppFGA3,AgstOppOffAvgOppFTM,AgstOppOffAvgOppFTA,AgstOppOffAvgOppOR,AgstOppOffAvgOppDR,AgstOppOffAvgOppAst,AgstOppOffAvgOppTO,AgstOppOffAvgOppStl,AgstOppOffAvgOppBlk,AgstOppOffAvgOppPF
0,1104,Alabama,19,15,0.559,54.8361,54,9,3,0,...,0.110,-0.504,0.414,0.683,-1.201,-2.441,0.552,0.544,-0.570,2.165
1,1112,Arizona,27,7,0.794,23.2459,23,4,14,10,...,-1.162,-1.244,-2.164,-1.519,-5.460,-1.259,-1.341,0.125,-1.248,1.558
2,1113,Arizona St,20,11,0.645,42.1311,42,11,5,1,...,3.128,-0.282,-0.431,1.332,0.073,1.719,1.997,-0.569,-0.260,3.339
3,1116,Arkansas,23,11,0.676,30.3443,28,7,16,8,...,1.794,2.536,3.442,0.175,-1.120,-0.352,1.456,-1.378,-0.524,1.130
4,1120,Auburn,25,7,0.781,12.0984,11,4,14,8,...,1.319,1.693,1.829,-0.402,-0.943,-0.717,2.584,0.277,-0.711,2.411
5,1137,Bucknell,25,9,0.735,98.7377,95,14,3,1,...,-2.277,1.928,3.075,1.210,0.278,-1.057,-0.355,0.572,0.289,3.547
6,1138,Buffalo,25,8,0.758,72.3279,71,13,3,1,...,-1.593,4.007,4.893,0.240,0.044,-1.730,1.096,-0.517,0.435,0.785
7,1139,Butler,20,13,0.606,31.9016,30,10,2,0,...,-1.588,0.367,0.128,-2.058,-1.048,-2.315,1.690,-1.198,-0.293,-0.779
8,1153,Cincinnati,30,4,0.882,5.9508,5,2,15,9,...,-1.764,-3.749,-4.876,-0.132,-3.762,-3.708,2.104,-1.339,-0.491,0.490
9,1155,Clemson,23,9,0.719,19.5410,18,5,16,4,...,-0.186,-3.215,-4.095,-0.968,-1.567,-3.268,-0.317,-0.517,-0.245,0.643
