In [1]:
import pandas as pd

In [2]:
# Import the master data file into a dataframe
master_df = pd.read_csv('data/masterFinalData11_24_2020.csv')

In [3]:
print(master_df.columns)
master_df.head()

Index(['Unnamed: 0', 'gameID', 'statsID', 'playerID', 'playerFullName',
       'playerFirstName', 'playerLastName', 'playerPosition', 'age',
       'playerHeightFeet', 'playerHeightInches', 'playerWeightPounds',
       'assists', 'blocks', 'defensiveRebounds', 'threePointFieldGoalPercent',
       'threePointFieldGoalAttempted', 'threePointFieldGoal',
       'fieldGoalPercent', 'fieldGoalAttempt', 'fieldGoalMade',
       'freeThrowPercent', 'freeThrowAttempt', 'freeThrowMade', 'minutes',
       'offensiveRebound', 'personalFouls', 'points', 'rebounds', 'steals',
       'teamID', 'turnovers', 'winner_x', 'date', 'homeTeamScore',
       'visitorTeamScore', 'season_y', 'time', 'homeTeamID', 'homeTeamAbbrev',
       'homeTeamCity', 'homeTeamName', 'visitorTeamID', 'visitorTeamAbbrev',
       'visitorTeamCity', 'visitorTeamName', 'team_abbreviation',
       'playerHeight', 'playerWeight', 'college', 'country', 'draft_year',
       'draft_round', 'draft_number', 'gp', 'pts', 'reb', 'ast', 'ne

Unnamed: 0.1,Unnamed: 0,gameID,statsID,playerID,playerFullName,playerFirstName,playerLastName,playerPosition,age,playerHeightFeet,...,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct
0,0,27276,708256,364,Chandler Parsons,Chandler,Parsons,F,26.0,6.0,...,66.0,15.7,4.9,2.4,4.6,0.032,0.132,0.205,0.567,0.111
1,1,27276,708257,346,Dirk Nowitzki,Dirk,Nowitzki,F,37.0,7.0,...,77.0,17.3,5.9,1.9,4.3,0.021,0.206,0.251,0.56,0.101
2,2,27276,708258,94,Tyson Chandler,Tyson,Chandler,C,32.0,7.0,...,75.0,10.3,11.5,1.1,5.5,0.141,0.271,0.128,0.697,0.053
3,3,27276,708259,1635,Monta Ellis,Monta,Ellis,,29.0,,...,80.0,18.9,2.4,4.1,4.2,0.014,0.063,0.277,0.509,0.2
4,4,27276,708260,1533,Jameer Nelson,Jameer,Nelson,,33.0,,...,63.0,8.3,2.3,4.0,-0.6,0.025,0.091,0.199,0.503,0.278


In [4]:
# Create the empty dataframe that will be used for machine learning
columns = ['gameID', 'homeTeamID', 'visitorTeamID', 'homeTeamHeightAverage', 'homeTeamWeightAverage', 'homeTeamAgeAverage', 'visitorTeamHeightAverage', 'visitorTeamWeightAverage', 'visitorTeamAgeAverage', 'homeTeamWin']
ml_df = pd.DataFrame(columns=columns)
ml_df.head()



Unnamed: 0,gameID,homeTeamID,visitorTeamID,homeTeamHeightAverage,homeTeamWeightAverage,homeTeamAgeAverage,visitorTeamHeightAverage,visitorTeamWeightAverage,visitorTeamAgeAverage,homeTeamWin


In [5]:
# Get rid of unneeded columns
#jason added minutes

master_clean_df = master_df[['gameID', 'playerID', 'teamID', 'age', 'playerHeight', 'playerWeight', 'minutes','homeTeamScore', 
                             'visitorTeamScore', 'homeTeamID', 'visitorTeamID',  'winner_x',]]
master_clean_df.head()

Unnamed: 0,gameID,playerID,teamID,age,playerHeight,playerWeight,minutes,homeTeamScore,visitorTeamScore,homeTeamID,visitorTeamID,winner_x
0,27276,364,7,26.0,205.74,102.965384,28:29,108,87,25,7,0
1,27276,346,7,37.0,213.36,111.13004,27:04,108,87,25,7,0
2,27276,94,7,32.0,215.9,108.86208,28:35,108,87,25,7,0
3,27276,1635,7,29.0,190.5,83.91452,26:32,108,87,25,7,0
4,27276,1533,7,33.0,182.88,86.18248,25:22,108,87,25,7,0


In [6]:
#minutes is an object, so splitting it up into minutes and seconds
splitMinutes = master_clean_df['minutes'].str.split(":", n=1, expand = True)
df_withTime = master_clean_df.copy()
df_withTime['minutesPlayed'] = splitMinutes[0]
df_withTime['secondsPlayed'] = splitMinutes[1]

In [7]:
#dropping any players who didn't actually play in the game.
df_withTime = df_withTime.dropna()

In [8]:
#Filling remaining NaN with 0
df_withTime['minutesPlayed'].fillna(0)
df_withTime['secondsPlayed'].fillna(0)

#casting as integers and converting into minutes with fractional minutes from seconds.
final_df = df_withTime.copy()
final_df['minutesPlayed']= final_df['minutesPlayed'].astype('int32')
final_df['secondsPlayed']= final_df['secondsPlayed'].astype('int32')
final_df['gamePlaytime'] = final_df['minutesPlayed'] + (final_df['secondsPlayed']/60)

In [9]:
#converting the time into a weighted coefficient. 240 is total players game time so values when added will give
#a weighted average age, height and weight.  Must be SUMMED though not averaged as the weight already averages.

final_df = final_df.drop(columns = ['minutesPlayed', 'secondsPlayed', 'minutes'])
final_df['gameAge'] = final_df['gamePlaytime']/240 * final_df['age']
final_df['gameHeight'] = final_df['gamePlaytime']/240 * final_df['playerHeight']
final_df['gameWeight'] = final_df['gamePlaytime']/240 * final_df['playerWeight']

#cleaning up dataframe and putting the columns exactly how the input df want it labeled.
final_df = final_df.drop(columns = ['age', 'playerHeight', 'playerWeight', 'gamePlaytime'])
final_df = final_df.rename(columns = {'gameAge': 'age', 'gameHeight': 'playerHeight', 'gameWeight': 'playerWeight'})

In [10]:
#checking out the stuff.
final_df.head()
             

Unnamed: 0,gameID,playerID,teamID,homeTeamScore,visitorTeamScore,homeTeamID,visitorTeamID,winner_x,age,playerHeight,playerWeight
0,27276,364,7,108,87,25,7,0,3.085694,24.417338,12.219989
1,27276,346,7,108,87,25,7,0,4.172778,24.062267,12.532999
2,27276,94,7,108,87,25,7,0,3.811111,25.71309,12.965171
3,27276,1635,7,108,87,25,7,0,3.206111,21.060833,9.277216
4,27276,1533,7,108,87,25,7,0,3.487917,19.3294,9.109009


In [11]:
# Group by game ID
group_by_gameID = final_df.groupby(['gameID'])
group_by_gameID

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002595E2E7240>

In [12]:
# Iterate over the games
for name, group in group_by_gameID:
    # gather the data we need to fill our dataframe
#     print(group)
    gameID = name
    homeTeamID = group['homeTeamID'].values[0]
    visitorTeamID = group['visitorTeamID'].values[0]
    homeTeamScore = group['homeTeamScore'].values[0]
    visitorTeamScore = group['visitorTeamScore'].values[0]
    
    # group by the team to get team average values
    groupByTeamsHomeAvgs = group.groupby(['teamID']).get_group(homeTeamID).sum()
    groupByTeamsVisAvgs = group.groupby(['teamID']).get_group(visitorTeamID).sum()
    homeHeightAvg = groupByTeamsHomeAvgs['playerHeight']
    homeWeightAvg = groupByTeamsHomeAvgs['playerWeight']
    homeAgeAvg = groupByTeamsHomeAvgs['age']
    visHeightAvg = groupByTeamsVisAvgs['playerHeight']
    visWeightAvg = groupByTeamsVisAvgs['playerWeight']
    visAgeAvg = groupByTeamsVisAvgs['age']

    # did home team win? 1 else 0
    homeTeamWin = 1 if (homeTeamScore - visitorTeamScore) > 0 else 0

    # Fill the dataframe row!
    dictRow = { 'gameID': gameID, 'homeTeamID': homeTeamID, 'visitorTeamID': visitorTeamID, 
               'homeTeamHeightAverage': homeHeightAvg, 'homeTeamWeightAverage': homeWeightAvg, 
               'homeTeamAgeAverage': homeAgeAvg, 'visitorTeamHeightAverage': visHeightAvg, 
               'visitorTeamWeightAverage': visWeightAvg, 'visitorTeamAgeAverage': visAgeAvg, 
               'homeTeamWin': homeTeamWin }

    # add row to df
    ml_df = ml_df.append(dictRow, ignore_index=True)
    
ml_df.head()

KeyError: 19

In [13]:
#note new file name to distinguish time weighted average.
ml_df.to_csv('data/machineLearningDataSetTime.csv')