In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statistics
import seaborn as sns


%matplotlib inline

In [None]:
# Load the three source tables from the Kaggle input directory.

# Season stats; the CSV's 'Unnamed: 0' column is used as the row index.
df = pd.read_csv(
    '/kaggle/input/nba-players-stats/Seasons_Stats.csv',
    index_col='Unnamed: 0',
)

# MVP voting data; decoded as latin-1.
dfv = pd.read_csv(
    '/kaggle/input/nba-mvp-voting-data/NBA MVP Voting Data.csv',
    encoding='latin-1',
)

# Team wins; 'Unnamed: 0' is the row index and the file is decoded as latin-1.
dfw = pd.read_csv(
    '/kaggle/input/nba-team-wins/NBA_team_wins.csv',
    index_col='Unnamed: 0',
    encoding='latin-1',
)

In [None]:
# Keep only the columns listed below, in this order. Anything else in the raw
# file is discarded (the original note: this removes two blank columns).
df = df[[
    'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]]

# Row-level cleanup. Each step collects the index labels of the offending rows
# and drops them.

# Rows with no Year: per the original notes these are 67 rows that are NA
# across the board.
df = df.drop(index=df.index[df['Year'].isna()])

# Rows with MP == 0 (5 noted); they distort the advanced stats.
df = df.drop(index=df.index[df['MP'] == 0])

# Rows missing PER in 1956 through 1962 (12 noted). Missing PER in other
# seasons (mostly 1950-51 per the notes) is intentionally left in place.
df = df.drop(index=df.index[df['PER'].isna() & df['Year'].isin(range(1956, 1963))])

# Player-name repairs so that names agree between df and dfv.
# Substring matches are literal (regex=False): str.contains() treats its
# pattern as a regular expression by default, so the '.' in 'World B.' would
# otherwise match any character. na=False makes a missing Player value count
# as "no match" instead of putting NaN into the boolean mask.

#Norm Van needs to be updated to Norm Van Lier (exact match only)
df.loc[df['Player'] == 'Norm Van', 'Player'] = 'Norm Van Lier'
#Jo Jo needs to be updated to Jo Jo White
#NOTE(review): this rewrites every name containing 'Jo Jo' - confirm no other
#player in the data has that substring.
df.loc[df['Player'].str.contains('Jo Jo', regex=False, na=False), 'Player'] = 'Jo Jo White'
#World B. to be updated to World B. Free
df.loc[df['Player'].str.contains('World B.', regex=False, na=False), 'Player'] = 'World B. Free'
#Micheal Ray to be updated to Micheal Ray Richardson
df.loc[df['Player'].str.contains('Micheal Ray', regex=False, na=False), 'Player'] = 'Micheal Ray Richardson'
#Joe Barry to be updated to Joe Barry Carroll
df.loc[df['Player'].str.contains('Joe Barry', regex=False, na=False), 'Player'] = 'Joe Barry Carroll'
#Peja Stojakovic updated in dfv
dfv.loc[dfv['Player'].str.contains('Peja', regex=False, na=False), 'Player'] = 'Peja Stojakovic'
#Manu Ginóbili updated in both df and dfv
df.loc[df['Player'].str.contains('Manu G', regex=False, na=False), 'Player'] = 'Manu Ginóbili'
dfv.loc[dfv['Player'].str.contains('Manu G', regex=False, na=False), 'Player'] = 'Manu Ginóbili'
#Goran Dragic updated in dfv
dfv.loc[dfv['Player'].str.contains('Goran D', regex=False, na=False), 'Player'] = 'Goran Dragic'

#8 missing values for age can be manually populated given the player's actual age on Feb 1st of that season.
#All prior to 1958, won't impact later analyses.
#The list is assigned positionally, in row order, to the rows whose Age is NaN;
#pandas raises a ValueError if the number of such rows is not exactly 8.
df.loc[df['Age'].isna(), 'Age'] = [24, 22, 25, 23, 22, 22, 22, 23]

# Fill NaNs in rate stats with 0 where, per the notes below, the value is
# missing only because the underlying attempts/totals were zero.

#99 missing 3PAr (83 of them from 1980 on), attributed to FGA == 0.
#Only seasons from 1980 on are filled: a 0 for 3PAr before 1980 was interfering
#with feature selection.
#NOTE: the mask does not test FGA; every missing 3PAr from 1980 on becomes 0.
df.loc[(df['Year'] >= 1980) & (df['3PAr'].isna()), '3PAr'] = 0
#3511 missing 3P% after 1980. Due to 0 3PA. Fill with 0
df.loc[(df['3P%'].isna()) & (df['3PA'] == 0), '3P%'] = 0
#128 missing 2P%. Due to 0 2PA. Fill with 0
df.loc[(df['2P%'].isna()) & (df['2PA'] == 0), '2P%'] = 0
#99 missing eFG%. Due to 0 FGA. Fill with 0
df.loc[(df['eFG%'].isna()) & (df['FGA'] == 0), 'eFG%'] = 0
#858 missing FT%. Due to 0 FTA. Fill with 0
df.loc[(df['FT%'].isna()) & (df['FTA'] == 0), 'FT%'] = 0
#99 missing FTr. All due to 0 FGA. Can fill with 0
df.loc[(df['FTr'].isna()) & (df['FGA'] == 0), 'FTr'] = 0
#86 values with NaN TS%: 0 PTS/FGA leads to no TS%. Fill with 0.
#NOTE: this fills every missing TS%, with no condition on PTS or FGA.
df.loc[df['TS%'].isna(), 'TS%'] = 0
#63 missing TOV% when Year >= 1978. All 0 TOVs, and less than 4 games. Can fill with 0.
df.loc[(df['TOV%'].isna()) & (df['Year'] >= 1978), 'TOV%'] = 0
#93 missing FG% after 1970, all have 0 FG and 0 FGA. Fill with 0
df.loc[(df['Year'] > 1970) & (df['FG%'].isna()), 'FG%'] = 0

#1980-81: 13 GS values are populated; blank them out to keep those seasons uniform
#with the rest of 80-81, where GS is missing.
#np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.loc[df['Year'].isin([1980, 1981]) & df['GS'].notnull(), 'GS'] = np.nan

#Charlotte Bobcats became a team in 2005 and became the Charlotte Hornets in 2015.
#Relabel 'CHA' rows for 2005 <= Year < 2015 as 'CHB' in both df and dfv.
df.loc[(df['Tm'] == 'CHA') & (df['Year'] >= 2005) & (df['Year'] < 2015), 'Tm'] = 'CHB'
dfv.loc[(dfv['Tm'] == 'CHA') & (dfv['Year'] >= 2005) & (dfv['Year'] < 2015), 'Tm'] = 'CHB'

#Remove asterisks from player names.
#Done as a single vectorized, literal (regex=False) replacement instead of a
#row-by-row loop with chained indexing (df['Player'][index] = ...): the chained
#form triggers SettingWithCopyWarning and does not write back to df when pandas
#Copy-on-Write is active. Missing names are left as NaN rather than raising.
df['Player'] = df['Player'].str.replace('*', '', regex=False)

# Separator rows in dfw are identified by the literal 'W' in the Wins column;
# keep every row whose Wins value is anything else.
dfw = dfw.loc[dfw['Wins'].ne('W')]
# Rewrite the team name 'Sheboygan Red Skins' as 'Sheboygan Redskins' in dfw.
dfw.loc[dfw['Team_Name'].eq('Sheboygan Red Skins'), 'Team_Name'] = 'Sheboygan Redskins'

In [None]:
# APPENDIX

In [None]:
#How will we handle outliers? 
#I think high end outliers will probably relate to being voted for MVP, so I don't think we should remove
#Add MVP feature and check

#Update feature data types to remove unnecessary decimals. 
#Turns out dtype int64 cannot handle NaN. Will leave as float.
#What effect does NaN have on ML algorithms?

#Consider creating a moving time frame for train set. Start with year one, predict year two. Train with year one and two, predict year three, etc.
    #Consider regression for predicting MVP Share, classification for MVP/Runnerup
    #Will I have issues with unbalanced classes? MVP/Runnerups is small proportion <10% of NBA players
#What kinds of changes in descriptive statistics can we identify between seasons?
    #What about between MVP group and Non MVP?
#Do the weights of certain features change between years? This could show a change in strategies as the NBA has evolved (e.g. 3pt attempts and makes)
#How do I quantify voter fatigue? How often a player has previously appeared in contention for MVP.
#How do I quantify media narratives? Social media scraping to identify how often a player is mentioned with 'MVP'
#How do I quantify superstar?
#Should I add a previous award metric (DPOY, MVP, Rookie, etc) - I expect that previous MVPs are less likely to gain the award again
    #Show the change in Share after a player has won MVP, compare to our model's prediction


#MVP voting has changed over the years, from 247 total votes in '76-77 to 1010 in '18-19. 
    #Should use proportion of total votes as your metric
#MVP is for regular season performance
    #Should I create a championship feature? Probably not
#MVP voting - Each ballot has 5 spots: 1st place - 10pts, 2nd - 7pts, 3rd - 5pts, 4th - 3pts, 5th - 1pt
    #Until '79-80 MVP voted by NBA players
    #Starting in '80-81, 100 independent media members in the USA and Canada receive the ballots
    #In '10-11 online voting by fans was aggregated into a single ballot in addition to media votes
    #Do I need to normalize/standardize this? Is there a difference in predicted outcome with the fan vote implemented?
#MVP metrics: 3 features
    #% of total MVP votes (this will account for runner ups)
    #1 for who won the MVP that season, 2 for runners-up
    #MVP win-share minus average win share for non MVP players on that team. (Quantifying carrying a bad team)

#WS has several formulas depending on what stats were being tracked in certain seasons
    #'77-'78 to present - PTS, offensive possessions, marginal offense, marginal points/win
    #'72-'73 - '76-'77 - TOV had to be estimated
    #'49 -'50 - '72 -'73 - estimated player modified points, player's modified shot attempts, league points/shot attempt, marginal offense, marginal pts/win 
#Advanced statistics start '96-97 - offrtg, defrtg, netrtg, ast%, ast/to, ast ratio, oreb%, dreb%, reb%, TOV%, EFG%, TS%, PACE, PIE
    #consider modeling with base stats from '73-74 and adv stats from '96-97
    
#1950-1970 Two divisions, Eastern/Western
#1971-2004 four divisions, Atlantic/Central, Midwest/Pacific
#2005-2019 six divisions, Atlantic/Central/Southeast, Midwest/Pacific/Southwest