In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial import distance
from sklearn import preprocessing as pre

<h2> Here we are getting the data organized based on a specified starting year </h2>

In [None]:
# First and last season (inclusive) to load and average over.
# 1980 is the earliest usable season: the stats before that are pretty sketchy.
start_year, end_year = 1980, 2020

In [None]:
# Load one CSV per season and stack them into a single frame.
# The frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic anyway.
season_frames = []

for year in range(start_year, end_year + 1):

    #store current year in df
    current_year_players_df = pd.read_csv("csv_files/player_stats_" + str(year) + ".csv")

    #since some players were traded during each season their stats are split, first showing TOT (total season)
    #and the following rows showing their stats specific to each team.
    #since the total is always first we can drop all duplicates following the first instance
    #(ASSUMING NO PLAYERS HAVE THE SAME NAME within a season)
    current_year_players_df = (
        current_year_players_df
        .drop_duplicates('NAME', keep='first')
        .assign(YEAR=year)  # tag every row with the season it came from
    )
    season_frames.append(current_year_players_df)

# ignore_index gives the combined frame a clean 0..n-1 index
all_players_df = pd.concat(season_frames, ignore_index=True)

all_players_df
    

In [None]:
# Set the criteria for which players go into the working data set and get rid of NaNs.
# Exactly one of the `players_df = ...` assignments below should be active at a time.

# Replace every missing value with 0 (fillna returns a new frame, rebound to the same name)
all_players_df = all_players_df.fillna(0)

######### UNCOMMENT IF YOU ONLY WANT TO LOOK AT CERTAIN POSITIONS #############
# position = 'C'
# players_df = all_players_df[all_players_df['POS'] == ' ' + position]
# NOTE(review): the ' ' prefix suggests the POS values in the CSVs carry a leading space - confirm

######### UNCOMMENT IF YOU ONLY WANT TO LOOK AT CERTAIN TEAM #############
# team = 'TOR'
# players_df = all_players_df[all_players_df['TEAM'] == ' ' + team]
# NOTE(review): same leading-space assumption for TEAM - confirm

######### UNCOMMENT IF WE WANT ALL PLAYERS #############
players_df = all_players_df

# Renumber the rows 0..n-1 so positional and label lookups agree after any filtering.
# reset_index() without drop=True also keeps the previous index as a new 'index' column.
players_df = players_df.reset_index()



<h2> Adding helper methods </h2>
<p> The stats we want to include for each player are PTS, REB, AST, STL, BLK, TOV and PF, which are already in this data frame. To make the most accurate comparisons we also want TS%, 3PAr and FTr, which are not in the data frame and therefore have to be calculated. These helper methods perform those calculations. </p>

In [None]:
#Helper methods that take in the original data frame and output each stat 
def add_true_shooting(df):
    """Add a 'TS' (true shooting percentage) column to ``df`` and return it.

    TS = PTS / (2 * (FGA + 0.44 * FTA)), computed column-wise for every row.

    Rows whose denominator is zero (no field-goal attempts and no free-throw
    attempts) get 0.0 instead of a division-by-zero result. A player with
    free-throw attempts but no field-goal attempts still gets a real value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'PTS', 'FGA' and 'FTA' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the 'TS' column added
        (or overwritten).
    """
    true_shot_attempts = 2 * (df['FGA'] + 0.44 * df['FTA'])
    # pandas yields inf/NaN for the zero-denominator rows rather than raising;
    # those rows are then replaced by 0.0.
    df['TS'] = (df['PTS'] / true_shot_attempts).where(true_shot_attempts != 0, 0.0)
    return df

#these methods are similar to the one above, each implementing its own stat
def add_3PAr(df):
    """Add a '3PAr' (three-point attempt rate) column to ``df`` and return it.

    3PAr = 3PA / FGA, computed column-wise instead of row by row.
    Rows with zero field-goal attempts get 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric '3PA' and 'FGA' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the '3PAr' column added
        (or overwritten).
    """
    field_goal_attempts = df['FGA']
    # the division yields inf/NaN where FGA is 0; those rows are replaced by 0.0
    df['3PAr'] = (df['3PA'] / field_goal_attempts).where(field_goal_attempts != 0, 0.0)
    return df

def add_FTr(df):
    """Add an 'FTr' (free-throw attempt rate) column to ``df`` and return it.

    FTr = FTA / FGA, computed column-wise instead of row by row.
    Rows with zero field-goal attempts get 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'FTA' and 'FGA' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the 'FTr' column added
        (or overwritten).
    """
    field_goal_attempts = df['FGA']
    # the division yields inf/NaN where FGA is 0; those rows are replaced by 0.0
    df['FTr'] = (df['FTA'] / field_goal_attempts).where(field_goal_attempts != 0, 0.0)
    return df

def make_per_36(df):
    """Rescale each row's counting stats to a per-36-minute basis and return ``df``.

    Every listed column is multiplied by ``36 / MP`` for its row, so 'MP'
    itself becomes 36. Rows with ``MP == 0`` are left unchanged. Columns
    that are not listed (for example the ratio stats TS, 3PAr and FTr) are
    not touched.

    The scaling is done column-wise in one step. The earlier row-by-row
    version raised ZeroDivisionError on frames with fewer than four rows,
    because its progress check took ``index % int(len(df) * 0.25)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'MP' and all of the counting-stat columns listed below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the scaled columns stored
        as floats.
    """
    scaled_columns = ['MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
                      'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

    minutes = df['MP']
    # 36 / 0 evaluates to inf inside pandas; those rows get a neutral multiplier of 1
    multiplier = (36 / minutes).where(minutes != 0, 1.0)

    # axis=0 applies each row's own multiplier across all of its stat columns
    df[scaled_columns] = df[scaled_columns].mul(multiplier, axis=0)

    print(str(int((minutes != 0).sum())) + " players stats adjusted")

    return df
        
    
    

   <h3> Here we are going to add the stats from helper methods if the league tracked those stats at that time</h3>

In [None]:
# Attach the derived shooting ratios (FTr, then 3PAr, then TS) to the frame
# before any per-36 adjustment is applied. Only done for start years >= 1980.
if start_year >= 1980:
    players_df = add_FTr(players_df)
    players_df = add_3PAr(players_df)
    players_df = add_true_shooting(players_df)



<h2> Here we are going to decide if we want the stats adjusted per 36 or not </h2>

In [None]:
# ---- Toggle: set to False to keep the raw (unadjusted) stats ----
per_36 = True
# -----------------------------------------------------------------

# Rescale to per-36-minute numbers only when the toggle is on
players_df = make_per_36(players_df) if per_36 else players_df
    
    
    


<h2> Finding averages for player logistics </h2>

<h3> Looking at games played </h3>

In [None]:
#checking what is the average number of games played
print("Average Games Played is: " + str(players_df['G'].mean()))

plt.style.use('fivethirtyeight')

#making the plot
gamesPlayedHist, g = plt.subplots()

# One bin per game count. matplotlib closes only the LAST bin on the right, so the
# edges must run one past 82 (0..83); with edges 0..82 the 81- and 82-game
# seasons were merged into a single [81, 82] bar.
g.hist(players_df['G'], color='red', bins=range(0, 84), rwidth=1)
g.set_xlabel("Games Played")
g.set_ylabel("Number of Players")
g.set_title("Distribution of Games Played per Season")

# render the figure (replaces a bare g.plot(), which draws nothing and echoes [])
plt.show()

In [None]:
# print(str(len(players_df.loc[players_df['G']== 82])))

# players_df.loc[players_df['G']== 82].head(10)

<h3> Now we are looking at average age </h3>

In [None]:
#checking what is the average player age
print("Average Age is: " + str(all_players_df['AGE'].mean()))


plt.style.use('fivethirtyeight')

#making the plot
# NOTE: these are the same handle names the games-played cell uses, so they are rebound here
gamesPlayedHist, g = plt.subplots()

g.hist(all_players_df['AGE'], color='red', rwidth=0.75, bins=range(18, 46))
# the x-axis shows ages, not games played (the label was copy-pasted from the games cell)
g.set_xlabel("Age")
g.set_ylabel("Number of Players")
g.set_title("Distribution of Player Ages")

# render the figure (replaces a bare g.plot(), which draws nothing and echoes [])
plt.show()

<h2> Finding average stats for NBA players (per-36 adjusted when <code>per_36</code> is True, otherwise no adjustment for minutes) </h2>

<h3> Finding stats that won't be used in comparison </h3>

In [None]:

# Columns that make up a player's stat vector for the comparison
comparison_columns = ['PTS', 'AST', 'TRB', 'STL', 'BLK', 'TS', '3PAr', 'FTr', 'TOV', 'PF']
players_stats_df = players_df[comparison_columns]

# Column-wise mean over all player-seasons = the "average player" stat line
avg_player = players_stats_df.mean()

avg_player

#players_stats_NA_df

<h3> Finding most average player </h3> 
<p> Here we are going to normalize the dataframes so that every stat carries equal weight in the distance measurements. Then we will find how similar each player's stats are to the average by finding the distance between stat vectors. This measurement will be made using Euclidean distance and cosine similarity. </p> 

In [None]:
# Wrap the average stat line in a one-row frame so it can be stacked with the players
avg_player_df = pd.DataFrame([avg_player])

# The average player goes LAST, so it can be fetched later with iloc[-1]
frames_to_stack = [players_stats_df, avg_player_df]
df_to_normalize = pd.concat(frames_to_stack)



In [None]:
# Rescale each stat column with scikit-learn's MinMaxScaler (default settings)
min_max_scaler = pre.MinMaxScaler()

x = df_to_normalize.values  # raw values as a numpy array
x_scaled = min_max_scaler.fit_transform(x)

# Back to a DataFrame; columns are positional (0..k-1), rows keep the stacked order
df_normalized_data = pd.DataFrame(x_scaled)

#printing normalized data
#df_normalized_data


In [None]:
# Compare every player-season with the average player using Euclidean distance
# and cosine similarity. Euclidean is treated as the baseline.
#
# The distances are measured on the NORMALIZED data. Measuring them on
# df_to_normalize (the raw values) would skip the scaling step and let
# large-magnitude stats such as PTS dominate the result.
normalized_values = df_normalized_data.to_numpy()
avg_vector = normalized_values[-1]       # the average player was stacked last
player_vectors = normalized_values[:-1]  # one row per player-season, same order as players_df

names = players_df['NAME'].to_numpy()
years = players_df['YEAR'].to_numpy()

cos_dist = []
e_dist = []
for i, (name, year, player_vector) in enumerate(zip(names, years, player_vectors)):
    # each entry is [name, year, score, row position in players_df]
    e_dist.append([name, year, distance.euclidean(avg_vector, player_vector), i])
    # scipy returns cosine *distance*; 1 - distance turns it into a similarity
    cos_dist.append([name, year, 1 - distance.cosine(avg_vector, player_vector), i])

# smallest Euclidean distance first; largest cosine similarity first
e_most_simlar = sorted(e_dist, key=lambda entry: entry[2])
cos_most_simlar = sorted(cos_dist, key=lambda entry: entry[2], reverse=True)

#e_most_simlar[-100:-1]

In [None]:
# Show the [name, year, euclidean distance, row position] entries, closest to the average first.
# NOTE(review): this echoes the whole list; slice it (e.g. e_most_simlar[:25]) if the output is too long.
e_most_simlar