In [1]:
#find the most similar NBA players to Lebron James.
import sys
# Python version
print('Python: {}'.format(sys.version))

sys.path.append('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/')
sys.path.append('/Library/Python/2.7/site-packages/')
import math
import pandas
with open("./data/nba_2013.csv", 'r') as csvfile:
    nba = pandas.read_csv(csvfile)

# The names of all the columns in the data.
print(nba.columns.values)

#Select Lebron James from our dataset
selected_player = nba[nba["player"] == "LeBron James"].iloc[0]

# Choose only the numeric columns (we'll use these to compute euclidean distance)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

def euclidean_distance(row):
    """
    A simple euclidean distance function
    """
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

#because a variable having larger values doesn't necessarily make it better at predicting what rows are similar.
#A simple way to deal with this is to normalize all the columns to have a mean of 0, and a standard deviation of 1. 
#This will ensure that no single column has a dominant impact on the euclidean distance calculations.

# Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]

# Normalize all of the numeric columns
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

# Find the distance from each player in the dataset to lebron.
lebron_distance = nba_normalized.apply(euclidean_distance, axis=1)

# Create a new dataframe with distances.
distance_frame = pandas.DataFrame(data={"dist": lebron_distance, "idx": lebron_distance.index})
distance_frame.sort("dist", inplace=True)
# Find the most similar player to lebron (the lowest distance to lebron is lebron, the second smallest is the most 
#similar non-lebron player)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]

print most_similar_to_lebron

Python: 2.7.13 (default, May 29 2017, 09:41:27) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']
Carmelo Anthony
