The classic imports

In [71]:
import numpy as np
import pandas as pd 

In [91]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline 

Helper functions that will do a little bit of preprocessing on the dataframe and will create a matrix of vectors that describes all the NBA players in the dataset

In [77]:
def preprocessString(s):
    """Return the part of ``s`` that precedes its first backslash.

    Parameters
    ----------
    s : str
        Raw text of a 'Player' cell.

    Returns
    -------
    str
        Everything before the first backslash. When ``s`` contains no
        backslash it is returned unchanged (the previous implementation
        raised ``ValueError`` in that case).
    """
    # str.partition never raises: without the separator it yields (s, '', '').
    return s.partition('\\')[0]

def preprocessDataframe(original):
    """Return a cleaned copy of the raw stats frame.

    The returned frame has the 'Tm' column removed, every row containing a
    missing value removed, and each remaining 'Player' entry passed through
    preprocessString. The row index is not reset.

    ``original`` itself is left untouched, so the calling cell can be re-run
    against the same input without hitting a missing 'Tm' column or
    re-processing names that were already cleaned.

    Parameters
    ----------
    original : pandas.DataFrame
        Frame with at least the 'Player' and 'Tm' columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    withoutTeam = original.drop('Tm', axis=1)
    # Drop incomplete rows before cleaning names so that a missing 'Player'
    # value never reaches preprocessString.
    completeRows = withoutTeam.dropna()
    # assign() builds a new frame and keeps 'Player' in its original position.
    return completeRows.assign(
        Player=completeRows['Player'].apply(preprocessString)
    )

def integerizeFeatures(original):
    """Swap the categorical 'Pos' column for one indicator column per value.

    The indicator columns produced by ``pd.get_dummies`` are appended to the
    right of the existing columns and 'Pos' is then removed. The input frame
    is not modified; a new frame is returned.
    """
    positionDummies = pd.get_dummies(original['Pos'])
    widened = pd.concat([original, positionDummies], axis=1)
    return widened.drop('Pos', axis=1)

def createVectors(original):
    """Split off the 'Player' column and return the remaining data as vectors.

    Parameters
    ----------
    original : pandas.DataFrame
        Frame containing a 'Player' column plus the feature columns.

    Returns
    -------
    tuple
        ``(matrix, frame)`` where ``frame`` is ``original`` without the
        'Player' column and ``matrix`` is the NumPy array of ``frame``'s
        values (one row per frame row, one column per frame column).

    ``original`` is not modified; callers should use the returned frame.
    """
    # Don't need the name anymore
    features = original.drop('Player', axis=1)
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # .values returns the same array and exists in old and new versions alike.
    return features.values, features

Loading in the data and calling the helper functions

In [78]:
# Read in data from CSV
df = pd.read_csv('NBA2016-2017Stats.csv', encoding='utf-8')

# Do some quick preprocessing: clean the 'Player' strings, drop the 'Tm'
# column and drop every row that has a missing value
df = preprocessDataframe(df)

# Integerize the dataframe: replace 'Pos' with one indicator column per value
df = integerizeFeatures(df)

# Create a list of all the players.
# This has to happen before createVectors, which removes the 'Player' column.
listPlayers = df['Player'].tolist()

# Create vectors: row i of playerVectors belongs to listPlayers[i]
playerVectors, df = createVectors(df)

# Create a list of all the stat categories; entry j names column j of
# playerVectors
listStats = df.columns.tolist()

Making sure that we're able to get the players and their corresponding stats

In [81]:
def getVector(playerName):
    """Look up the stat vector for ``playerName``.

    Returns the row of ``playerVectors`` at the position where the name
    appears in ``listPlayers``. If the name is not in the list, prints
    'Player not found!' and returns None.
    """
    # Guard clause: report unknown names and bail out early.
    if playerName not in listPlayers:
        print ('Player not found!')
        return None
    return playerVectors[listPlayers.index(playerName)]

def showStats(vector, name):
    """Print the AST, TRB and PS/G entries of a player's stat vector.

    Parameters
    ----------
    vector : sequence or None
        Stat values indexed in the same order as ``listStats``. ``None`` is
        accepted because getVector returns None for an unknown player; in
        that case a short notice is printed instead of raising ``TypeError``.
    name : str
        Player name used in the printed header.

    Returns
    -------
    None
        Output is printed, one 'STAT: value' line per selected stat, in the
        order the stats appear in ``listStats``.
    """
    if vector is None:
        # Indexing None below would raise TypeError, so stop here.
        print ('No stats available for {0}'.format(name))
        return

    print ('{0}\'s stats'.format(name))
    # Just wanna print the important stats
    importantStats = ('AST', 'TRB', 'PS/G')
    for index, stat in enumerate(listStats):
        if stat in importantStats:
            print ('{0}: {1}'.format(stat, vector[index]))

# Sanity check: look up one known player and print his headline stats.
nameOfPlayer = 'Chris Paul'
# getVector returns None (after printing a message) if the name is not in listPlayers.
cp3Vector = getVector(nameOfPlayer)
showStats(cp3Vector, nameOfPlayer)

Chris Paul's stats
TRB: 5.0
AST: 9.2
PS/G: 18.1


Let's see if we can project these 32-dimensional vectors into a lower-dimensional space that is easier to visualize

In [85]:
# Reduce every player vector to its first two principal components so the
# players can be drawn on a 2-D scatter plot.
# random_state is pinned so a Restart & Run All reproduces the same numbers:
# depending on the scikit-learn version, the default svd_solver='auto' can
# select the randomized solver for data of this shape.
pca = PCA(n_components=2, random_state=0)
pca.fit(playerVectors)
reducedVectors = pca.transform(playerVectors)

In [92]:
# Spot check: the 2-D coordinates of one known player. Rows of reducedVectors
# come from the rows of playerVectors, so listPlayers positions still apply.
reducedVectors[listPlayers.index('Chris Paul')]

array([ 38.22273777, -16.64453357])

In [96]:
# First and second principal-component coordinate of every player.
xValues = reducedVectors[:,0]
yValues = reducedVectors[:,1]

# The previous call passed s=area and c=colors, but neither name is defined
# in this notebook, so the cell raised NameError on a fresh kernel. The
# default marker size and colour are used instead.
fig, ax = plt.subplots()
ax.scatter(xValues, yValues, alpha=0.5)
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_title('NBA players projected onto two principal components');

(531,)