In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Below sets the plots to inline in the jupyter notebook
#%matplotlib inline

## Use the below for other matplotlib options
#%matplotlib qt
#%matplotlib notebook

In [4]:
# Load the stats table from CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/stats.csv')
# The trailing semicolon suppresses the cell output, so the frame is not displayed here.
df;

In [125]:
# Example of working with the data frame: select the 'name' column as a Series.
# The trailing semicolon suppresses the cell output.
df['name'];

In [9]:
# NOTE: plot_PCA is defined in a later cell of this notebook. On a fresh kernel
# that cell must be run first, otherwise this call raises NameError.
plot_PCA(df)

In [10]:
''' For the following functions, "df" is a pandas data frame whose first column holds the player names.
    "return_data_mat" converts the data frame into a NumPy float array with the player-name column removed.
    "return_names" returns the player-name column as an array of strings.
'''

def plot_PCA(data):
    """Scatter-plot the first two principal components of ``data`` with labels.

    ``data`` is handed unchanged to ``PCA`` (for the 2-column projection) and
    to ``return_names`` (for the per-row labels).  Every row is drawn as a dot
    and annotated with its label; the figure is then shown.  Nothing is
    returned.
    """
    projected, _second, _third = PCA(data)
    xs = projected[:, 0]
    ys = projected[:, 1]
    labels = return_names(data)

    point_colour = '#2026B2'
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.plot(xs, ys, '.', mfc=point_colour, mec=point_colour)
    # Annotate each point with the matching entry of the names array.
    for x, y, label in zip(xs, ys, labels):
        axes.text(x, y, label, fontsize=8)
    plt.show()
    
def PCA(data_frame, dims_rescaled_data=2):
    """
    Project a data set onto its leading principal components.

    pass in:
        data_frame: either a data frame, which is converted to a numeric
            matrix with ``return_data_mat``, or a 2D NumPy array of shape
            (samples, features), which is used as the numeric data directly.
            The input is not modified; centering is done on a float copy.
        dims_rescaled_data: number of principal components to keep.

    returns: a 3-tuple
        - the mean-centered data projected onto the kept components,
          one row per sample and one column per kept component
        - all eigenvalues of the covariance matrix, in decreasing order
        - the kept eigenvectors, one per column

    raises:
        ValueError: if the numeric data is not 2D or has fewer than two rows
            (the sample covariance is undefined for a single sample).
    """
    # scipy is not among the imports in the notebook's first cell, so it is
    # imported locally to keep this function self-contained.
    from scipy import linalg as LA

    if isinstance(data_frame, np.ndarray):
        # astype always copies, so the caller's array is left untouched.
        data = data_frame.astype(float)
    else:
        data = return_data_mat(data_frame)
    if data.ndim != 2 or data.shape[0] < 2:
        raise ValueError(
            "PCA needs a 2D data set with at least two rows (samples); "
            "got shape %s" % (data.shape,)
        )

    # mean center the data (column-wise)
    centered = data - data.mean(axis=0)
    # covariance matrix of the features; np.cov returns a 0-d array when there
    # is a single feature column, so force it back to a (1, 1) matrix
    R = np.atleast_2d(np.cov(centered, rowvar=False))
    # use 'eigh' rather than 'eig' since R is symmetric
    evals, evecs = LA.eigh(R)
    # eigh returns eigenvalues in ascending order; reorder both eigenvalues
    # and eigenvectors so the largest variance comes first
    idx = np.argsort(evals)[::-1]
    evals = evals[idx]
    # keep only the leading eigenvectors (columns)
    evecs = evecs[:, idx][:, :dims_rescaled_data]
    # project the centered data onto the kept eigenvectors
    return np.dot(evecs.T, centered.T).T, evals, evecs

def return_data_mat(df):
    """Return the numeric part of ``df`` as a float NumPy array.

    The first column of ``df.values`` is dropped and every remaining column
    is converted to ``float``; the result is a new array.
    """
    return df.values[:, 1:].astype(float)

def return_names(df):
    """Return the first column of ``df`` as a NumPy array of strings."""
    first_column = df.values[:, 0]
    return first_column.astype('str')

def test_PCA(data, dims_rescaled_data=2):
    '''
    test PCA by reconstructing the numeric data from its projection and the
    kept eigenvectors, then comparing that 'recovered' array with the
    original numeric data

    data: data frame in the layout expected by ``PCA`` and
        ``return_data_mat``.
    dims_rescaled_data: number of principal components passed on to ``PCA``.

    The reconstruction is only exact when the kept components span the
    centered data, i.e. when dims_rescaled_data is at least the rank of the
    centered data; with fewer components the projection is lossy and the
    assertion fails.

    raises: AssertionError if the recovered array does not match the
        original numeric data.
    '''
    original = return_data_mat(data)
    # PCA centers the data internally, so the column means must be added
    # back after projecting out of component space.
    column_means = original.mean(axis=0)
    projected, _, eigenvectors = PCA(data, dims_rescaled_data=dims_rescaled_data)
    # projected is (samples, k) and eigenvectors is (features, k), so
    # projected . eigenvectors^T maps back to (samples, features).
    data_recovered = np.dot(projected, eigenvectors.T) + column_means
    assert np.allclose(original, data_recovered), (
        "data could not be recovered from %d principal component(s)"
        % eigenvectors.shape[1]
    )
    
    
### These functions were written with help from this Stack Overflow question:
# http://stackoverflow.com/questions/13224362/principal-component-analysis-pca-in-python