# Exploratory Data Analysis - Part 2

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is what lets the `src` package import below
# resolve when the notebook runs from a sub-folder -- confirm the repo layout.
module_path = os.path.abspath(os.pardir)
already_listed = any(entry == module_path for entry in sys.path)
if not already_listed:
    sys.path.append(module_path)
import pandas as pd
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dionysus as d
import kmapper as km
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

from src.data_pipeline import load_data, make_dummies

# Apply matplotlib's 'ggplot' style sheet to figures created from here on.
plt.style.use('ggplot')

In [3]:
# Read the raw player table through the project's loader.
# NOTE(review): the file name suggests 2017-18 season data -- confirm.
PLAYER_CSV_PATH = '../data/players201718.csv'
player_df = load_data(PLAYER_CSV_PATH)

## Pre-Processing and Feature Engineering

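I want to separate the `Player`, `Tm`, and `Pos` columns for now and keep them in a separate DataFrame. I also drop `Age` and the percentage columns `FG%`, `3P%`, `2P%`, and `FT%` from the working DataFrame. I still want to create dummy variables for the `Pos` feature, but that step is not applied in the cell below.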

In [4]:
# Identity columns are kept aside in their own frame; `df` keeps the remaining
# columns minus Age and the shooting-percentage columns listed below.
identity_cols = ['Player', 'Tm', 'Pos']
player_tm = player_df.loc[:, identity_cols].copy()

# NOTE: the make_dummies(player_df) step is currently disabled, so no dummy
# columns are created here.
excluded_cols = identity_cols + ['Age', 'FG%', '3P%', '2P%', 'FT%']
df = player_df.drop(columns=excluded_cols)

In [5]:
# Preview the first rows of the numeric stats frame.
df.head()

Unnamed: 0_level_0,G,GS,MP,FG,FGA,3P,3PA,2P,2PA,eFG%,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abrinal01,75,8,1134,115,291,84,221,31,70,0.54,...,46,26,88,114,28,38,8,25,124,353
acyqu01,70,8,1359,130,365,102,292,28,73,0.496,...,60,40,217,257,57,33,29,60,149,411
adamsst01,76,76,2487,448,712,0,2,448,710,0.629,...,286,384,301,685,88,92,78,128,215,1056
adebaba01,69,19,1368,174,340,0,7,174,333,0.512,...,179,118,263,381,101,32,41,66,138,477
afflaar01,53,3,682,65,162,27,70,38,92,0.485,...,26,4,62,66,30,4,9,21,56,179


Season totals depend on how many games each player appeared in, so I will convert the counting stats to per-game values by dividing each of them by `G` and then dropping `G`. `eFG%` is already a rate, so it is left unchanged.

In [6]:
# Convert season totals into per-game averages.
#
# Every column listed in COUNTING_STATS is divided by the games column `G`;
# `G` itself is then dropped. `eFG%` is deliberately absent from the list, so
# it passes through unchanged. The result is built as a new frame (no in-place
# mutation), which keeps this cell idempotent on re-run.
COUNTING_STATS = [
    'GS', 'MP',
    'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

# NOTE(review): assumes every row has G > 0; a zero would yield inf/NaN here,
# exactly as the column-by-column division did.
per_game = df.drop(columns='G').assign(
    **{stat: df[stat] / df['G'] for stat in COUNTING_STATS}
)

## Topological Data Analysis

In [7]:
# Standardise the per-game features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(per_game)
scaled = scaler.transform(per_game)

In [26]:
# Create the mapper and project the standardised data onto one PCA component,
# which is used as the lens in the mapping step.
mapper = km.KeplerMapper(verbose=1)

first_component = PCA(n_components=1)
# scaler=None: presumably turns off kmapper's own rescaling of the projected
# values -- confirm against the kmapper docs for the installed version.
lens = mapper.project(scaled, projection=first_component, scaler=None)

..Projecting on data shaped (540, 20)

..Projecting data using: 
	PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)



In [30]:
# Build the Mapper graph and write the interactive HTML visualisation.
#
# KMeans initialisation is random; without a fixed seed the node memberships
# (and therefore every per-cluster statistic computed from them) can change on
# each Restart & Run All. Node/edge counts may differ from earlier unseeded runs.
KMEANS_SEED = 42

mapped = mapper.map(projected_X=lens,
                    inverse_X=scaled,
                    coverer=km.Cover(nr_cubes=10, overlap_perc=0.2),
                    clusterer=KMeans(random_state=KMEANS_SEED))

# The unscaled per-game values are passed so the visualisation reports stats in
# their original units; the trailing `;` suppresses the returned HTML repr.
mapper.visualize(graph=mapped,
                 path_html='../data/nba_visualization.html',
                 title='NBA TDA',
                 inverse_X=per_game.values,
                 inverse_X_names=list(per_game.columns),
                );

Mapping on data shaped (540, 20) using lens shaped (540, 1)

Creating 10 hypercubes.

Created 43 edges and 64 nodes in 0:00:00.283234.
Wrote visualization to: ../data/nba_visualization.html




In [9]:
# Collect the identifier of every node in the Mapper graph.
cluster_ids = [node_id for node_id in mapped['nodes'].keys()]

In [10]:
# One entry per graph node, holding the per-game rows that belong to that node.
# NOTE(review): `per_game.iloc` (the indexer, not the frame) is passed as
# `data`; presumably kmapper indexes it with the node's member positions so
# that DataFrame rows come back -- verify against the installed kmapper.
clusters = [
    mapper.data_from_cluster_id(cluster_id=node_id, graph=mapped, data=per_game.iloc)
    for node_id in cluster_ids
]

In [11]:
# Summary table: the first column is the mean of each per-game feature over
# all players, followed by one '<i>_mean' column per cluster in `clusters`.
cluster_stats = per_game.mean().to_frame(name='mean')

for position, members in enumerate(clusters):
    column_label = '{}_mean'.format(position)
    cluster_stats[column_label] = members.mean()

In [None]:
# Scratch cell cleaned up: the bare call `mapper.data_from_cluster_id()` was
# removed. It passed none of the arguments used where `clusters` is built
# (cluster_id, graph, data), so it would presumably raise a TypeError and stop
# a Restart & Run All at this point.

In [13]:
# Display the overall and per-cluster feature means side by side.
cluster_stats

Unnamed: 0,mean,0_mean,1_mean,2_mean,3_mean,4_mean,5_mean,6_mean,7_mean,8_mean,...,54_mean,55_mean,56_mean,57_mean,58_mean,59_mean,60_mean,61_mean,62_mean,63_mean
GS,0.361651,0.004274,0.003531,0.0281,0.0,0.0,0.0,0.0,0.0,0.047181,...,0.79878,0.620253,1.0,1.0,1.0,1.0,1.0,1.0,0.986667,1.0
MP,19.19258,4.827294,6.090063,5.511887,2.9,2.5,3.944311,3.75,2.5,13.744396,...,26.568848,33.486076,33.453333,34.276202,35.282733,34.277867,33.21554,35.310802,33.150864,32.354167
FG,3.03548,0.26971,0.613888,0.653575,0.0,1.0,0.567948,0.0,0.833333,1.849098,...,6.111145,7.358544,9.16,8.000341,7.73464,8.449503,6.671318,7.912236,6.718025,8.125
FGA,6.716015,1.223377,1.821316,1.412392,0.766667,1.0,1.104725,0.25,1.0,4.357763,...,11.531182,16.980775,17.96,17.834691,16.636335,18.162369,12.278156,17.427679,12.893827,18.520833
3P,0.81177,0.019959,0.243283,0.096932,0.0,1.0,0.173273,0.0,0.333333,1.024489,...,0.060976,2.620965,0.36,2.268341,1.14947,3.633226,0.49192,2.617637,0.74,1.875
3PA,2.290283,0.410829,0.978812,0.270982,0.341667,1.0,0.459011,0.25,0.5,2.723433,...,0.318694,7.034731,1.226667,5.996551,3.488665,9.219984,1.234352,6.747764,1.934568,4.75
2P,2.22371,0.249751,0.370605,0.556643,0.0,0.0,0.394675,0.0,0.5,0.824609,...,6.05017,4.737579,8.8,5.732,6.585169,4.816277,6.179398,5.294599,5.978025,6.25
2PA,4.425732,0.812548,0.842504,1.14141,0.425,0.0,0.645714,0.0,0.5,1.63433,...,11.212488,9.946044,16.733333,11.83814,13.147669,8.942385,11.043804,10.679916,10.959259,13.770833
eFG%,0.498554,0.227769,0.402609,0.497625,0.0,1.5,0.6065,0.0,1.0,0.539385,...,0.534,0.5105,0.52,0.511,0.5,0.5685,0.560333,0.529,0.5495,0.489
FT,1.276639,0.17759,0.271067,0.231207,0.1,0.0,0.070798,0.0,0.666667,0.435137,...,2.948364,5.002927,4.453333,4.302661,5.957521,6.102203,3.798292,4.099241,2.952346,4.541667
