In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Raw shot log, one row per shot attempt; read from the user's Downloads folder.
SHOT_LOGS_CSV = '~/Downloads/shot_logs.csv'
full_data = pd.read_csv(SHOT_LOGS_CSV)

In [65]:
# Helper functions

def getDataMatch(df, col, val):
    """Return the rows of ``df`` whose value in column ``col`` equals ``val``.

    The original index labels and all columns are kept.
    """
    is_equal = df[col] == val
    return df[is_equal]

def getDataGreater(df, col, val):
    """Return the rows of ``df`` whose value in column ``col`` is strictly greater than ``val``.

    The original index labels and all columns are kept.
    """
    is_above = df[col] > val
    return df[is_above]

def getDataLesser(df, col, val):
    """Return the rows of ``df`` whose value in column ``col`` is strictly less than ``val``.

    The original index labels and all columns are kept.
    """
    is_below = df[col] < val
    return df[is_below]

def returnGrouped(df, groupByCol):
    """Aggregate shot rows into one summary row per ``groupByCol`` value.

    Per group the result holds the count of SHOT_NUMBER, the sums of PTS and
    FGM, and the means of SHOT_DIST, DRIBBLES and TOUCH_TIME. The grouping key
    is returned as an ordinary column (the index is reset).
    """
    # Dict order fixes the output column order, so build it in the same
    # sequence: count first, then the summed columns, then the averaged ones.
    aggregations = {
        'SHOT_NUMBER': 'count',
        **dict.fromkeys(('PTS', 'FGM'), 'sum'),
        **dict.fromkeys(('SHOT_DIST', 'DRIBBLES', 'TOUCH_TIME'), 'mean'),
    }
    summary = df.groupby(groupByCol).agg(aggregations)
    return summary.reset_index()

def plotData(combined_data, sortAscending, sortBy, title):
    """Draw a bar chart of the first 50 players after sorting ``combined_data``.

    Rows are ordered by ``sortBy`` (ascending when ``sortAscending`` is true)
    and the first 50 are plotted. The bar height is always the ``diff`` column
    and the x axis is ``player_name``, regardless of ``sortBy``; the y-axis
    label is fixed to 'Shooting percentage'. The figure is shown immediately
    and nothing is returned.
    """
    ranked = combined_data.sort_values(by=sortBy, ascending=sortAscending).head(50)

    plt.figure(figsize=(20, 12))
    plt.grid()
    sns.barplot(data=ranked, x='player_name', y='diff', palette='viridis')

    plt.title(title)
    plt.xlabel('Player Name')
    plt.ylabel('Shooting percentage')
    # Slant the player names so neighbouring tick labels overlap less.
    plt.xticks(rotation=45)
    plt.show()

In [66]:
# Collapse the shot log to one summary row per player, then display it.
players = returnGrouped(df=full_data, groupByCol='player_name')
players

Unnamed: 0,player_name,SHOT_NUMBER,PTS,FGM,SHOT_DIST,DRIBBLES,TOUCH_TIME
0,aaron brooks,561,555,233,15.088592,5.402852,4.959180
1,aaron gordon,104,119,55,10.062500,1.355769,2.068269
2,al farouq aminu,258,248,111,12.893411,0.515504,1.401163
3,al horford,715,783,387,11.973566,0.537063,1.809231
4,al jefferson,800,766,382,9.315375,0.696250,2.627250
...,...,...,...,...,...,...,...
276,wesley matthews,748,845,336,18.522193,1.179144,2.124465
277,wilson chandler,733,714,304,15.371214,1.499318,2.363847
278,zach lavine,371,337,157,14.776011,3.967655,4.217251
279,zach randolph,671,661,328,8.174516,0.843517,2.334277


In [93]:
# Shooting percentage: made field goals as a share of counted shots, scaled to 0-100.
made_share = players['FGM'].div(players['SHOT_NUMBER'])
players['sp'] = made_share.mul(100)

In [94]:
# Per-player columns used as the clustering / similarity feature space.
features = [
    'SHOT_DIST',
    'DRIBBLES',
    'TOUCH_TIME',
    'sp',
]

In [95]:
from sklearn.cluster import KMeans

In [96]:
# Unfitted k-means model: five clusters, fixed seed so repeated runs initialise the same way.
cluster = KMeans(
    n_clusters=5,
    random_state=42,
)

In [97]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns so no single column dominates the
# Euclidean distances used by k-means because of its units.
scaler = StandardScaler()
scaler.fit(players[features])
scaled_features = scaler.transform(players[features])

# Fit the clustering on the standardised features; the fitted estimator is
# the cell's displayed value.
cluster.fit(scaled_features)

KMeans(n_clusters=5, random_state=42)

In [129]:
target_player = 'rudy gobert'  # Player to find neighbours for; compared by exact equality against players['player_name']

In [131]:
# Find the players whose standardised feature vectors are closest to the
# target's, searching only inside the cluster nearest to the target.

# Feature row for the target player, on the same scale the clustering was fitted on.
target_data = players[players['player_name'] == target_player][features]
if target_data.empty:
    # Fail with a clear message instead of the opaque "0 sample(s)" error that
    # scaler.transform raises on an empty frame.
    raise ValueError('No player named ' + repr(target_player) + " found in players['player_name']")
target_scaled = scaler.transform(target_data)

# Choose the cluster whose centroid lies closest to the target.
distances_to_centroids = np.linalg.norm(cluster.cluster_centers_ - target_scaled, axis=1)
nearest_cluster = np.argmin(distances_to_centroids)

# cluster.labels_ is a positional array, so this mask relies on `players`
# still having the same row order it had when cluster.fit was called.
cluster_labels = cluster.labels_
cluster_members = players[cluster_labels == nearest_cluster]

# Euclidean distance from every cluster member to the target in scaled space.
cluster_members_scaled = scaler.transform(cluster_members[features])
distances_to_target = np.linalg.norm(cluster_members_scaled - target_scaled, axis=1)

# Six closest rows: normally the target itself (distance 0) plus five neighbours.
similar_players = cluster_members.assign(distance_to_target=distances_to_target).sort_values(by='distance_to_target').head(6)
print('For ' + target_player + ', the players most similar are:')
# Drop the target by name rather than by position: with tied distances (or if
# the target is not in the selected cluster) it is not guaranteed to be the
# first row, and slicing [1:] would then discard a real neighbour.
neighbour_names = similar_players.loc[similar_players['player_name'] != target_player, 'player_name']
print(list(neighbour_names)[:5])

For rudy gobert, the players most similar are:
['greg smith', 'ed davis', 'mason plumlee', 'tyson chandler', 'john henson']


In [116]:
# Ad-hoc lookup: rows whose player name ends with the suffix 'kd'.
mask = players['player_name'].str.endswith('kd')
players.loc[mask]

Unnamed: 0,player_name,SHOT_NUMBER,PTS,FGM,SHOT_DIST,DRIBBLES,TOUCH_TIME,sp


In [132]:
# Display the nearest-neighbour table, including each row's distance_to_target column.
similar_players

Unnamed: 0,player_name,SHOT_NUMBER,PTS,FGM,SHOT_DIST,DRIBBLES,TOUCH_TIME,sp,distance_to_target
235,rudy gobert,267,334,167,3.551311,0.265918,1.077528,62.546816,0.0
98,greg smith,47,58,29,2.670213,0.170213,1.076596,61.702128,0.249586
82,ed davis,350,422,211,4.418571,0.354286,1.212857,60.285714,0.443571
188,mason plumlee,403,472,236,4.272953,0.55335,1.530521,58.560794,0.795939
270,tyson chandler,340,460,230,4.130588,0.114706,0.882941,67.647059,0.887912
132,john henson,241,280,140,4.968465,0.526971,1.537759,58.091286,0.90458
