In [27]:
# Import the modules
import pandas as pd
from pathlib import Path
import hvplot.pandas

# Import the K-means algorithm
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [75]:
# Read in the CSV file as a Pandas Dataframe
df = pd.read_csv(Path("resources/baseballData.csv"))
df.set_index("last_name, first_name", inplace=True)
df.drop(columns=["player_id"], inplace=True)


In [76]:


df = df.loc[df['year'] == 2023]
df.head()

Unnamed: 0_level_0,year,player_age,ab,pa,hit,single,double,triple,home_run,strikeout,...,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,popups_percent,popups
"last_name, first_name",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Turner, Justin",2023,38,558,626,154,100,31,0,23,110,...,454,60.7,35.7,162,33.0,150,25.1,114,6.2,28
"Santana, Carlos",2023,37,550,619,132,75,33,1,23,104,...,450,56.2,43.3,195,24.2,109,22.2,100,10.2,46
"Goldschmidt, Paul",2023,35,593,687,159,103,31,0,25,161,...,435,61.3,41.4,180,26.9,117,25.7,112,6.0,26
"Freeman, Freddie",2023,33,637,730,211,121,59,2,29,121,...,521,62.0,35.9,187,29.6,154,30.9,161,3.6,19
"LeMahieu, DJ",2023,34,497,562,121,81,22,3,15,125,...,374,60.6,55.6,208,17.4,65,22.5,84,4.5,17


In [77]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
# Standard scale helps normalize the data by replacing N/A values with true values, helps for morre prediction, for instance if some values for percentage are not divided by 100 then it can help remedy that
#1. Fit Scaler into dataframe
scaler = StandardScaler()
scaler.fit(df)

In [78]:
# Create a list with the number of k-values from 1 to 11
# Create a DataFrame with the scaled data
normal = scaler.transform(df)
# Create an empty list to store the inertia values
kValue = list(range(1, 12))

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
inertia = []
for x in kValue:
    kmeans = KMeans(n_clusters=x, random_state=0)
    kmeans.fit(df)
    inertia.append(kmeans.inertia_)
print(inertia)


[8657437.16747104, 4568929.548357278, 3659633.2421452813, 3323239.1467085015, 3084114.1389654735, 2803741.263760714, 2642483.0797080607, 2562802.5026099863, 2325578.368102037, 2203953.797262132, 2108320.7076913184]


In [79]:
# Create a dictionary with the data to plot the Elbow curve
elbowDict = {'k' : kValue, 'inertia': inertia}

# Create a DataFrame with the data to plot the Elbow curve
elbowDf = pd.DataFrame(elbowDict)
elbowDf

Unnamed: 0,k,inertia
0,1,8657437.0
1,2,4568930.0
2,3,3659633.0
3,4,3323239.0
4,5,3084114.0
5,6,2803741.0
6,7,2642483.0
7,8,2562803.0
8,9,2325578.0
9,10,2203954.0


In [80]:
originalElbow = elbowDf.hvplot.line(
    x='k',
    y='inertia',
    xlabel='k-values',
    ylabel='inertia',
    title='inertia for each k-value',
)
originalElbow

In [84]:
# Create and initialize the K-means model instance for 3 clusters
model = KMeans(n_clusters=2, random_state=1)

# Print the model
model
# Fit the data to the instance of the model
model.fit(df)
# Make predictions about the data clusters using the trained model
customer_segment_3 = model.predict(df)

# Print the predictions
print(customer_segment_3)

[1 1 1 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 1 1
 1 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 1 0 0 1 0
 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0]


In [85]:
predictDF = df
predictDF['pred_cluster'] = customer_segment_3

In [86]:
cryptoCluster = predictDF.hvplot.scatter(
    x='on_base_percent',
    y='slg_percent',
    by='pred_cluster',
    hover_cols='last_name, first_name',
    xlabel='On Base Percent',
    ylabel='Slugging',
    alpha=0.5
)
cryptoCluster