In [27]:
# Import the modules
import pandas as pd
from pathlib import Path
import hvplot.pandas

# Import the K-means algorithm
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [33]:
# Read in the CSV file as a Pandas Dataframe
df = pd.read_csv(Path("resources/baseballData.csv"))


In [34]:

df.set_index("last_name, first_name", inplace=True)
df.drop(columns=["player_id"])
df.head()

Unnamed: 0_level_0,player_id,year,player_age,ab,pa,hit,single,double,triple,home_run,...,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,popups_percent,popups
"last_name, first_name",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Hunter, Torii",116338,2015,39,521,567,125,81,22,0,22,...,421,64.0,49.4,208,22.6,95,20.9,88,7.1,30
"Ortiz, David",120074,2015,39,528,614,144,70,37,0,37,...,442,52.1,37.6,166,25.6,113,28.7,127,8.1,36
"Rodriguez, Alex",121347,2015,39,523,620,131,75,22,1,33,...,385,60.3,43.6,168,24.9,96,24.9,96,6.5,25
"Ramirez, Aramis",133380,2015,37,475,516,117,68,31,1,17,...,412,65.5,37.6,155,24.5,101,29.1,120,8.7,36
"Beltré, Adrián",134181,2015,36,567,619,163,109,32,4,18,...,510,61.9,42.4,216,18.4,94,31.6,161,7.6,39


In [35]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
# Standard scale helps normalize the data by replacing N/A values with true values, helps for morre prediction, for instance if some values for percentage are not divided by 100 then it can help remedy that
#1. Fit Scaler into dataframe
scaler = StandardScaler()
scaler.fit(df)

In [38]:
# Create a list with the number of k-values from 1 to 11
# Create a DataFrame with the scaled data
normal = scaler.transform(df)
# Create an empty list to store the inertia values
kValue = list(range(1, 12))

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
inertia = []
for x in kValue:
    kmeans = KMeans(n_clusters=x, random_state=0)
    kmeans.fit(df)
    inertia.append(kmeans.inertia_)
print(inertia)


[10728303988601.203, 3758960329180.6714, 2320160219454.504, 1051153854403.6672, 447302037021.35535, 362984465074.7594, 270344735437.2236, 209711153525.44952, 158565914349.83514, 139818033998.80936, 102564011888.85078]


In [39]:
# Create a dictionary with the data to plot the Elbow curve
elbowDict = {'k' : kValue, 'inertia': inertia}

# Create a DataFrame with the data to plot the Elbow curve
elbowDf = pd.DataFrame(elbowDict)
elbowDf

Unnamed: 0,k,inertia
0,1,10728300000000.0
1,2,3758960000000.0
2,3,2320160000000.0
3,4,1051154000000.0
4,5,447302000000.0
5,6,362984500000.0
6,7,270344700000.0
7,8,209711200000.0
8,9,158565900000.0
9,10,139818000000.0


In [40]:
originalElbow = elbowDf.hvplot.line(
    x='k',
    y='inertia',
    xlabel='k-values',
    ylabel='inertia',
    title='inertia for each k-value',
)
originalElbow

In [45]:
# Create and initialize the K-means model instance for 3 clusters
model = KMeans(n_clusters=5, random_state=1)

# Print the model
model
# Fit the data to the instance of the model
model.fit(df)
# Make predictions about the data clusters using the trained model
customer_segment_3 = model.predict(df)

# Print the predictions
print(customer_segment_3)

[2 2 2 ... 3 3 3]


In [46]:
predictDF = df
predictDF['pred_cluster'] = customer_segment_3

In [48]:
cryptoCluster = predictDF.hvplot.scatter(
    x='on_base_percent',
    y='slg_percent',
    by='pred_cluster',
    hover_cols='coin_id',
    xlabel='On Base Percent',
    ylabel='Slugging',
)
cryptoCluster