In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import hvplot.pandas
import seaborn as sns

# Read the cleaned breed dataset from the Resources folder
file_path = "Resources/akc-data-cleaned_2.csv"
breed_df_cleaned = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
breed_df_cleaned.head(5)

Unnamed: 0,Dog Breed,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_category,shedding_category,energy_level_category,trainability_category,demeanor_category
0,Affenpinscher,22.86,29.21,3.175147,4.535924,12.0,15.0,2-3 Times a Week Brushing,Seasonal,Regular Exercise,Easy Training,Outgoing
1,Afghan Hound,63.5,68.58,22.679619,27.215542,12.0,15.0,Daily Brushing,Infrequent,Energetic,May be Stubborn,Aloof/Wary
2,Airedale Terrier,58.42,58.42,22.679619,31.751466,11.0,14.0,2-3 Times a Week Brushing,Occasional,Regular Exercise,Eager to Please,Friendly
3,Akita,60.96,71.12,31.751466,58.967008,10.0,13.0,Daily Brushing,Seasonal,Energetic,Eager to Please,Alert/Responsive
4,Alaskan Malamute,58.42,63.5,34.019428,38.555351,10.0,14.0,2-3 Times a Week Brushing,Seasonal,Energetic,Independent,Friendly


In [66]:
# Column groups: label-valued traits vs. numeric measurements
categorical_columns = [
    'grooming_frequency_category',
    'shedding_category',
    'energy_level_category',
    'trainability_category',
    'demeanor_category',
]
numerical_columns = [
    'min_height', 'max_height',
    'min_weight', 'max_weight',
    'min_expectancy', 'max_expectancy',
]

# Keep the breed names aside; they are labels, not clustering features
breed_names = breed_df_cleaned['Dog Breed']

# Expand each categorical column into 0/1 indicator columns
encoder = OneHotEncoder()
encoded_categorical = encoder.fit_transform(breed_df_cleaned[categorical_columns]).toarray()
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical_columns),
)

# Standardize the numeric columns with StandardScaler
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(breed_df_cleaned[numerical_columns])
scaled_numerical_df = pd.DataFrame(scaled_numerical, columns=numerical_columns)

# Join both feature sets column-wise: encoded categories first, then scaled numerics
processed_df = pd.concat([encoded_categorical_df, scaled_numerical_df], axis=1)

# Show the first rows of the combined feature matrix
processed_df.head()

Unnamed: 0,grooming_frequency_category_2-3 Times a Week Brushing,grooming_frequency_category_Daily Brushing,grooming_frequency_category_Occasional Bath/Brush,grooming_frequency_category_Specialty/Professional,grooming_frequency_category_Weekly Brushing,shedding_category_Frequent,shedding_category_Infrequent,shedding_category_Occasional,shedding_category_Regularly,shedding_category_Seasonal,...,demeanor_category_Aloof/Wary,demeanor_category_Friendly,demeanor_category_Outgoing,demeanor_category_Reserved with Strangers,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,-1.419335,-1.395485,-1.174016,-1.192944,0.364592,0.55809
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.398736,1.046422,0.457357,0.043919,0.364592,0.55809
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.046478,0.416252,0.457357,0.291292,-0.176563,0.073865
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.222607,1.203965,1.216135,1.775528,-0.717718,-0.41036
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.046478,0.731337,1.405829,0.662351,-0.717718,0.073865


In [67]:
# Find optimal k using the Elbow Method: fit K-Means once per candidate k and
# record the fitted model's inertia_ for each.
inertia = []
k_values = range(1, 11)  # Candidate cluster counts: k = 1 through 10

# Note: `k` and `kmeans` stay bound at notebook scope after this loop finishes.
for k in k_values:
    # Same random_state and n_init for every k so the fits are comparable
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(processed_df)
    inertia.append(kmeans.inertia_)
print(inertia)

[2245.4406779661012, 1550.45841932594, 1307.943754666376, 1229.5528405875368, 1170.65270903872, 1121.4086571674195, 1075.194826281173, 1047.7790436341681, 1005.1921113204288, 977.263283222072]


In [68]:
# Pair every tested k with its inertia so the Elbow curve can be plotted
elbow_data = dict(k=k_values, inertia=inertia)

# Tabulate the pairs and preview the first rows
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,2245.440678
1,2,1550.458419
2,3,1307.943755
3,4,1229.552841
4,5,1170.652709


In [69]:
# Plot a line chart of inertia against k to visually identify the optimal
# value for k (the "elbow" where the curve stops dropping sharply).
# Ticks are taken from the plotted `k` column itself, one per tested value,
# rather than from a variable left over from an earlier cell.
breed_elbow = df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=df_elbow["k"].tolist()
)
breed_elbow

In [72]:
# Fit the final K-Means model with the chosen k (k=3) and collect the cluster
# label assigned to each row of processed_df
model = KMeans(n_clusters=3, random_state=42, n_init=10)
breed_clusters = model.fit(processed_df).labels_
breed_clusters

array([0, 2, 2, 1, 2, 1, 2, 0, 2, 2, 2, 0, 1, 2, 2, 2, 2, 0, 2, 2, 0, 0,
       2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 1, 0, 1, 1, 1, 2, 1, 2, 0, 2, 0,
       1, 0, 1, 2, 0, 2, 2, 2, 0, 2, 2, 1, 0, 2, 2, 0, 1, 0, 0, 2, 0, 0,
       2, 2, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 1, 1, 2, 0, 2, 0, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 1, 0, 2, 1, 2, 0, 2, 0, 2, 2, 2, 1, 1, 1, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 0, 0, 0, 2, 2, 2, 2, 1, 0, 1, 2, 0,
       0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 1, 0, 2, 2,
       0, 0, 2, 1, 1, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2,
       0, 0, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 2, 0, 1, 0, 0, 0, 0,
       0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, 2, 1, 0, 0,
       1, 0, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 2, 0], dtype=int32)

In [73]:
# Build a new frame from the processed features plus a `k3_Clusters` column
# holding the predicted cluster label of each row; processed_df is left untouched
predicted_df = processed_df.assign(k3_Clusters=breed_clusters)

# Preview the labelled frame
predicted_df.head()

Unnamed: 0,grooming_frequency_category_2-3 Times a Week Brushing,grooming_frequency_category_Daily Brushing,grooming_frequency_category_Occasional Bath/Brush,grooming_frequency_category_Specialty/Professional,grooming_frequency_category_Weekly Brushing,shedding_category_Frequent,shedding_category_Infrequent,shedding_category_Occasional,shedding_category_Regularly,shedding_category_Seasonal,...,demeanor_category_Friendly,demeanor_category_Outgoing,demeanor_category_Reserved with Strangers,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,k3_Clusters
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,-1.419335,-1.395485,-1.174016,-1.192944,0.364592,0.55809,0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.398736,1.046422,0.457357,0.043919,0.364592,0.55809,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.046478,0.416252,0.457357,0.291292,-0.176563,0.073865,2
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.222607,1.203965,1.216135,1.775528,-0.717718,-0.41036,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.046478,0.731337,1.405829,0.662351,-0.717718,0.073865,2


In [75]:
# `predicted_df` holds only the model features and cluster labels, so attach the
# breed names (kept separately in `breed_names`) to give the hover tooltip a
# "Dog Breed" column to read. to_numpy() attaches them by row position.
# NOTE(review): the plotted height/weight values are the standardized ones, not
# the original units — confirm that is intended.
scatter_source_df = predicted_df.assign(**{"Dog Breed": breed_names.to_numpy()})

# Create a scatter plot using hvPlot
scatter_plot = scatter_source_df.hvplot.scatter(
    x="min_weight",  # Feature for x-axis
    y="max_height",  # Feature for y-axis
    by="k3_Clusters",  # Color points by cluster
    hover_cols=["Dog Breed"],  # Show breed name on hover
    title="Dog Breed Clusters by Height and Weight",
    size=100,
    alpha=0.7,
    colormap="Category10"
)

scatter_plot


Note: I don't know if this model is correct — use at your own risk. TODO: try the random forest classifier again and generate synthetic user data.