# Dog Breeds
## Tidy Tuesday

Richard Bamattre
2024-10-08

In [1]:
import pandas as pd

StatementMeta(, , -1, SessionStarting, , SessionStarting)

In [None]:
# The three tables of this TidyTuesday release (2022-02-01) share one folder.
DATA_DIR = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-02-01/'

# Trait table, trait descriptions and breed rankings, in that order.
breed_traits = pd.read_csv(DATA_DIR + 'breed_traits.csv')

trait_description = pd.read_csv(DATA_DIR + 'trait_description.csv')

breed_rank_all = pd.read_csv(DATA_DIR + 'breed_rank.csv')


StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Size of the traits table as (rows, columns).
breed_traits.shape

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Size of the trait-description table as (rows, columns).
trait_description.shape

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Size of the rankings table as (rows, columns).
breed_rank_all.shape

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Render the traits table in the cell output.
# NOTE(review): `display` is not imported in this file — presumably the notebook built-in.
display(breed_traits)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Render the trait-description table in the cell output.
# NOTE(review): `display` is not imported in this file — presumably the notebook built-in.
display(trait_description)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Render the rankings table in the cell output.
# NOTE(review): `display` is not imported in this file — presumably the notebook built-in.
display(breed_rank_all)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# List the column names of the rankings table before reshaping it.
breed_rank_all.columns

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Reshape the rankings from wide to long: one row per breed and rank column.
# The melted column names presumably look like "<year> Rank", so stripping
# ' Rank' leaves the year, which is then converted to a number.
rank = (
    breed_rank_all
    .melt(id_vars=['Breed', 'links', 'Image'], value_name='rank')
    .assign(year=lambda long: pd.to_numeric(long['variable'].str.replace(' Rank', '')))
    .drop(columns='variable')
)

rank

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Best (numerically lowest) rank each breed reached across all years.
top_rank = rank.groupby('Breed')[['rank']].min()

top_rank.sort_values(by='rank')

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Breeds whose best rank in any year was 10 or better, i.e. breeds that
# appeared in the top 10 at least once (not a separate top 10 for each year).
top10 = top_rank.loc[top_rank['rank'].le(10)]

top10.sort_values(by='rank')

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# The index of top10 holds the breed names (it comes from the groupby on 'Breed').
top10.index

StatementMeta(, , , Waiting, , Waiting)

In [None]:
import seaborn as sns

# All yearly ranks, restricted to the breeds listed in top10.
plot_data = rank.loc[rank['Breed'].isin(top10.index)]

# One line per breed showing its rank over the years.
g = sns.lineplot(
    data=plot_data,
    x='year',
    y='rank',
    hue='Breed',
    palette='Dark2',
)

# Rank 1 is the best, so flip the y axis to put it at the top.
g.invert_yaxis()

# Place the legend outside the axes, anchored at the top-right corner.
sns.move_legend(g, loc='upper left', bbox_to_anchor=(1, 1))

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Breed names do not match up cleanly between the traits and rank tables,
# so an exact string comparison silently drops breeds.
# NOTE(review): the traits file appears to use non-breaking spaces inside
# multi-word breed names — confirm. Comparing whitespace-normalised names
# makes the match independent of which kind of space each file uses.
def _clean_breed(names):
    """Collapse each run of whitespace (incl. non-breaking spaces) to one space."""
    return names.str.replace(r'\s+', ' ', regex=True).str.strip()


# Only the comparison uses the cleaned names; the rows keep their original values.
traits_top10 = breed_traits.loc[
    _clean_breed(breed_traits['Breed']).isin(_clean_breed(plot_data['Breed']))
]

traits_top10

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Dot plot of several trait columns across many breeds, adapted from
# https://seaborn.pydata.org/examples/pairgrid_dotplot.html

# Limit the plot to the first 25 breeds so the y axis stays legible.
traits_head = breed_traits.head(n = 25)

sns.set_theme(style="whitegrid")

# Take the trait columns from the frame that is actually plotted, so this cell
# does not depend on traits_top10 having been built first.
g = sns.PairGrid(
    traits_head.sort_values('Affectionate With Family'),
    x_vars = traits_head.columns[1:6],
    y_vars = 'Breed',
    height = 10,
    aspect = .25,
)

# One dot per breed in every panel.
g.map(sns.stripplot, size = 10, orient = 'h', linewidth = 1, edgecolor = 'w')

# Same x range in every panel (NOTE(review): scores look like a 1-5 scale — confirm).
g.set(xlim = (0, 6))

for ax in g.axes.flat:

    # Make the grid horizontal instead of vertical
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)

    # Set x axis labels
    ax.set_xticks([1, 3, 5])

g

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Same dot plot for a second group of trait columns, with breeds ordered by
# 'Barking Level'.

# Take the trait columns from the frame that is actually plotted, so this cell
# does not depend on traits_top10 having been built first.
g = sns.PairGrid(
    traits_head.sort_values('Barking Level'),
    x_vars = traits_head.columns[9:16],
    y_vars = 'Breed',
    height = 10,
    aspect = .25,
)

# One dot per breed in every panel.
g.map(sns.stripplot, size = 10, orient = 'h', linewidth = 1, edgecolor = 'w')

# Same x range in every panel (NOTE(review): scores look like a 1-5 scale — confirm).
g.set(xlim = (0, 6))

for ax in g.axes.flat:

    # Make the grid horizontal instead of vertical
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)

    # Set x axis labels
    ax.set_xticks([1, 3, 5])

g

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Conventional PairGrid: every pair of the trait columns at positions 9-15
# plotted against each other with sns.regplot.
g = sns.PairGrid(data=breed_traits[breed_traits.columns[9:16]])

g.map(sns.regplot)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# The same pairwise regplot grid for the trait columns at positions 1-5.
g = sns.PairGrid(data=breed_traits[breed_traits.columns[1:6]])

g.map(sns.regplot)

StatementMeta(, , , Waiting, , Waiting)

## Clustering

In [None]:
# The two coat columns are non-numeric, so leave them out of the clustering input.
breeds = breed_traits.drop(['Coat Type', 'Coat Length'], axis=1)

breeds

StatementMeta(, , , Waiting, , Waiting)

In [None]:
from sklearn.cluster import KMeans

# Fit 5 clusters on the trait scores. 'kmean' is dropped together with 'Breed'
# (errors='ignore' covers the first run, when the column does not exist yet)
# so that re-running this cell after the labels were attached to `breeds`
# does not feed the previous labels back in as a feature.
# random_state is fixed so the clustering is reproducible.
kmeans = KMeans(n_clusters = 5, random_state = 0, n_init = 'auto').fit(
    breeds.drop(columns = ['Breed', 'kmean'], errors = 'ignore')
)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Store each breed's cluster label next to its traits.
breeds = breeds.assign(kmean=kmeans.labels_)

breeds

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# conduct PCA for mapping

from sklearn.decomposition import PCA

# Reduce the trait scores to two components for plotting.
# By now `breeds` also carries the 'kmean' cluster labels; they are arbitrary
# category ids, not a trait, so they are dropped together with 'Breed' to keep
# them from distorting the projection. errors='ignore' keeps the cell usable
# when the labels have not been attached yet.
# Note: `pca` holds the transformed coordinates, not the fitted PCA object.
pca = PCA(n_components = 2).fit_transform(
    breeds.drop(columns = ['Breed', 'kmean'], errors = 'ignore')
)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Inspect the two PCA components as a table.
pd.DataFrame(pca)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Put the PCA components (columns 0 and 1) next to the clustered traits,
# matching rows on the index, then add each breed's image URL.
# NOTE(review): the image column is aligned by row index, which assumes the
# rank and trait files list the breeds in the same order — confirm.
breeds_final = (
    breeds
    .join(pd.DataFrame(pca), how='inner')
    .assign(image=breed_rank_all['Image'])
)

breeds_final

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Check the column names; the PCA components are added as the integer labels 0 and 1.
breeds_final.columns

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Plot the breeds on the two PCA components (columns 0 and 1), coloured by k-means cluster.
sns.scatterplot(breeds_final, x = 0, y = 1, hue = 'kmean', palette = 'Accent')

StatementMeta(, , , Waiting, , Waiting)

In [None]:
# Work with the first 25 breeds only.
breeds_head = breeds_final.iloc[:25]

# To look at a single cluster instead:
# breeds_final.loc[breeds_final['kmean'] == 1].head(n = 25)

StatementMeta(, , , Waiting, , Waiting)

In [None]:
from io import BytesIO
from urllib.request import urlopen

import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox


def getImage(path, zoom=.05):
    """Read an image and wrap it in an OffsetImage so it can be used as a marker.

    path: file path or binary file-like object, passed on to plt.imread.
    zoom: scale factor applied to the image.
    """
    return OffsetImage(plt.imread(path, format = 'jpeg'), zoom=zoom)


def _download(url, timeout=30):
    """Fetch `url` and return its content as a seekable in-memory buffer."""
    with urlopen(url, timeout=timeout) as response:
        return BytesIO(response.read())


paths = breeds_head['image']

# PCA coordinates and cluster label of each breed.
x = breeds_head[0]
y = breeds_head[1]
color = breeds_head['kmean']

fig, ax = plt.subplots()
ax.scatter(x, y, c = color)

for x0, y0, path in zip(x, y, paths):

    # Rows without an image URL (e.g. NaN) keep their plain scatter marker.
    if not isinstance(path, str):
        continue

    # The original called `requests.get`, but `requests` is never imported in
    # this notebook; the standard library download avoids that NameError.
    ab = AnnotationBbox(getImage(_download(path)), (x0, y0), frameon=False)
    ax.add_artist(ab)

fig.set_size_inches(8, 8)

StatementMeta(, , , Waiting, , Waiting)