In [1]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes

In [2]:
# fetch dataset
zoo = fetch_ucirepo(id=111)

# data (as pandas dataframes)
X = zoo.data.features
y = zoo.data.targets

In [3]:
""" X data is the information on all of the various animals in our zoo.
X data is what will be given to the model to predict type. """

X.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1


In [4]:
y.head()

Unnamed: 0,type
0,1
1,1
2,4
3,1
4,1


In [5]:
df = zoo.data.original

In [6]:
# Checking the various types of animal we have
y['type'].value_counts()

type
1    41
2    20
4    13
7    10
6     8
3     5
5     4
Name: count, dtype: int64

In [7]:
# What does the zoo look like, how is it split up?
zoo.data.keys()

dict_keys(['ids', 'features', 'targets', 'original', 'headers'])

In [8]:
# What does the 'ids' key contain as values?
zoo.data['ids'].head()

Unnamed: 0,animal_name
0,aardvark
1,antelope
2,bass
3,bear
4,boar


Trying to match the animal's features to the type of animal they are.
Types are given a number one through seven. The number represents mammal, reptile, bird, etc.
This includes insect for six and invertebrate for seven. Since the data is categorical and we know the number of categories,
we can use K-means approach to group the data into clusters. NOTE, this was my thinking before doing any research, it is clear based on the following that K-means does not work well for categorical features since it uses euclidean distance and mean to update centroids.

In [9]:
# Assumption of K-means modelling is that there is no missing data. Check if we can make this assumption.
nulls_original_df = zoo.data['original'][zoo.data['original'].isna().any(axis=1)]
nulls_original_df

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type


No empty data in the 'original' dataframe. Check the X and y data also.

In [10]:
# Checking X data for null values
nulls_X_df = X[X.isna().any(axis=1)]
nulls_X_df

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize


In [11]:
# Checking y data for null values
nulls_y_df = y[y.isna().any(axis=1)]
nulls_y_df

Unnamed: 0,type


No empty data so we good

Now for some feature engineering

In [12]:
# Renaming the type column to the name of the type
type_dict = {"Mammal":1, "Bird":2, "Reptile":3, "Fish":4, "Amphibian":5, "Insect":6, "Invertebrate":7}
df['type'] = df['type'].replace(to_replace = type_dict.values(), value = type_dict.keys())

In [13]:
df.groupby(['domestic', 'type']).size()

domestic  type        
0         Amphibian        4
          Bird            17
          Fish            12
          Insect           7
          Invertebrate    10
          Mammal          33
          Reptile          5
1         Bird             3
          Fish             1
          Insect           1
          Mammal           8
dtype: int64

In [14]:
X['six_legs'] = (X['legs'] == 6).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['six_legs'] = (X['legs'] == 6).astype(int)


In [15]:
X_restricted = X.drop(columns=['eggs', 'venomous', 'domestic', 'predator', 'legs'])

In [16]:
X_restricted.head()

Unnamed: 0,hair,feathers,milk,airborne,aquatic,toothed,backbone,breathes,fins,tail,catsize,six_legs
0,1,0,1,0,0,1,1,1,0,0,1,0
1,1,0,1,0,0,1,1,1,0,1,1,0
2,0,0,0,0,1,1,1,0,1,1,0,0
3,1,0,1,0,0,1,1,1,0,0,1,0
4,1,0,1,0,0,1,1,1,0,1,1,0


Now we can start modeling, since all columns are numeric and relevant.
Hang on, do we need use K means since our features are not numeric but binary? 
My prediciton is that the model will do extremely well for K means, since the features are either 1 or 0,
the clusters are just binary.

In [17]:
k_means = KMeans(n_clusters=7, random_state=42).fit(X_restricted)

  super()._check_params_vs_input(X, default_n_init=10)
found 0 physical cores < 1
  File "C:\Users\benda\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [18]:
df['kmeans_results'] = k_means.labels_

In [19]:
df.groupby(['kmeans_results', 'type']).size()

kmeans_results  type        
0               Bird            20
                Reptile          1
1               Mammal           9
                Reptile          3
2               Fish            13
                Mammal           2
                Reptile          1
3               Insect           8
                Invertebrate     3
4               Mammal          30
5               Amphibian        4
6               Invertebrate     7
dtype: int64

Hmm the model is not doing well at all. I have tried various different feature engineering strategies to see if that is the problem but similar issues with results. I think it is because K-means uses euclidian disctance between points in a cluster to move the centroids. Since our features are binary, we have only ones and zeros in "number of column" dimensional space. Thus a centroid picked at random is always in between binary data points and is not itself binary. Lets try Kmodes instead.


In [20]:
kmode = KModes(n_clusters=7, init = "random", n_init = 5, verbose=1).fit_predict(X_restricted)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 14, cost: 71.0
Run 1, iteration: 2/100, moves: 2, cost: 71.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 18, cost: 92.0
Run 2, iteration: 2/100, moves: 10, cost: 92.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 19, cost: 87.0
Run 3, iteration: 2/100, moves: 2, cost: 87.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 13, cost: 88.0
Run 4, iteration: 2/100, moves: 2, cost: 88.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 19, cost: 81.0
Run 5, iteration: 2/100, moves: 8, cost: 80.0
Run 5, iteration: 3/100, moves: 0, cost: 80.0
Best run was number 1


In [21]:
df['kmodes_results'] = kmode

In [22]:
df.groupby(['kmodes_results', 'type']).size()

kmodes_results  type        
0               Amphibian        4
                Reptile          4
1               Insect           8
2               Invertebrate     7
3               Bird            20
4               Fish            13
                Mammal           2
                Reptile          1
5               Invertebrate     3
6               Mammal          39
dtype: int64

Kmodes is doing much better than Kmeans, as expected since it uses mode instead of mean to update the clusters. We are still seeing some errors, but im not sure if anything can be done if we persist with Kmodes. It is ambiguous to the model which categories are the most defining. The algorithm chooses clusters based on number of dissimilarities, each feature providing equal weight. We need something that allows us to specify the importance of distinct features, or order the grouping. Note, I dont think we can get any better for things like amphibian vs reptile since the difference is amphibians lay their eggs in water and need water to live. The data provides no way to specify between aquatic reptiles like sea snake or sea turtle, and amphibians like a frog. Both are aquatic, both lay eggs, etc

In [29]:
df_reptile_amphibian = df[(df['type'] == 'Amphibian') | (df['type'] == 'Reptile')]

In [30]:
df_reptile_amphibian

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type,kmeans_results,kmodes_results
25,frog,0,0,1,0,0,1,1,1,1,1,0,0,4,0,0,0,Amphibian,5,0
26,frog,0,0,1,0,0,1,1,1,1,1,1,0,4,0,0,0,Amphibian,5,0
52,newt,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,0,Amphibian,5,0
62,pitviper,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,0,Reptile,1,0
76,seasnake,0,0,0,0,0,1,1,1,1,0,1,0,0,1,0,0,Reptile,2,4
80,slowworm,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,0,Reptile,1,0
89,toad,0,0,1,0,0,1,0,1,1,1,0,0,4,0,0,0,Amphibian,5,0
90,tortoise,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,1,Reptile,0,0
91,tuatara,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,Reptile,1,0


Based on this information, how is a seasnake not a fish? 
A toad and a frog are the same animal barring what they eat? That is false.
You see they have very little dissimilarites, so they get grouped together.
Plus the data itself does not allow one to distinguish. What makes an amphibian not a reptile?