# Zoo Animal Classification


Use Machine Learning Methods to Correctly Classify Animals Based Upon Attributes.
Dataset by Kaggle. More information can be found [here](https://www.kaggle.com/uciml/zoo-animal-classification).

In [42]:
# Get or create a spark session

from pyspark.sql import SparkSession 

# Reuse the active Spark session if one exists; otherwise start a local one
# that uses all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Intro")
    .getOrCreate()
)

# Provide a custom schema for the data

In [43]:

from pyspark.sql.types import StructField, StructType, StringType, DoubleType
# Note: although most of the columns hold integer values, the custom schema
# declares them as DoubleType, because the statistics functionality used
# later expects a numeric (double) type.

# Attribute columns, in the order in which they are declared in the schema.
numeric_columns = [
    "hair", "feathers", "eggs", "milk", "airborne", "aquatic", "predator",
    "toothed", "backbone", "breathes", "venomous", "fins", "legs", "tail",
    "domestic", "catsize",
]

# `animal_name` and `class_type` are kept as strings; everything in between
# is a nullable double column.
custom_schema = StructType(
    [StructField("animal_name", StringType(), True)]
    + [StructField(name, DoubleType(), True) for name in numeric_columns]
    + [StructField("class_type", StringType(), True)]
)

In [44]:
# Load the zoo CSV (its first line is a header) using the custom schema
# instead of letting Spark infer the column types.
zoo_data = (
    spark.read.format("csv")
    .schema(custom_schema)
    .option("header", True)
    .load("../datasets/zoo.csv")
)

In [45]:
# Peek at the first row to check that the CSV was parsed with the custom schema.
zoo_data.take(1)

[Row(animal_name='aardvark', hair=1.0, feathers=0.0, eggs=0.0, milk=1.0, airborne=0.0, aquatic=0.0, predator=1.0, toothed=1.0, backbone=1.0, breathes=1.0, venomous=0.0, fins=0.0, legs=4.0, tail=0.0, domestic=0.0, catsize=1.0, class_type='1')]

In [46]:
# Show the column names and types of the loaded DataFrame.
zoo_data.printSchema()

root
 |-- animal_name: string (nullable = true)
 |-- hair: double (nullable = true)
 |-- feathers: double (nullable = true)
 |-- eggs: double (nullable = true)
 |-- milk: double (nullable = true)
 |-- airborne: double (nullable = true)
 |-- aquatic: double (nullable = true)
 |-- predator: double (nullable = true)
 |-- toothed: double (nullable = true)
 |-- backbone: double (nullable = true)
 |-- breathes: double (nullable = true)
 |-- venomous: double (nullable = true)
 |-- fins: double (nullable = true)
 |-- legs: double (nullable = true)
 |-- tail: double (nullable = true)
 |-- domestic: double (nullable = true)
 |-- catsize: double (nullable = true)
 |-- class_type: string (nullable = true)



# Calculate statistics
For this, we will use the Summarizer functionality.

In [47]:
# The statistics functionality works on a vector column, which we assemble
# from numeric columns only. Drop `animal_name`, a string column we do not
# need at the moment.
#
# `class_type` is deliberately kept: it is the label column that the
# StringIndexer / ChiSquareTest cells further down read from this DataFrame.
# (The previous call passed the misspelled name 'lass_type'; `drop` silently
# ignores unknown column names, so `class_type` was never actually dropped.
# Dropping only `animal_name` produces the same DataFrame without the
# misleading typo.)
zoo_data_for_statistics = zoo_data.drop('animal_name')

## Turn the columns into a vector

Notice that, to simplify the example, we are going to examine only the following columns:

* feathers
* milk
* fins
* domestic 

In [48]:
from pyspark.ml.feature import VectorAssembler

# Use the VectorAssembler transformer, as described in the book under
# "transformers" in chapter 3. Only part of the columns is assembled, to
# keep the example small.
vecAssembler = VectorAssembler(
    inputCols=["feathers", "milk", "fins", "domestic"],
    outputCol="features",
)

# Append the assembled `features` vector column to the DataFrame.
vector_df = vecAssembler.transform(zoo_data_for_statistics)


In [49]:
# Confirm that the assembled `features` vector column was added.
vector_df.printSchema()

root
 |-- hair: double (nullable = true)
 |-- feathers: double (nullable = true)
 |-- eggs: double (nullable = true)
 |-- milk: double (nullable = true)
 |-- airborne: double (nullable = true)
 |-- aquatic: double (nullable = true)
 |-- predator: double (nullable = true)
 |-- toothed: double (nullable = true)
 |-- backbone: double (nullable = true)
 |-- breathes: double (nullable = true)
 |-- venomous: double (nullable = true)
 |-- fins: double (nullable = true)
 |-- legs: double (nullable = true)
 |-- tail: double (nullable = true)
 |-- domestic: double (nullable = true)
 |-- catsize: double (nullable = true)
 |-- class_type: string (nullable = true)
 |-- features: vector (nullable = true)



In [50]:
from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Metrics to compute in a single pass over the `features` vector column.
summary_metrics = [
    "mean", "variance", "normL1", "normL2", "std",
    "sum", "numNonZeros", "max", "min",
]

# Create one summarizer that covers all of the metrics above.
summarizer = Summarizer.metrics(*summary_metrics)

# Compute the statistics for all metrics. No weight column is passed here.
statistics_df = vector_df.select(
    summarizer.summary(vector_df["features"])
)

statistics_df.show(truncate=False)


+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|aggregate_metrics(features, 1.0)                                                                                                                                                                                                                                                                                                                                                                                                           |
+-------------------------------------------------------------------------------------------------------------------------------------------

Notice that the statistics dataframe has only one column, named aggregate_metrics. This column is a struct that holds several nested columns, each of which is a vector.

In [51]:
# Inspect the nested structure of the single summary column.
statistics_df.printSchema()

root
 |-- aggregate_metrics(features, 1.0): struct (nullable = false)
 |    |-- mean: vector (nullable = false)
 |    |-- variance: vector (nullable = false)
 |    |-- normL1: vector (nullable = false)
 |    |-- normL2: vector (nullable = false)
 |    |-- std: vector (nullable = false)
 |    |-- sum: vector (nullable = false)
 |    |-- numNonZeros: vector (nullable = false)
 |    |-- max: vector (nullable = false)
 |    |-- min: vector (nullable = false)



To enable easier access to the data, we can use the explode functionality, which flattens one level of the hierarchy:

In [52]:
# Compute only the "std" metric, without the rest.
vector_df.select(
    Summarizer.std(vector_df["features"])
).show(truncate=False)

+-------------------------------------------------------------------------------+
|std(features)                                                                  |
+-------------------------------------------------------------------------------+
|[0.4004947435409863,0.4935223970962651,0.37601348195757744,0.33655211592363116]|
+-------------------------------------------------------------------------------+



From [wikipedia](https://en.wikipedia.org/wiki/Standard_deviation) std - Standard deviation is a measure of the amount of variation or dispersion of a set of values. A low standard deviation indicates that the values tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the values are spread out over a wider range. 

Looking at the resulting vector, the standard deviation (the spread around the mean) of each individual feature is lower than 0.5.
Our features: "feathers","milk","fins","domestic"

The main reason for this is that the data should really be represented as boolean, since each feature is a yes/no feature.
Feathers =1 , means that this animal has feathers and so on.

Now that we know this, let's take a look at sum, which will tell us how many animals in the database have feathers, give milk, have fins, or are domestic.

In [53]:
# Compute only the "sum" metric, without the rest.
vector_df.select(
    Summarizer.sum(vector_df["features"])
).show(truncate=False)

+---------------------+
|sum(features)        |
+---------------------+
|[20.0,41.0,17.0,13.0]|
+---------------------+



`sum` provides us with more relatable information that we can use to understand the data.

In [54]:
# Compute only the "variance" metric, without the rest.
vector_df.select(
    Summarizer.variance(vector_df["features"])
).show(truncate=False)

+-------------------------------------------------------------------------------+
|variance(features)                                                             |
+-------------------------------------------------------------------------------+
|[0.1603960396039604,0.24356435643564356,0.1413861386138614,0.11326732673267326]|
+-------------------------------------------------------------------------------+



In [55]:
# Compute only the "count" metric, without the rest.
vector_df.select(
    Summarizer.count(vector_df["features"])
).show(truncate=False)

+---------------+
|count(features)|
+---------------+
|101            |
+---------------+



In [56]:
# Compute only the "numNonZeros" metric, without the rest.
vector_df.select(
    Summarizer.numNonZeros(vector_df["features"])
).show(truncate=False)

+---------------------+
|numNonZeros(features)|
+---------------------+
|[20.0,41.0,17.0,13.0]|
+---------------------+



In [57]:
# Compute only the "max" metric, without the rest.
vector_df.select(
    Summarizer.max(vector_df["features"])
).show(truncate=False)

+-----------------+
|max(features)    |
+-----------------+
|[1.0,1.0,1.0,1.0]|
+-----------------+



In [58]:
# Compute only the "normL1" metric, without the rest.
vector_df.select(
    Summarizer.normL1(vector_df["features"])
).show(truncate=False)

+---------------------+
|normL1(features)     |
+---------------------+
|[20.0,41.0,17.0,13.0]|
+---------------------+



In [59]:
# Compute only the "normL2" metric, without the rest.
vector_df.select(
    Summarizer.normL2(vector_df["features"])
).show(truncate=False)

+-------------------------------------------------------------------------+
|normL2(features)                                                         |
+-------------------------------------------------------------------------+
|[4.47213595499958,6.4031242374328485,4.123105625617661,3.605551275463989]|
+-------------------------------------------------------------------------+



# Testing features correlations
Having looked at each feature's statistics on its own, let's now understand the correlation between the features.

### Notice
This functionality also requires a vector, we will use the one from the earlier computation - `vector_df`

In [60]:
from pyspark.ml.stat import Correlation
from pyspark.ml.stat import KolmogorovSmirnovTest

# Pearson correlation between the assembled features. "pearson" is the
# default method and is spelled out here for symmetry with the Spearman call.
r1 = Correlation.corr(vector_df, "features", "pearson").head()
print("Pearson correlation matrix:\n" + str(r1[0]) + "\n")

# Spearman (rank) correlation over the same feature vector.
r2 = Correlation.corr(vector_df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))

Pearson correlation matrix:
DenseMatrix([[ 1.        , -0.41076061, -0.22354106,  0.03158624],
             [-0.41076061,  1.        , -0.15632771,  0.16392762],
             [-0.22354106, -0.15632771,  1.        , -0.09388671],
             [ 0.03158624,  0.16392762, -0.09388671,  1.        ]])

Spearman correlation matrix:
DenseMatrix([[ 1.        , -0.41076061, -0.22354106,  0.03158624],
             [-0.41076061,  1.        , -0.15632771,  0.16392762],
             [-0.22354106, -0.15632771,  1.        , -0.09388671],
             [ 0.03158624,  0.16392762, -0.09388671,  1.        ]])


A breakdown of the correlation matrix is in the book, chapter 3 under statistics.


In [61]:
from pyspark.ml.stat import KolmogorovSmirnovTest

## ChiSquareTest

Testing the p-value of the columns:

This requires a vector as well. Hence we will use the precomputed vector from before.

Notice that the label, in this case, has to be of a numeric type.
To transform the label into a numeric one, we will use the StringIndexer transformer functionality.

In [62]:
from pyspark.ml.feature import StringIndexer

# Map the string `class_type` column to a numeric `label` column.
indexer = StringIndexer(inputCol="class_type", outputCol="label")

# Fit the indexer on the data, then apply the fitted model to the same data.
indexer_model = indexer.fit(vector_df)
indexed_lable = indexer_model.transform(vector_df)


In [63]:
# Confirm that the numeric `label` column was added next to `features`.
indexed_lable.printSchema()

root
 |-- hair: double (nullable = true)
 |-- feathers: double (nullable = true)
 |-- eggs: double (nullable = true)
 |-- milk: double (nullable = true)
 |-- airborne: double (nullable = true)
 |-- aquatic: double (nullable = true)
 |-- predator: double (nullable = true)
 |-- toothed: double (nullable = true)
 |-- backbone: double (nullable = true)
 |-- breathes: double (nullable = true)
 |-- venomous: double (nullable = true)
 |-- fins: double (nullable = true)
 |-- legs: double (nullable = true)
 |-- tail: double (nullable = true)
 |-- domestic: double (nullable = true)
 |-- catsize: double (nullable = true)
 |-- class_type: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [64]:
# Look at the assembled feature vector of the first row.
indexed_lable.select("features").take(1)

[Row(features=SparseVector(4, {1: 1.0}))]

In [65]:
# Display the DataFrame's repr (column names and types) as the cell output.
indexed_lable

DataFrame[hair: double, feathers: double, eggs: double, milk: double, airborne: double, aquatic: double, predator: double, toothed: double, backbone: double, breathes: double, venomous: double, fins: double, legs: double, tail: double, domestic: double, catsize: double, class_type: string, features: vector, label: double]

In [66]:

from pyspark.ml.stat import ChiSquareTest

# Run the chi-square test of every feature against the label, then read the
# degrees of freedom from the first result row.
chiSqResult = ChiSquareTest.test(
    indexed_lable,
    featuresCol='features',
    labelCol='label',
)
chiSqResult.select("degreesOfFreedom").collect()[0]

Row(degreesOfFreedom=[6, 6, 6, 6])

In [67]:

# Re-run the test with flatten=True, then collect the results ordered by
# feature index.
chiSqResult = ChiSquareTest.test(
    indexed_lable,
    featuresCol='features',
    labelCol='label',
    flatten=True,
)
row = chiSqResult.orderBy("featureIndex").collect()

# Test statistic of the first collected row (featureIndex 0 in the output).
row[0].statistic


100.99999999999999

In [68]:
# Show all collected chi-square result rows.
row

[Row(featureIndex=0, pValue=0.0, degreesOfFreedom=6, statistic=100.99999999999999),
 Row(featureIndex=1, pValue=0.0, degreesOfFreedom=6, statistic=101.0),
 Row(featureIndex=2, pValue=3.4638958368304884e-14, degreesOfFreedom=6, statistic=75.21350003415999),
 Row(featureIndex=3, pValue=0.5681588672220808, degreesOfFreedom=6, statistic=4.8118701947677085)]

Reminder that for simplifying the example, we used the following columns:
* feathers
* milk
* fins
* domestic