# Distances

Alípio Jorge and Inês Dutra, 2020, Department of Computer Science, FCUP <br>
Beatriz Gamboa Pereira

In [2]:
import pandas as pd
import numpy as np
import scipy.spatial as sp
iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [2]:
iris.iloc[0]

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

## Calculating the distance between two rows from scratch

In [5]:
iris.iloc[0:2,0:4]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


In [6]:
c1 = iris.iloc[0,0:4]
c2 = iris.iloc[1,0:4]
print(c1)

sepal_length    5.1
sepal_width     3.5
petal_length    1.4
petal_width     0.2
Name: 0, dtype: object


### Manhattan distance

$$d(X, Y)=\sum_{i}\left|X_{i}-Y_{i}\right|$$

In [8]:
print("Mahanttan from scratch: ", round(sum(abs(c1-c2)),5))

Mahanttan from scratch:  0.7


### Euclidean distance

$$d(X, Y)=\sqrt{\sum_{i}\left(X_{i}-Y_{i}\right)^{2}}$$

In [9]:
print("Euclidean from scratch: ", round(np.sqrt(sum((c1-c2)**2)),5))

Euclidean from scratch:  0.53852


### Supremum distance

$$\max _{i}\left|X_{i}-Y_{i}\right|$$

In [10]:
print("Supremum from scratch: ", round(max(abs(c1-c2)),5))

Supremum from scratch:  0.5


### Minkowski distance

$$d(X, Y)=\sqrt{\sum_{i}\left(X_{i}-Y_{i}\right)^{p}}, \quad h \geq 1$$
for $p=1$ we get the Manhattan distance, and for $p=2$ we get the Eucledian distance

In [14]:
p=3
print("Minkowski from scratch: ", round((sum((c1-c2)**p)**(1/p)),5))

Minkowski from scratch:  0.51045


## Pre-defined functions

In [20]:
sm = iris.iloc[0:5,0:4]

print('Euclidean: ')
dm = sp.distance.pdist(sm, metric='euclidean')
dm

Euclidean: 


array([0.53851648, 0.50990195, 0.64807407, 0.14142136, 0.3       ,
       0.33166248, 0.60827625, 0.24494897, 0.50990195, 0.64807407])

In [22]:
sf = sp.distance.squareform(dm)
sf

array([[0.        , 0.53851648, 0.50990195, 0.64807407, 0.14142136],
       [0.53851648, 0.        , 0.3       , 0.33166248, 0.60827625],
       [0.50990195, 0.3       , 0.        , 0.24494897, 0.50990195],
       [0.64807407, 0.33166248, 0.24494897, 0.        , 0.64807407],
       [0.14142136, 0.60827625, 0.50990195, 0.64807407, 0.        ]])

In [23]:
sm

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [24]:
sf[0,1]

0.5385164807134502

## Nominal attributes

In [26]:
s1 = iris.iloc[48:52,4:5]
print(s1)
catdist = sp.distance.pdist(s1,lambda x,y: x==y)
sp.distance.squareform(catdist)

       species
48      setosa
49      setosa
50  versicolor
51  versicolor


array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.]])

In [27]:
# a hybrid distance

num = iris.iloc[48:52,0:4]
numdist = sp.distance.pdist(num, metric='euclidean')

#normalizing the euclidean distance
numdist = numdist/max(numdist)
print('Distance with numeric only: ',numdist)

hybridist = (4 * numdist + catdist) / 5
print('Hybrid distanc: ',hybridist)

Distance with numeric only:  [0.12614227 0.95235327 0.8623074  1.         0.90117562 0.15840391]
Hybrid distanc:  [0.30091382 0.76188261 0.68984592 0.8        0.7209405  0.32672313]
