# Overview

This notebook is a scratch space.

In [15]:
import pandas as pd
import numpy as np

In [26]:
data = {'Inst':['Institution A','Institution B',
                'Institution C','Institution D'],
        'Size':[19000,11500,7750,23000],
        'Cost':[22000,19000,12000,10500],
        'isBig':[1,0,0,1],
        'isExpensive':[1,1,0,0,],
        'isSelect':[1,0,0,1]}

In [27]:
df = pd.DataFrame(data)
df

Unnamed: 0,Inst,Size,Cost,isBig,isExpensive,isSelect
0,Institution A,19000,22000,1,1,1
1,Institution B,11500,19000,0,1,0
2,Institution C,7750,12000,0,0,0
3,Institution D,23000,10500,1,0,1


## Jaccard Index

In [4]:
# Goal: Count the number of matching dimensions among two observations
#       Caluclate Jaccard Index distance
a = df[['isBig','isExpensive','isSelect']].iloc[3]
b = df[['isBig','isExpensive','isSelect']].iloc[0]
matches = sum([1 if x == y else 0 for x, y in zip(a,b)])
distance = 1 - (matches / len(b))
print('Number of matching observations : {}'.format(matches))
print('Jacard Index distance           : {}'.format(distance))

Number of matching observations : 2
Jacard Index distance           : 0.33333333333333337


In [29]:
def jaccard(data, ref, cols):
    '''Calculates Jarcard index for all observations
    relative to the referenced observation. Returns a list
    of distances.
    
    data = A dataframe.
    ref = An observation specified with pandas.iloc[]
    cols = A set of comparison columns.'''

    distances = []
    length = len(ref[cols])
    ref_observation = ref[cols]
    for row in data.index.to_list():
        comp_observation = data[cols].iloc[row]
        matches = sum([1 if x == y else 0 for x, y in zip(comp_observation,
                                                          ref_observation)])
        distances.append(1 - (matches / length))
    return(distances)

In [30]:
ref_institution = df.iloc[3]
df['Jaccard'] = jaccard(data=df, ref=ref_institution, 
                        cols=['isBig', 'isExpensive', 'isSelect'])
df

Unnamed: 0,Inst,Size,Cost,isBig,isExpensive,isSelect,Hamming,Jaccard
0,Institution A,19000,22000,1,1,1,2,0.333333
1,Institution B,11500,19000,0,1,0,0,1.0
2,Institution C,7750,12000,0,0,0,1,0.666667
3,Institution D,23000,10500,1,0,1,3,0.0


## Hamming Distance

In [24]:
# Goal: Count the number of matching dimensions among two observations
def hamming(data, ref, cols):
    '''Calculates Hamming distance for all observations
    relative to the referenced observation. Returns a list
    of distances.
    
    data = A dataframe.
    ref = An observation specified with pandas.iloc[]
    cols = A set of comparison columns.'''

    distances = []
    ref_observation = ref[cols]
    for row in data.index.to_list():
        comp_observation = data[cols].iloc[row]
        matches = sum([1 if x == y else 0 for x, y in zip(comp_observation,
                                                          ref_observation)])
        distances.append(matches)
    return(distances)

In [28]:
ref_institution = df.iloc[3]
df['Hamming'] = hamming(data=df, ref=ref_institution, 
                        cols=['isBig', 'isExpensive', 'isSelect'])
df

Unnamed: 0,Inst,Size,Cost,isBig,isExpensive,isSelect,Hamming
0,Institution A,19000,22000,1,1,1,2
1,Institution B,11500,19000,0,1,0,0
2,Institution C,7750,12000,0,0,0,1
3,Institution D,23000,10500,1,0,1,3


## Euclidean Distance

In [32]:
# Goal: Caluclate Jaccard Index distance
from math import sqrt
sqrt(3**2 + 5**2)

5.830951894845301