# Overview

This notebook is a scratch space.

In [1]:
import pandas as pd
import numpy as np

from math import sqrt
from scipy.spatial import distance
from scipy.stats import zscore

In [2]:
# Toy dataset: one list per column, one entry per institution (A-D, in order).
data = {
    'Inst': [
        'Institution A',
        'Institution B',
        'Institution C',
        'Institution D',
    ],
    # Numeric attributes
    'Size': [19000, 11500, 7750, 23000],
    'Cost': [22000, 19000, 12000, 10500],
    'Accept Rt': [0.25, 0.45, 0.76, 0.99],
    # 0/1 flags
    'isBig': [1, 0, 0, 1],
    'isExpensive': [1, 1, 0, 0],
    'isSelect': [1, 0, 0, 1],
}

In [3]:
# Turn the column dict into a DataFrame (default integer row index) and show it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect
0,Institution A,19000,22000,0.25,1,1,1
1,Institution B,11500,19000,0.45,0,1,0
2,Institution C,7750,12000,0.76,0,0,0
3,Institution D,23000,10500,0.99,1,0,1


## Hamming Distance

In [4]:
# Goal: Count the number of matching dimensions among two observations
def hamming(data, ref, cols):
    '''Counts, for every observation, how many of the comparison
    columns hold the same value as the referenced observation.
    Returns a list with one integer count per row of data, in row order.

    Note: despite the name, the value is the number of MATCHING
    columns (higher = more similar), not the number of mismatches.
    jaccard() relies on this when it computes 1 - matches / len(cols).

    data = A dataframe.
    ref  = A reference observation. A string is looked up as a label
           in data's index; anything else is used as a row position.
    cols = A list of comparison columns.'''

    # Resolve a label against the frame that was passed in, not a global.
    if isinstance(ref, str):
        ref = data.index.get_loc(ref)

    # Slice the comparison columns once instead of once per row.
    subset = data[cols]
    ref_observation = subset.iloc[ref]

    distances = []
    for row in range(len(subset)):
        comp_observation = subset.iloc[row]
        matches = sum(1 for x, y in zip(comp_observation, ref_observation)
                      if x == y)
        distances.append(matches)
    return distances

In [5]:
# Compare every institution with the one at row position 3
# across the three 0/1 flag columns.
df['Hamming'] = hamming(
    data=df,
    ref=3,
    cols=['isBig', 'isExpensive', 'isSelect'],
)
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect,Hamming
0,Institution A,19000,22000,0.25,1,1,1,2
1,Institution B,11500,19000,0.45,0,1,0,0
2,Institution C,7750,12000,0.76,0,0,0,1
3,Institution D,23000,10500,0.99,1,0,1,3


## Jaccard Index

In [6]:
# Goal: Count the columns on which two observations agree,
#       then turn that count into a distance: 1 - matches / columns
a = df.loc[:, ['isBig', 'isExpensive', 'isSelect']].iloc[3]
b = df.loc[:, ['isBig', 'isExpensive', 'isSelect']].iloc[0]

# One point for every position where the two rows hold equal values.
matches = sum(1 for x, y in zip(a, b) if x == y)
dist = 1 - (matches / len(b))

print('Number of matching observations : {}'.format(matches))
print('Jacard Index distance           : {}'.format(dist))

Number of matching observations : 2
Jacard Index distance           : 0.33333333333333337


In [9]:
def jaccard(data, ref, cols):
    '''Converts the per-row match counts from hamming() into
    distances relative to the referenced observation, computed
    as 1 - (matches / number of comparison columns). Returns a
    list with one distance per row of data.

    NOTE(review): every agreeing column counts as a match, including
    columns where both rows hold 0, so this behaves like a
    simple-matching distance -- confirm that is the intended metric.

    data = A dataframe.
    ref  = A reference observation. Specify by the axis 0 index.
    cols = A list of comparison columns.'''

    match_counts = hamming(data, ref, cols)
    n_cols = len(cols)

    return [1 - (count / n_cols) for count in match_counts]

In [10]:
# Distance of every institution from the one at row position 3,
# using the same three 0/1 flag columns as the match count.
df['Jaccard'] = jaccard(
    data=df,
    ref=3,
    cols=['isBig', 'isExpensive', 'isSelect'],
)
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect,Hamming,Jaccard
0,Institution A,19000,22000,0.25,1,1,1,2,0.333333
1,Institution B,11500,19000,0.45,0,1,0,0,1.0
2,Institution C,7750,12000,0.76,0,0,0,1,0.666667
3,Institution D,23000,10500,0.99,1,0,1,3,0.0


## Euclidean Distance

In [9]:
# Goal: Check a hand-computed Euclidean distance against scipy's result

# Sample data for test one
a = [2,10]
b = [10,10]

# Calculate from scratch
rise = a[0] - b[0]
run1 = a[1] - b[1]
scatch_dist = sqrt(rise**2 + run1**2)
print('Distance from scratch : {}'.format(scatch_dist))

# Calculate with scipy assist
assist_dist = distance.euclidean(a, b)
# Print the scipy value itself; printing scatch_dist here would make the
# two lines agree even if the two calculations did not.
print('Distance with assist : {}'.format(assist_dist))

Distance from scratch : 8.0
Distance with assist : 8.0


In [10]:
# Sample data for test two: z-scored rows 0 and 3 of the numeric columns
a = df[['Size','Cost','Accept Rt']].apply(zscore).iloc[0]
b = df[['Size','Cost','Accept Rt']].apply(zscore).iloc[3]

# Calculate from scratch
# a and b are indexed by column name, so use .iloc for positional access;
# plain integer keys on a label-indexed Series are deprecated in pandas.
rise = a.iloc[0] - b.iloc[0]
run1 = a.iloc[1] - b.iloc[1]
run2 = a.iloc[2] - b.iloc[2]
scatch_dist = sqrt(rise**2 + run1**2 + run2**2)
print('Distance from scratch : {}'.format(scatch_dist))

# Calculate with scipy assist
assist_dist = distance.euclidean(a, b)
# Print the scipy value itself so the two lines are an actual comparison.
print('Distance with assist : {}'.format(assist_dist))

Distance from scratch : 3.6118250646712147
Distance with assist : 3.6118250646712147


In [11]:
def euclidian(data, ref, cols):
    '''Calculates Euclidian distance for all observations
    relative to the referenced observation. Each comparison column
    is standardised with scipy's zscore before distances are taken.
    Returns a list with one distance per row of data, in row order.

    data = A dataframe.
    ref  = A reference observation. A string is looked up as a label
           in data's index; anything else is used as a row position.
    cols = A list of numeric comparison columns.'''

    # Resolve a label against the frame that was passed in, not a global.
    if isinstance(ref, str):
        ref = data.index.get_loc(ref)

    # Standardise the comparison columns once; doing it inside the loop
    # recomputed the same z-scores for every row.
    standardized = data[cols].apply(zscore)
    ref_observation = standardized.iloc[ref]

    distances = []
    for row in range(len(standardized)):
        comp_observation = standardized.iloc[row]
        distances.append(distance.euclidean(ref_observation, comp_observation))
    return distances

In [12]:
# Standardised Euclidean distance of every institution from the one
# at row position 3, over the three numeric columns.
df['Euclidians'] = euclidian(
    data=df,
    ref=3,
    cols=['Size', 'Cost', 'Accept Rt'],
)
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect,Jaccard,Hamming,Euclidians
0,Institution A,19000,22000,0.25,1,1,1,0.333333,2,3.611825
1,Institution B,11500,19000,0.45,0,1,0,1.0,0,3.233217
2,Institution C,7750,12000,0.76,0,0,0,0.666667,1,2.682701
3,Institution D,23000,10500,0.99,1,0,1,0.0,3,0.0


# More Categorical Examples

Demonstrate that this implementation does not require one-hot encoding: the match-based functions compare values with `==`, so they can operate directly on text categoricals.

In [13]:
# Start again from the raw column dict, without the distance
# columns added to the previous frame.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect
0,Institution A,19000,22000,0.25,1,1,1
1,Institution B,11500,19000,0.45,0,1,0
2,Institution C,7750,12000,0.76,0,0,0
3,Institution D,23000,10500,0.99,1,0,1


In [14]:
# Two text-valued categorical columns; one entry per row, in row order.
df['Urban'] = [
    'Urban', 'Urban', 'Rural', 'Rural',
]
df['Research'] = [
    'Low', 'Low', 'High', 'High',
]
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect,Urban,Research
0,Institution A,19000,22000,0.25,1,1,1,Urban,Low
1,Institution B,11500,19000,0.45,0,1,0,Urban,Low
2,Institution C,7750,12000,0.76,0,0,0,Rural,High
3,Institution D,23000,10500,0.99,1,0,1,Rural,High


In [15]:
# Same distance as before, now mixing the 0/1 flags with the
# two text-valued columns.
df['Jaccard2'] = jaccard(
    data=df,
    ref=3,
    cols=['isBig', 'isExpensive', 'isSelect', 'Urban', 'Research'],
)
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect,Urban,Research,Jaccard2
0,Institution A,19000,22000,0.25,1,1,1,Urban,Low,0.6
1,Institution B,11500,19000,0.45,0,1,0,Urban,Low,1.0
2,Institution C,7750,12000,0.76,0,0,0,Rural,High,0.4
3,Institution D,23000,10500,0.99,1,0,1,Rural,High,0.0


In [16]:
# Raw match counts over the same five columns (flags plus text values).
df['hamming2'] = hamming(
    data=df,
    ref=3,
    cols=['isBig', 'isExpensive', 'isSelect', 'Urban', 'Research'],
)
df

Unnamed: 0,Inst,Size,Cost,Accept Rt,isBig,isExpensive,isSelect,Urban,Research,Jaccard2,hamming2
0,Institution A,19000,22000,0.25,1,1,1,Urban,Low,0.6,2
1,Institution B,11500,19000,0.45,0,1,0,Urban,Low,1.0,0
2,Institution C,7750,12000,0.76,0,0,0,Rural,High,0.4,3
3,Institution D,23000,10500,0.99,1,0,1,Rural,High,0.0,5


# Update Index to tinker with more readable references

In [17]:
# Rebuild the frame and use the institution name as the row index,
# so rows can be referenced by label.
df = pd.DataFrame(data).set_index('Inst')
df

Unnamed: 0_level_0,Size,Cost,Accept Rt,isBig,isExpensive,isSelect
Inst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Institution A,19000,22000,0.25,1,1,1
Institution B,11500,19000,0.45,0,1,0
Institution C,7750,12000,0.76,0,0,0
Institution D,23000,10500,0.99,1,0,1


In [18]:
# Same distance calculation, with the reference row chosen by its
# index label rather than its position.
df['Euclidians2'] = euclidian(
    data=df,
    ref='Institution D',
    cols=['Size', 'Cost', 'Accept Rt'],
)
df

Unnamed: 0_level_0,Size,Cost,Accept Rt,isBig,isExpensive,isSelect,Euclidians2
Inst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Institution A,19000,22000,0.25,1,1,1,3.611825
Institution B,11500,19000,0.45,0,1,0,3.233217
Institution C,7750,12000,0.76,0,0,0,2.682701
Institution D,23000,10500,0.99,1,0,1,0.0
