# Outline
 Benchmarking different approaches will happeb here. Profiling might still happen in the main simulations notebook.

In [1]:
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import ipdb
import ipywidgets as widgets
from IPython.display import display
from tqdm import tqdm
import matplotlib.path as path
import matplotlib.patches as patches
import matplotlib.animation as animation
from IPython.display import HTML
from line_profiler import LineProfiler
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import euclidean_distances


# 1D benchmarking

In [4]:
def distance_multiplier(df, r1, r2, thresh, power):
    """
    returns a distance multiplier based on two individuals, to be smacked 
    onto the transmission rate, that is always less than 1. 
    
    if the metric distance (defined within) is below thresh, multiplier of 1.
    Anything greater falls off as distance to the power law. Power is some law greater than 0.
    
    Inputs:
        df : (pandas DataFrame) object holding all values of infected people. Each
                            column of "infected day _" corresponds to a different day, 
                            with "_" being some integer or float. The "name" column
                            assigns a name to each object, independent of index. In
                            the infected columns, a 0 counts as infected, while a 1 is 
                            healthy.
        r1 : (float) position of first point.
        r2 : (float) position of second point.
        thresh : (float) distance less than which infection is transmitted at the trans_rate;
                            that is, less than which this function returns a value of 1. At
                            a distance greater than this, this function returns 1/distance^power.
        power : (float) Greater than 0. Power to which the multiplier falls off if the distance
                            is greater than some threshold.
    
    Outputs:
        multiplier : (float) suppresses the rate of transmission.
    """
    def metric_distance(r1, r2):
        """
        Returns distance between two points.
        
        Inputs:
            r1 : (float) position of first point.
            r2 : (float) position of second point.
        
        Outputs:
            dist : (float) distance between the two points.
            
        """
        if type(r1) != float:
            raise AssertionError("r1 must be a float.")
        if type(r2) != float and type(r2) != np.float64 and type(r2) != np.float32:
            raise AssertionError("r2 must be a float.")
        
        dist = abs(r1 - r2)
        return dist
    # first check input types
    if type(thresh) != int and type(thresh) != float:
        raise AssertionError("wrong type for thresh.")
    if type(df) != pd.core.frame.DataFrame:
        raise AssertionError("df must be a pandas DataFrame.")
    if type(r1) != float:
        raise AssertionError("r1 must be a float.")

    if type(r2) != float and type(r2) != np.float64 and type(r2) != np.float32:
        raise AssertionError("r2 must be a float.")
    if type(power) != float and type(power) != int:
        raise AssertionError("power must be a float or integer.")
    
        
    # first check input values
    if thresh <= 0:
        raise AssertionError("thresh must be positive.")
    if power <= 0:
        raise AssertionError("power must be positive.")
        
    dist = metric_distance(r1, r2)
    if dist < thresh:
        return 1.
    else:
        return 1/pow(dist, power)

In [5]:
N = 10000
name = np.arange(N)
distrib_pop = np.random.uniform
kwargs_for_pop = {'low':-10, 'high':10}
locs = distrib_pop(size=N, **kwargs_for_pop)
zero_infected = np.ones(N)
d = {'name': np.arange(N), 'infected day 0': zero_infected, 'locs' : locs} 

df1D_test = pd.DataFrame(data=d)

In [6]:
%%timeit
power = 2
thresh = 2

r2 = df1D_test['locs'][1]
df1D_test['locs'].apply(lambda x:distance_multiplier(df1D_test, x, r2, thresh, power)) 

115 ms ± 8.58 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit

# this is certainly around 25% faster
power = 3
thresh = 2
r2 = df1D_test['locs'][1]
df1D_test['subs'] = df1D_test['locs'].apply(lambda x:abs(x - r2))
df1D_test['metrics'] = df1D_test['subs'].apply(lambda x:1/pow(x, power) if x > thresh else 1)

# 2D benchmarking

## 2D distance metric

In [17]:
N = 10000
name = np.arange(N)
distrib_pop = np.random.uniform
kwargs_for_pop = {'low':-10, 'high':10}
y = distrib_pop(size=N, **kwargs_for_pop)
x = distrib_pop(size=N, **kwargs_for_pop)
test = [[1,2],[2,3]]
zero_infected = np.ones(N)
d = {'name': np.arange(N), 'infected day 0': zero_infected, 'x' : x, 'y' : y} 

df2D_test = pd.DataFrame(data=d)

In [8]:
%%timeit
euclidean_distances(df2D_test[['x', 'y']], df2D_test[['x', 'y']])

2.5 s ± 290 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
a = euclidean_distances(df2D_test[['x', 'y']], df2D_test[['x', 'y']])
a[1]

array([17.04329012,  0.        , 12.46071423, ...,  2.31653702,
       15.47031206, 10.59469593])

In [10]:
%%timeit
euclidean_distances([df2D_test['x'], df2D_test['y']], [df2D_test['x'], df2D_test['y']])

39 ms ± 8.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
a = euclidean_distances([df2D_test['x'], df2D_test['y']], [df2D_test['x'], df2D_test['y']])
a[1]

array([809.84252975,   0.        ])

In [12]:
%%timeit
euclidean_distances([df2D_test['x'].values, df2D_test['y'].values], [df2D_test['x'].values, df2D_test['y'].values])

962 µs ± 218 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
a = euclidean_distances([df2D_test['x'].values, df2D_test['y'].values], [df2D_test['x'].values, df2D_test['y'].values])
a[1]

array([809.84252975,   0.        ])

In [14]:
%%timeit
distances = pdist([df2D_test['x'].values, df2D_test['y'].values], metric='euclidean')
dist_matrix = squareform(distances)

184 µs ± 32.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [15]:
distances = pdist([df2D_test['x'].values, df2D_test['y'].values], metric='euclidean')
a = squareform(distances)
a[1]

array([809.84252975,   0.        ])

In [18]:
%%timeit
distances = pdist(np.concatenate((test, df2D_test[["x", "y"]].values)), metric='euclidean')
dist = squareform(distances)

3.17 s ± 398 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
distances = pdist(np.concatenate((test, df2D_test[["x", "y"]].values)), metric='euclidean')
a = squareform(distances)
a[1]

## Testing subtraction stuff

In [19]:
%%timeit
df2D_test['x'] - 1

502 µs ± 92.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
%%timeit
df2D_test['x'].sub(1)

544 µs ± 70.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
%%timeit
df2D_test['x']**2

561 µs ± 71.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
%%timeit
np.square(df2D_test['x'])

362 µs ± 98.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
%%timeit
np.power(df2D_test['x'], 2)

537 µs ± 110 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
%%timeit
np.power(df2D_test['x'].values, 2)

695 µs ± 37.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## testing cKTTree
- don't calculate all the distances
- cut off within a certain region

In [25]:
from scipy.spatial import cKDTree
tree = cKDTree(df2D_test[["x", "y"]])
pairs = tree.query_pairs(5, p=2)   # 50km radius, L2 (Euclidean) norm