# Profiling and optimisation with Python

Also see: http://paris-swc.github.io/python-testing-debugging-profiling/

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Measuring total run times

In [None]:
from python_syntax.profiling import kmeans

In [None]:
def test_kmeans_two_gaussians(points, plot=False):
    """
    Run `kmeans.evaluate` on samples drawn from two 2-d Gaussians.

    The first Gaussian (mean [-2, -2], identity covariance) contributes
    `int(2*points)` samples labelled 0; the second (mean [1, 1], diagonal
    covariance with entries 0.5 and 2) contributes `int(1*points)` samples
    labelled 1. `points` therefore scales the total size of the data set.

    Set `plot` to True to have the evaluation plot the data points and
    clusters (not advisable for large data sets).

    Returns the value produced by `kmeans.evaluate` (presumably a measure of
    how well the clustering matches the true labels -- see `kmeans.evaluate`).
    """
    n_first = int(2*points)
    n_second = int(1*points)

    # Draw the larger cluster first, then the smaller one.
    first_cluster = np.random.multivariate_normal(
        [-2, -2], [[1, 0], [0, 1]], size=n_first)
    second_cluster = np.random.multivariate_normal(
        [1, 1], [[0.5, 0], [0, 2]], size=n_second)
    samples = np.vstack([first_cluster, second_cluster])

    # Ground-truth labels: 0 for the first Gaussian, 1 for the second.
    true_labels = np.concatenate([np.zeros(n_first), np.ones(n_second)])

    # Fixed starting centres handed to the k-means evaluation.
    start_centres = np.array([[-1, 0], [1, 0]])

    return kmeans.evaluate(samples, true_labels, start_centres, plot=plot)

## Measuring detailed run times

### Warning: don't get lost in micro-optimisations

In [None]:
def is_prime(x):
    """
    Return True if the integer `x` is a prime number, False otherwise.

    Primality is checked by trial division: `x` is prime if it is at least 2
    and no integer `y` with 2 <= y < x divides it evenly.

    `x` may be a Python int or a NumPy integer scalar.
    """
    # 0, 1 and negative numbers are not prime; without this guard the empty
    # range below would wrongly report them as prime.
    if x < 2:
        return False
    # all() stops at the first divisor instead of scanning the whole range.
    return all(x % y != 0 for y in range(2, x))

def test_primes(low, high, n):
    """
    Call `is_prime` on `n` random integers drawn via
    `np.random.randint(low, high, size=n)`.

    The individual results are discarded; the function exists only so that
    the time spent in `is_prime` can be measured.
    """
    candidates = np.random.randint(low, high, size=n)
    for candidate in candidates:
        # Result intentionally ignored -- only the run time is of interest.
        is_prime(candidate)

## Parallelization with multiple processes

**"Embarrassingly parallel"** problem: a problem that can be trivially separated into independent tasks

Typical examples:
  * Running the same calculation/analysis on different data sets
  * Running the same non-deterministic simulation several times
  * Parameter explorations