diff --git a/.gitignore b/.gitignore index 09a7ab178..5e1e79300 100644 --- a/.gitignore +++ b/.gitignore @@ -478,7 +478,7 @@ paket-files/ # Python Tools for Visual Studio (PTVS) __pycache__/ *.pyc - +*.ipynb_checkpoints* # Cake - Uncomment if you are using it # tools/** # !tools/packages.config diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index fc3113ecc..ca16b50f9 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -61,4 +61,5 @@ This file lists everyone, who contributed to this repo and wanted to show up her - Hugo Salou - Dimitri Belopopsky - Henrik Abel Christensen +- K. Shudipto Amin - Peanutbutter_Warrior diff --git a/SUMMARY.md b/SUMMARY.md index 4d4597b1c..644fccc56 100644 --- a/SUMMARY.md +++ b/SUMMARY.md @@ -20,9 +20,11 @@ * [Multiplication as a Convolution](contents/convolutions/multiplication/multiplication.md) * [Convolutions of Images (2D)](contents/convolutions/2d/2d.md) * [Convolutional Theorem](contents/convolutions/convolutional_theorem/convolutional_theorem.md) + * [Probability Distributions](contents/probability/distributions/distributions.md) * [Tree Traversal](contents/tree_traversal/tree_traversal.md) * [Euclidean Algorithm](contents/euclidean_algorithm/euclidean_algorithm.md) * [Monte Carlo](contents/monte_carlo_integration/monte_carlo_integration.md) + * [Metropolis](contents/metropolis/metropolis.md) * [Matrix Methods](contents/matrix_methods/matrix_methods.md) * [Gaussian Elimination](contents/gaussian_elimination/gaussian_elimination.md) * [Thomas Algorithm](contents/thomas_algorithm/thomas_algorithm.md) diff --git a/contents/metropolis/code/python/metropolis.py b/contents/metropolis/code/python/metropolis.py new file mode 100644 index 000000000..a4fc29794 --- /dev/null +++ b/contents/metropolis/code/python/metropolis.py @@ -0,0 +1,90 @@ +import numpy as np + + +def f(x, normalize=False): + ''' + Function proportional to target distribution, a sum of Gaussians. + For testing, set normalize to True, to get target distribution exactly. 
+ ''' + # Gaussian heights, width parameters, and mean positions respectively: + a = np.array([10., 3., 1.]).reshape(3, 1) + b = np.array([ 4., 0.2, 2.]).reshape(3, 1) + xs = np.array([-4., -1., 5.]).reshape(3, 1) + + if normalize: + norm = (np.sqrt(np.pi) * (a / np.sqrt(b))).sum() + a /= norm + + return (a * np.exp(-b * (x - xs)**2)).sum(axis=0) + +def g(): + '''Random step vector.''' + return np.random.uniform(-1,1) + +def metropolis_step(x, f=f, g=g): + '''Perform one full iteration and return new position.''' + + x_proposed = x + g() + a = min(1, (f(x_proposed) / f(x)).item()) + + x_new = np.random.choice([x_proposed, x], p=[a, 1-a]) + + return x_new + +def metropolis_iterate(x0, num_steps): + '''Iterate metropolis algorithm for num_steps using iniital position x_0''' + + for n in range(num_steps): + if n == 0: + x = x0 + else: + x = metropolis_step(x) + yield x + + +def test_metropolis_iterate(num_steps, xmin, xmax, x0): + ''' + Calculate error in normalized density histogram of data + generated by metropolis_iterate() by using + normalized-root-mean-square-deviation metric. 
+ ''' + + bin_width = 0.25 + bins = np.arange(xmin, xmax + bin_width/2, bin_width) + centers = np.arange(xmin + bin_width/2, xmax, bin_width) + + true_values = f(centers, normalize=True) + mean_value = np.mean(true_values - min(true_values)) + + x_dat = list(metropolis_iterate(x0, num_steps)) + heights, _ = np.histogram(x_dat, bins=bins, density=True) + + nmsd = np.average((heights - true_values)**2 / mean_value) + nrmsd = np.sqrt(nmsd) + + return nrmsd + + + +if __name__ == "__main__": + xmin, xmax = -10, 10 + x0 = np.random.uniform(xmin, xmax) + + num_steps = 50_000 + + x_dat = list(metropolis_iterate(x0, 50_000)) + + # Write data to file + output_string = "\n".join(str(x) for x in x_dat) + + with open("output.dat", "w") as out: + out.write(output_string) + out.write("\n") + + + # Testing + print(f"Testing with x0 = {x0:5.2f}") + print(f"{'num_steps':>10s} {'NRMSD':10s}") + for num_steps in (500, 5_000, 50_000): + nrmsd = test_metropolis_iterate(num_steps, xmin, xmax, x0) + print(f"{num_steps:10d} {nrmsd:5.1%}") diff --git a/contents/metropolis/metropolis.md b/contents/metropolis/metropolis.md new file mode 100644 index 000000000..961f1b14d --- /dev/null +++ b/contents/metropolis/metropolis.md @@ -0,0 +1,283 @@ +# The Metropolis Algorithm + +The [Monte Carlo Integration](../monte_carlo_integration/monte_carlo_integration.html) method uses random numbers to approximate the area of pretty much any shape we choose. +The Metropolis algorithm {{ "metropolis1953equation" | cite }} is a slightly more advanced Monte Carlo method which uses random numbers to approximate a [probability distribution](../probability/distributions/distributions.md): + +$$ +P(\mathbf{x}) = \frac{f(\mathbf{x})}{\displaystyle\int_D f(\mathbf{x})d\mathbf{x}}, +$$ + +where $$D$$ is the domain of $$P(\mathbf{x})$$, i.e., all possible values of the $$\mathbf{x}$$ for which $$P(\mathbf{x})$$ is defined. 
+$$f(\mathbf{x})$$ is a function that is proportional to $$P(x)$$, such as a statistical frequency distribution which counts the number of occurrences of each $$\mathbf{x}$$.
+The integral in the denominator is the __normalization factor__ which ensures that the sum of all probabilities is unity, i.e.,
+$$
+\int_D P(\mathbf{x}) d\mathbf{x} = 1.
+$$
+A one-dimensional example is the __normal distribution__, or __Gaussian distribution__, given by
+
+$$
+P(x) = \frac{e^{-x^2}}{\displaystyle\int_{-\infty}^{\infty} e^{-x^2} dx} = \frac{1}{\sqrt{\pi}} e^{-x^2}.
+$$
+
+
+In practice, it's often easy for us to know $$f(x)$$, but the integral in the denominator can be quite difficult to calculate, even numerically.
+This is especially true when the coordinates ($$\mathbf{x}$$) are multidimensional, and $$f(\mathbf{x})$$ is an expensive calculation, as we shall see in the examples below.
+
+## An example application
+
+One example of a complicated probability function arises when considering a physical system of $$N$$ particles.
+These could be atoms, molecules, or even star systems!
+For such systems, we can usually describe the __potential energy__ {{ "potential_energy_wiki" | cite }} of the system as a function of the coordinates of all particles, $$\mathbf{x}$$,
+
+$$
+E(\mathbf{x}) = E(x_1, y_1, z_1, x_2, y_2, z_2, ... ,x_N, y_N, z_N),
+$$
+
+where $$x_i, y_i, z_i$$ are the spatial coordinates of particle $$i$$.
+So altogether there are $$3N$$ coordinates – making $$E(\mathbf{x})$$ a $$3N$$ dimensional function, which can be a computationally intensive calculation on its own. But it doesn't end there!
+ +The physicist Ludwig Boltzmann {{ "ludwig_boltzmann_wiki" | cite }} discovered that when such a system is in equilibrium at some temperature $$T$$, you can describe the probability density of the system for any set of coordinates $$\mathbf{x}$$ using, {{ "boltzmann_distribution_wiki" | cite }} + +$$ +P(\mathbf{x}) = \frac{\displaystyle \exp\left[{\displaystyle\frac{-E(\mathbf{x})}{T} } \right]} {Q}, +$$ + +where the numerator is called the __Boltzmann factor__, and $$Q$$ is the [normalization constant](../probability/distributions/distributions.md), + +$$ +Q = \int_D \exp\left[{\displaystyle\frac{-E(\mathbf{x})}{T} } \right] d\mathbf{x}. +$$ + +We can see now that the probability density function is a difficult calculation, particularly because of $$Q$$. +Almost always, no analytical solution exists to the integral in $$Q$$, and the numerical integration is unfeasible. + +To see that $$Q$$ is unfeasible to calculate, imagine there are just $$10$$ particles which all exist in a $$1$$D world, restricted to a line segment. + +

+ <FIG> 1D particles +

+ +Let's assume that the particles _interact_, meaning that the position of one particle affects that of another. +This could be the case, for example, if all the particles were charged, and so they would be repelling or attracting each other. +This means that the energy $$E(\mathbf{x}) = E(x_1,...,x_{10})$$ of the system is a $$10$$D function, and it would not be possible to simplify it any further due to the interactions. +Thus, the Boltzmann factor, $$\exp\left[-E(\mathbf{x})/T\right]$$, is also a $$10$$D function. To calculate $$Q$$, we would have to integrate the Boltzmann factor $$10$$ times, one for each coordinate, + +$$ +Q = \int_{x_1} \dots \int_{x_{10}} \exp\left[\frac{-E(x_1,\dots x_{10})}{T}\right]\ dx_1\dots dx_{10}. +$$ + +In most cases, there is no known analytical expression for the above integral, so it has to be done numerically. +To do so, imagine that we divide the $$1$$D line segment into only $$50$$ different intervals, allowing each particle to take on $$50$$ different positions. +This is equivalent to dividing the length of a football field into intervals of about $$2$$ meters – not a resolution you'd wanna watch a game in! +Even with such poor resolution, the number of different combinations of positions is $$10^{50}$$ – a colossal number indeed. +To see how large this number is, imagine that a single computation of $$E(\mathbf{x})$$ took only $$1$$ nanosecond on a single processor, which is much faster than most energy calculations for physical systems in practice. + With that speed, it would require $$10^{41}$$ seconds on a single processor to calculate $$Q$$ – which means that _even_ with all the processors in the world running in parallel (there could be billions or trillions of them), calculating $$Q$$ would still take longer than the age of the universe – by many orders of magnitude! + +What's really powerful about the Metropolis approach is that you don't need to know the probability function itself. 
+Instead, you just need a function which is _proportional_ to it.
+What this means for the Boltzmann distribution is that you only need to know the term,
+
+$$
+f(\mathbf{x}) = \exp\left[{\displaystyle\frac{-E(\mathbf{x})}{T} } \right].
+$$
+
+The Metropolis algorithm can bypass the calculation of $$Q$$ altogether and use $$f(x)$$ to generate a distribution of $$x$$ which follows the probability density $$P(x)$$.
+In other words, it can sample values of $$x$$ in such a way that the probability of sampling $$x$$ will follow the actual distribution $$P(x)$$.
+Thus, if Metropolis was used to sample from $$x$$, the number of occurrences of $$x$$ would be proportional to $$P(x)$$.
+Numerical normalization can then be done by using the total number of samples instead of performing an integration.
+This fact dramatically reduces the number of calculations needed to approximate the probability distribution.
+
+Finally, the Metropolis algorithm can be modified or implemented in other methods, and forms the basis of many advanced sampling algorithms.
+The most popular is probably the Metropolis-Hastings algorithm {{ "hastings1970monte" | cite }} which is fundamentally the same.
+Some other algorithms that use this method are Metropolis-adjusted Langevin algorithm {{ "mala_wiki" | cite }}, and Hamiltonian Monte Carlo {{ "hmc_wiki" | cite }}, to name a few.
+They are often used for physical systems that follow a Boltzmann distribution.
+
+
+## A Random Walk in One Dimension
+
+In the rest of this chapter, we will look at $$1$$D examples to understand the Metropolis algorithm.
+Although the algorithm is not particularly efficient in just one dimension, it is much easier to understand in one dimension than in multiple dimensions.
+The Metropolis algorithm is very similar to a random walk, so let's first see how we can get a distribution from a random walk.
+
+ +
+ +The dot in the figure above is a "walker", whose initial position is $$x=0$$. +The step size, $$g$$, is a random number in the interval $$(-1, 1)$$. +To get the next position of the walker, we simply generate $$g$$ and add it to the current position. +To get a distribution of $$x$$ from this walk, we can divide the domain into discrete locations or "bins" and count how often the walker visits each bin. +Each time it visits a bin, the frequency for that bin goes up by one. +Over many iterations, we get a frequency distribution of $$x$$. + +## A Random Walk With an Acceptance Criterion + +The Metropolis algorithm works in a similar way to the random walk, but differs crucially in one way – after choosing a random step for the walker, a decision is made about whether to __accept__ or __reject__ the step based on the function $$f(x)$$. +To understand how this works, let's call $$x_t$$ the position before the step, and $$x'$$ the position after it. +We then define the probability of __accepting the step__ to be + +$$ +A = \min \left(\frac{f(x')}{f(x_t)}, 1\right). +$$ + +The $$\min$$ function above implies that $$A=1$$ if $$f(x') \gt f(x_t)$$, which means that the move will __always__ be accepted if it is toward a higher probability position. +Otherwise, it will be accepted with a probability of $$f(x') / f(x_t)$$. +If we create a histogram of this walk for some arbitrary target function $$P(x)$$, we can see from the figure below that the frequency starts to look very much like it after many iterations! + +
+ +
+ +Although convergence occurs eventually, not all parts of the distribution achieve convergence quickly. +Note from the animation above, that the walker very quickly replicates the distribution of the two peaks on the left, but takes quite a while to even reach the third peak to the right. +This is because there is a long low probability region between the third peak and second peak that acts as a "barrier." +This may not necessarily be a bad thing – sometimes one might want to estimate how long something takes to transition from one state to another, and often these peaks represent such 'states'. +So averaging over many metropolis runs may give some estimate of these transition times. +If global sampling is the goal, the process of exploration could be sped up by choosing larger step sizes for the walker, for example by choosing step size $$g$$ from an interval like $$(-3,3)$$ instead of $$(-1,1)$$. + + +## The Algorithm for a One Dimensional Example + +Now let's dive into the actual algorithm with some example code! + +### The Initial Setup + +Let our target distribution be +$$ +P(x) = \frac{f(x)}{\int_{-10}^{10} f(x)}, +$$ + +where $$f(x)$$ is the same function we have shown above and is given by +$$ +f(x) = 10e^{-4(x+4)^2} + 3e^{-0.2(x+1)^2} + e^{-2(x-5)^2}. +$$ + +The code for defining this function is given below. + +{% method %} +{% sample lang="py" %} +[import:4-18, lang:"python"](code/python/metropolis.py) +{% endmethod %} + +Since this is an easy function to integrate, and hence get our target distribution $$P(x)$$ directly, we can use it to verify the Metropolis algorithm. +The plot of $$P(x)$$ in the figure below shows three different peaks of varying width and height, with some overlap as well. + +

+ <FIG> Plot of P(x) +

+ +Next, we define our walker's symmetric step generating function. +As in the random walk example, we will use a random real number between $$-1$$ and $$+1$$ as the step size. + +{% method %} +{% sample lang="py" %} +[import:20-22, lang:"python"](code/python/metropolis.py) +{% endmethod %} + +However, $$g$$ can be any function symmetric about $$0$$ for the above algorithm to work. +For example, it can be a number chosen randomly from a discrete list, such as $$[ -3, -1, -1, +1, +1, +3]$$. +It can also be a number chosen from a symmetric continuos distribution, like the Gaussian, $$e^{-x^2}$$. +In higher dimensions, the function should be spherically symmetric, such as a multidimensional Gaussian function, $$e^{-(x^2 +y^2 + ...)}$$. +Whatever function you choose, there are at least a couple of things to note: +1. If the function $$g$$ is discrete, you will only sample discrete values. +For example, if $$g$$ returns only $$-1$$ or $$+1$$, and nothing in between, you will sample only integer steps away from the initial $$x_0$$. +2. The average step size really matters! +A small step-size means the walker will carefully sample nearby regions more, but will walk more slowly, so might not be good at exploring far and wide. +On the other hand, a walker with a large step size may not sample nearby regions accurately – and actually has a higher chance of being rejected if the walker is already in a high probability region, since the acceptance ratio is more drastic for large steps. +The effect of step-size on the walker's efficiency is far from obvious! + +The question of how to choose an optimal $$g$$ is a research area on its own, and depends largely on what the goal of the sampling is. +Some techniques even use an "adaptive" method where $$g$$ is "trained" on-the-fly using a learning algorithm! +Some of these methods and others are discussed in Ref. {{ "rosenthal2011optimal" | cite }} and Ref. {{ "gareth2001optimal" | cite }}. 
+In a lot of cases, people just use trial and error, as the algorithm is not too difficult to implement.
+
+After choosing $$g$$, we are almost ready to iterate.
+We just need to choose the domain of $$x$$, and an initial point for $$ x_0 $$ ($$x_t$$ at $$t = 0$$) chosen randomly from the domain of $$x$$.
+
+{% method %}
+{% sample lang="py" %}
+[import:70-71, lang:"python"](code/python/metropolis.py)
+{% endmethod %}
+
+### How to Iterate
+
+1. Generate new proposed position $$x' = x_t + g$$.
+2. Calculate the acceptance probability,
+$$
+A = \min\left(1, \frac{f(x')}{f(x_t)}\right).
+$$
+3. Accept proposal, $$x'$$ with probability $$A$$. If your programming language doesn't have a built-in method for this,
+    * Generate a random number $$u$$ between $$0$$ and $$1$$.
+    * If $$ u \leq A $$, then __accept__ move, and set new position, $$x_{t+1} = x' $$.
+    * Otherwise, __reject__ move, and set new position to current position, $$x_{t+1} = x_t $$.
+4. Increment $$t \rightarrow t + 1$$ and repeat from step 1.
+
+The code for steps 1 to 3 is:
+
+{% method %}
+{% sample lang="py" %}
+[import:34-42, lang:"python"](code/python/metropolis.py)
+{% endmethod %}
+
+The following plot shows the result of running the algorithm for different numbers of iterations ($$N$$), with the same initial position.
+The histograms are normalized so that they integrate to $$1$$.
+We can see the convergence toward $$P(x)$$ as we increase $$N$$.
+

+ <FIG> multiple histograms +

+ + +## Example Code +The following code puts everything together, and runs the Metropolis algorithm for a number of steps given by `num_steps`. +All the positions visited by the algorithm are then written to a file, which can be later read and fed into a histogram or other density calculating scheme. +The code also incorporates a few tests of the algorithm using the `test_metropolis_iterate` method. +This test will create a normalized density histogram from the generated data, and compare it to $$P(x)$$ using the Root Mean Square Deviations metric {{ "rmsd_wiki" | cite }}. + +{% method %} +{% sample lang="py" %} +[import, lang:"python"](code/python/metropolis.py) +{% endmethod %} + + + + + +### Bibliography + +{% references %} {% endreferences %} + +## License + +##### Code Examples + +The code examples are licensed under the MIT license (found in [LICENSE.md](https://github.com/algorithm-archivists/algorithm-archive/blob/master/LICENSE.md)). + + +##### Images/Graphics +- The animation "[Animated Random Walk](res/animated_random_walk.gif)" was created by [K. Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +- The animation "[Animated Metropolis](res/animated_metropolis.gif)" was created by [K. Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +- The image "[Plot of P(x)](res/plot_of_P.png)" was created by [K. Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +- The image "[Multiple Histograms](res/multiple_histograms.png)" was created by [K. 
Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +##### Text + +The text of this chapter was written by [K. Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +[

](https://creativecommons.org/licenses/by-sa/4.0/) + +##### Pull Requests + +After initial licensing ([#560](https://github.com/algorithm-archivists/algorithm-archive/pull/560)), the following pull requests have modified the text or graphics of this chapter: +- none diff --git a/contents/metropolis/res/1D_particles.png b/contents/metropolis/res/1D_particles.png new file mode 100644 index 000000000..3d8ab7d99 Binary files /dev/null and b/contents/metropolis/res/1D_particles.png differ diff --git a/contents/metropolis/res/animated_metropolis.gif b/contents/metropolis/res/animated_metropolis.gif new file mode 100644 index 000000000..93356bb7e Binary files /dev/null and b/contents/metropolis/res/animated_metropolis.gif differ diff --git a/contents/metropolis/res/animated_metropolis.mp4 b/contents/metropolis/res/animated_metropolis.mp4 new file mode 100644 index 000000000..68bfc36ec Binary files /dev/null and b/contents/metropolis/res/animated_metropolis.mp4 differ diff --git a/contents/metropolis/res/animated_random_walk.gif b/contents/metropolis/res/animated_random_walk.gif new file mode 100644 index 000000000..f80bc11b0 Binary files /dev/null and b/contents/metropolis/res/animated_random_walk.gif differ diff --git a/contents/metropolis/res/animated_random_walk.mp4 b/contents/metropolis/res/animated_random_walk.mp4 new file mode 100644 index 000000000..d7825ec3c Binary files /dev/null and b/contents/metropolis/res/animated_random_walk.mp4 differ diff --git a/contents/metropolis/res/multiple_histograms.png b/contents/metropolis/res/multiple_histograms.png new file mode 100644 index 000000000..9180253bc Binary files /dev/null and b/contents/metropolis/res/multiple_histograms.png differ diff --git a/contents/metropolis/res/plot_of_P.png b/contents/metropolis/res/plot_of_P.png new file mode 100644 index 000000000..3e4f3d10b Binary files /dev/null and b/contents/metropolis/res/plot_of_P.png differ diff --git a/contents/probability/distributions/distributions.md 
b/contents/probability/distributions/distributions.md new file mode 100644 index 000000000..aba490e42 --- /dev/null +++ b/contents/probability/distributions/distributions.md @@ -0,0 +1,214 @@ +# What's a probability distribution? + +Probability distributions are mathematical functions that give the probabilities of a range or set of outcomes. +These outcomes can be the result of an experiment or procedure, such as tossing a coin or rolling dice. +They can also be the result of a physical measurement, such as measuring the temperature of an object, counting how many electrons are spin up, etc. +Broadly speaking, we can classify probability distributions into two categories - __discrete probability distributions__ and __continuous probability distributions__. + +## Discrete Probability Distributions + +It's intuitive for us to understand what a __discrete__ probability distribution is. +For example, we understand the outcomes of a coin toss very well, and also that of a dice roll. +For a single coin toss, we know that the probability of getting heads $$(H)$$ is half, or $$P(H) = \frac{1}{2}$$. +Similarly, the probability of getting tails $$(T)$$ is $$P(T) = \frac{1}{2}$$. +Formally, we can write the probability distribution for such a coin toss as, + +$$ +P(n) = \begin{matrix} + \displaystyle \frac 1 2 &;& n \in \left\{H,T\right\}. + \end{matrix} +$$ + +Here, $$n$$ denotes the outcome, and we used the "set notation", $$n \in\left\{H,T\right\}$$, which means "$$n$$ belongs to a set containing $$H$$ and $$T$$". +From the above equation, we can also assume that any other outcome for $$n$$ (such as landing on an edge) is incredibly unlikely, impossible, or simply "not allowed" (for example, just toss again if it _does_ land on its edge!). + +For a probability distribution, it's important to take note of the set of possibilities, or the __domain__ of the distribution. 
+Here, $$\left\{H,T\right\}$$ is the domain of $$P(n)$$, telling us that $$n$$ can only be either $$H$$ or $$T$$. + +If we use a different system, the outcome $$n$$ could mean other things. +For example, it could be a number like the outcome of a __die roll__ which has the probability distribution, + + +$$ +P(n) = \begin{matrix} + \displaystyle\frac 1 6 &;& n \in [\![1,6]\!] + \end{matrix}. +$$ +This is saying that the probability of $$n$$ being a whole number between $$1$$ and $$6$$ is $$1/6$$, and we assume that the probability of getting any other $$n$$ is $$0$$. +This is a discrete probability function because $$n$$ is an integer, and thus only takes discrete values. + +Both of the above examples are rather boring, because the value of $$P(n)$$ is the same for all $$n$$. +An example of a discrete probability function where the probability actually depends on $$n$$, is when $$n$$ is the sum of numbers on a __roll of two dice__. +In this case, $$P(n)$$ is different for each $$n$$ as some possibilities like $$n=2$$ can happen in only one possible way (by getting a $$1$$ on both dice), whereas $$n=4$$ can happen in $$3$$ ways ($$1$$ and $$3$$; or $$2$$ and $$2$$; or $$3$$ and $$1$$). + +The example of rolling two dice is a great case study for how we can construct a probability distribution, since the probability varies and it is not immediately obvious how it varies. +So let's go ahead and construct it! + +Let's first define the domain of our target $$P(n)$$. +We know that the lowest sum of two dice is $$2$$ (a $$1$$ on both dice), so $$n \geq 2$$ for sure. Similarly, the maximum is the sum of two sixes, or $$12$$, so $$n \leq 12$$ also. + +So now we know the domain of possibilities, i.e., $$n \in [\![2,12]\!]$$. +Next, we take a very common approach - for each outcome $$n$$, we count up the number of different ways it can occur. +Let's call this number the __frequency of__ $$n$$, $$f(n)$$. 
+We already mentioned that there is only one way to get $$n=2$$, by getting a pair of $$1$$s. +By our definition of the function $$f$$, this means that $$f(2)=1$$. +For $$n=3$$, we see that there are two possible ways of getting this outcome: the first die shows a $$1$$ and the second a $$2$$, or the first die shows a $$2$$ and the second a $$1$$. +Thus, $$f(3)=2$$. +If you continue doing this for all $$n$$, you may see a pattern (homework for the reader!). +Once you have all the $$f(n)$$, we can visualize it by plotting $$f(n)$$ vs $$n$$, as shown below. + +

+ <FIG> Die Roll +

+ +We can see from the plot that the most common outcome for the sum of two dice is a $$n=7$$, and the further away from $$n=7$$ you get, the less likely the outcome. +Good to know, for a prospective gambler! + +### Normalization + +The $$f(n)$$ plotted above is technically NOT the probability $$P(n)$$ – because we know that the sum of all probabilities should be $$1$$, which clearly isn't the case for $$f(n)$$. +But we can get the probability by dividing $$f(n)$$ by the _total_ number of possibilities, $$N$$. +For two dice, that is $$N = 6 \times 6 = 36$$, but we could also express it as the _sum of all frequencies_, + +$$ +N = \sum_n f(n), +$$ + +which would also equal to $$36$$ in this case. +So, by dividing $$f(n)$$ by $$\sum_n f(n)$$ we get our target probability distribution, $$P(n)$$. +This process is called __normalization__ and is crucial for determining almost any probability distribution. +So in general, if we have the function $$f(n)$$, we can get the probability as + +$$ +P(n) = \frac{f(n)}{\displaystyle\sum_{n} f(n)}. +$$ + +Note that $$f(n)$$ does not necessarily have to be the frequency of $$n$$ – it could be any function which is _proportional_ to $$P(n)$$, and the above definition of $$P(n)$$ would still hold. +It's easy to check that the sum is now equal to $$1$$, since + +$$ +\sum_n P(n) = \frac{\displaystyle\sum_{n}f(n)}{\displaystyle\sum_{n} f(n)} = 1. +$$ + +Once we have the probability function $$P(n)$$, we can calculate all sorts of probabilites. +For example, let's say we want to find the probability that $$n$$ will be between two integers $$a$$ and $$b$$, inclusively (also including $$a$$ and $$b$$). +For brevity, we will use the notation $$\mathbb{P}(a \leq n \leq b)$$ to denote this probability. +And to calculate it, we simply have to sum up all the probabilities for each value of $$n$$ in that range, i.e., + +$$ +\mathbb{P}(a \leq n \leq b) = \sum_{n=a}^{b} P(n). 
+$$ + +## Probability Density Functions + +What if instead of a discrete variable $$n$$, we had a continuous variable $$x$$, like temperature or weight? +In that case, it doesn't make sense to ask what the probability is of $$x$$ being _exactly_ a particular number – there are infinite possible real numbers, after all, so the probability of $$x$$ being exactly any one of them is essentially zero! +But it _does_ make sense to ask what the probability is that $$x$$ will be _between_ a certain range of values. +For example, one might say that there is $$50\%$$ chance that the temperature tomorrow at noon will be between $$5$$ and $$15$$, or $$5\%$$ chance that it will be between $$16$$ and $$16.5$$. +But how do we put all that information, for every possible range, in a single function? +The answer is to use a __probability density function__. + + What does that mean? +Well, suppose $$x$$ is a continous quantity, and we have a probability density function, $$P(x)$$ which looks like + +

+ <FIG> probability density +

+ +Now, if we are interested in the probability of the range of values that lie between $$x_0$$ and $$x_0 + dx$$, all we have to do is calculate the _area_ of the green sliver above. +This is the defining feature of a probability density function: + + __the probability of a range of values is the _area_ of the region under the probability density curve which is within that range.__ + + +So if $$dx$$ is infinitesimally small, then the area of the green sliver becomes $$P(x)dx$$, and hence, + +$$ +\mathbb{P}(x_0 \leq x \leq x_0 + dx) = P(x)dx. +$$ + +So strictly speaking, $$P(x)$$ itself is NOT a probability, but rather the probability is the quantity $$P(x)dx$$, or any area under the curve. +That is why we call $$P(x)$$ the probability _density_ at $$x$$, while the actual probability is only defined for ranges of $$x$$. + +Thus, to obtain the probability of $$x$$ lying within a range, we simply integrate $$P(x)$$ between that range, i.e., + +$$ +\mathbb{P}(a \leq x \leq b ) = \int_a^b P(x)dx. +$$ + +This is analagous to finding the probability of a range of discrete values from the previous section: + +$$ +\mathbb{P}(a \leq n \leq b) = \sum_{n=a}^{b} P(n). +$$ + +The fact that all probabilities must sum to $$1$$ translates to + +$$ +\int_D P(x)dx = 1. +$$ + +where $$D$$ denotes the __domain__ of $$P(x)$$, i.e., the entire range of possible values of $$x$$ for which $$P(x)$$ is defined. + +### Normalization of a Density Function + +Just like in the discrete case, we often first calculate some density or frequency function $$f(x)$$, which is NOT $$P(x)$$, but proportional to it. +We can get the probability density function by normalizing it in a similar way, except that we integrate instead of sum: + +$$ +P(\mathbf{x}) = \frac{f(\mathbf{x})}{\int_D f(\mathbf{x})d\mathbf{x}}. +$$ + +For example, consider the following __Gaussian function__ (popularly used in __normal distributions__), + +$$ +f(x) = e^{-x^2}, +$$ + +which is defined for all real numbers $$x$$. 
+We first integrate it (or do a quick Google search, as it is rather tricky) to get + +$$ +N = \int_{-\infty}^{\infty} e^{-x^2} dx = \sqrt{\pi}. +$$ + +Now we have a Gaussian probability distribution, + +$$ +P(x) = \frac{1}{N} e^{-x^2} = \frac{1}{\sqrt{\pi}} e^{-x^2}. +$$ + +In general, normalization can allow us to create a probability distribution out of almost any function $$f(\mathbf{x})$$. +There are really only two rules that $$f(\mathbf{x})$$ must satisfy to be a candidate for a probability density function: +1. The integral of $$f(\mathbf{x})$$ over any subset of $$D$$ (denoted by $$S$$) has to be non-negative (it can be zero): +$$ +\int_{S}f(\mathbf{x})d\mathbf{x} \geq 0. +$$ +2. The following integral must be finite: +$$ +\int_{D} f(\mathbf{x})d\mathbf{x}. +$$ + + + +## License + +##### Images/Graphics + +- The image "[Frequency distribution of a double die roll](res/double_die_frequencies.png)" was created by [K. Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +- The image "[Probability Density](res/normal_distribution.png)" was created by [K. Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +##### Text + +The text of this chapter was written by [K. Shudipto Amin](https://github.com/shudipto-amin) and is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/legalcode). + +[

](https://creativecommons.org/licenses/by-sa/4.0/) + +##### Pull Requests + +After initial licensing ([#560](https://github.com/algorithm-archivists/algorithm-archive/pull/560)), the following pull requests have modified the text or graphics of this chapter: +- none + diff --git a/contents/probability/distributions/res/double_die_frequencies.png b/contents/probability/distributions/res/double_die_frequencies.png new file mode 100644 index 000000000..874633331 Binary files /dev/null and b/contents/probability/distributions/res/double_die_frequencies.png differ diff --git a/contents/probability/distributions/res/normal_distribution.png b/contents/probability/distributions/res/normal_distribution.png new file mode 100644 index 000000000..36ea67790 Binary files /dev/null and b/contents/probability/distributions/res/normal_distribution.png differ diff --git a/literature.bib b/literature.bib index 2f22c0dc8..b1a8e154e 100644 --- a/literature.bib +++ b/literature.bib @@ -346,3 +346,104 @@ @misc{3b1b_finger_count url={https://youtu.be/1SMmc9gQmHQ}, year={2015} } + +#------------------------------------------------------------------------------# +# Metropolis # +#------------------------------------------------------------------------------# + +@article{metropolis1953equation, +author = {Metropolis,Nicholas and Rosenbluth,Arianna W. and Rosenbluth,Marshall N. and Teller,Augusta H. 
and Teller,Edward }, +title = {Equation of State Calculations by Fast Computing Machines}, +journal = {The Journal of Chemical Physics}, +volume = {21}, +number = {6}, +pages = {1087-1092}, +year = {1953}, +doi = {10.1063/1.1699114}, + +URL = { + https://doi.org/10.1063/1.1699114 + +}, +eprint = { + https://doi.org/10.1063/1.1699114 + +} + +} + +@article{rosenthal2011optimal, + title={Optimal proposal distributions and adaptive MCMC}, + author={Rosenthal, Jeffrey S and others}, + journal={Handbook of Markov Chain Monte Carlo}, + volume={4}, + number={10.1201}, + year={2011}, + publisher={Chapman & Hall/CRC Boca Raton, FL} +} + +@article{gareth2001optimal, +author = {Gareth O. Roberts and Jeffrey S. Rosenthal}, +title = {Optimal scaling for various Metropolis-Hastings algorithms}, +volume = {16}, +journal = {Statistical Science}, +number = {4}, +publisher = {Institute of Mathematical Statistics}, +pages = {351 -- 367}, +year = {2001}, +doi = {10.1214/ss/1015346320}, +URL = {https://doi.org/10.1214/ss/1015346320} +} + +@misc{potential_energy_wiki, + title={Wikipedia: Potential Energy}, + url={https://en.wikipedia.org/wiki/Potential_energy}, + year={2021} +} + +@misc{ludwig_boltzmann_wiki, + title={Wikipedia: Ludwig Boltzmann}, + url={https://en.wikipedia.org/wiki/Ludwig_Boltzmann}, + year={2021} +} + +@misc{boltzmann_distribution_wiki, + title={Wikipedia: Boltzmann distribution}, + url={https://en.wikipedia.org/wiki/Boltzmann_distribution}, + year={2021} +} + +@article{hastings1970monte, + author = {Hastings, W. K.}, + title = "Monte Carlo sampling methods using Markov chains and their applications", + journal = {Biometrika}, + volume = {57}, + number = {1}, + pages = {97-109}, + year = {1970}, + month = {04}, + abstract = "{A generalization of the sampling method introduced by Metropolis et al. (1953) is presented along with an exposition of the relevant theory, techniques of application and methods and difficulties of assessing the error in Monte Carlo estimates. 
Examples of the methods, including the generation of random orthogonal matrices and potential applications of the methods to numerical problems arising in statistics, are discussed.}", + issn = {0006-3444}, + doi = {10.1093/biomet/57.1.97}, + url = {https://doi.org/10.1093/biomet/57.1.97}, + eprint = {https://academic.oup.com/biomet/article-pdf/57/1/97/23940249/57-1-97.pdf}, +} + +@misc{mala_wiki, + title={Wikipedia: Metropolis-adjusted Langevin Algorithm}, + url={https://en.wikipedia.org/wiki/Metropolis-adjusted_Langevin_algorithm}, + year={2021} +} + +@misc{hmc_wiki, + title={Wikipedia: Hamiltonian Monte Carlo}, + url={https://en.wikipedia.org/wiki/Hamiltonian_Monte_Carlo}, + year={2021} +} + +@misc{rmsd_wiki, + title={Wikipedia: Root Mean Square Deviation}, + url={https://en.wikipedia.org/wiki/Root-mean-square_deviation}, + year={2021} +} +