# Statistical Thinking in Python Functions for Reuse:

In [2]:
import numpy as np

## PMF vs PDF vs CDF vs ECDF:
__PMF__: *Discrete* outcomes (/discrete random variables); normal/Gaussian distribution (bell-curve) \
__PDF__: *Continuous* outcomes (/continuous random variables); normal/Gaussian distribution (bell-curve) \
__CDF__: *Hypothetical* probability distribution; exponential or normal/Sigmoid \
__ECDF__: *Observed* probability distribution; exponential or normal/Sigmoid

## Binomial Distribution:
The number *r* of successes in *n* Bernoulli (success/fail) trials, with probability *p* of success, is Binomially distributed. \
np.random.binomial()

## ECDF
Empirical Cumulative Distribution Function

In [29]:
def ecdf(data):
    """Return the ECDF coordinates for a one-dimensional set of measurements.

    The first returned array holds the measurements in ascending order; the
    second holds the matching cumulative fractions 1/n, 2/n, ..., 1, where n
    is the number of measurements.
    """
    count = len(data)

    # Horizontal axis: the observations, smallest first
    sorted_values = np.sort(data)

    # Vertical axis: fraction of observations at or below each sorted value
    cumulative_fraction = np.arange(1, count + 1) / count

    return sorted_values, cumulative_fraction

### Bernoulli Trials

In [30]:
def perform_bernoulli_trials(n, p):
    """Perform n Bernoulli trials with success probability p
    and return number of successes.

    Each trial draws a uniform random number in [0, 1) from NumPy's global
    random state and counts as a success when that number is less than p.

    Parameters
    ----------
    n : int
        Number of trials. Zero or a negative value performs no trials.
    p : float
        Probability of success for each trial.

    Returns
    -------
    int
        Number of successful trials.
    """
    # Draw every trial's random number in a single vectorized call instead of
    # looping in Python; clamp at zero so a negative n still means "no trials"
    random_numbers = np.random.random(size=max(n, 0))

    # A trial is a success when its random number is less than p
    return int(np.count_nonzero(random_numbers < p))

## Poisson: 
__Poisson process:__ The timing of the next event is completely independent of when the previous event happened (ie bus arrivals in Poissonville) \
__Poisson distribution:__ A limit of the Binomial distribution for low probability of success and large number of trials (ie, for rare events):
1) The number *r* of arrivals of a Poisson process in a given time interval with average rate of $\lambda$ arrivals per interval is Poisson distributed. \
2) The number *r* of hits on a website in one hour with an average hit rate of 6 hits per hour is Poisson distributed.\
__np.random.poisson()__ \

The waiting time between arrivals of a Poisson process is Exponentially distributed. \

__successive_poisson() function:__

In [32]:
def successive_poisson(tau1, tau2, size=1):
    """Compute time for arrival of 2 successive Poisson processes.

    The total waiting time is the sum of two exponentially distributed
    waiting times, one per process.

    Parameters
    ----------
    tau1 : float
        Scale (mean waiting time) of the first exponential distribution.
    tau2 : float
        Scale (mean waiting time) of the second exponential distribution.
    size : int or tuple of ints, optional
        Number (or shape) of total waiting times to draw. Defaults to 1.

    Returns
    -------
    numpy.ndarray
        Array of shape `size` holding the summed waiting times.
    """
    # Draw samples out of first exponential distribution: t1
    # (honor the caller's `size` rather than always drawing one sample)
    t1 = np.random.exponential(tau1, size=size)

    # Draw samples out of second exponential distribution: t2
    t2 = np.random.exponential(tau2, size=size)

    return t1 + t2

## Exponential: 
The waiting time between arrivals of a Poisson process is Exponentially distributed. \
Parameters: mean (waiting time), size \
__np.random.exponential(scale=1.0, size=None)__

## Checking normality of distribution:

import numpy as np \
mean = np.mean(michelson_speed_of_light) \
std = np.std(michelson_speed_of_light) \
samples = np.random.normal(mean, std, size = 10000) \
x, y = ecdf(michelson_speed_of_light) \
x_theor, y_theor = ecdf(samples) \

Then, plot the empirical and theoretical CDFs on the same plot to check for normal distribution. \
__This is preferable to a histogram check for normal distribution because there is no binning bias.__

## Computing Percentiles:
np.percentile(df['column'], [list of percentiles])
#### 25th, 50th, 75th percentiles: 
np.percentile(df['column'], [25, 50, 75])
#### 95% Confidence interval:
np.percentile(df['column'], [2.5, 97.5])
#### 99% Confidence interval:
np.percentile(df['column'], [0.5, 99.5])

## Pearson Correlation Coefficient:
Pearson correlation coefficient, $\rho$, ranges from -1 (for complete anti-correlation) to 1 (for complete positive correlation). $\rho$ = 0 indicates no correlation.

__Covariance:__ a measure of how two quantities vary *together.*

$\rho$ = covariance / [(std of x)(std of y)]

$\rho$ = variability due to codependence / independent variability

In [31]:
def pearson_r(x, y):
    """Return the Pearson correlation coefficient of two arrays.

    The value is read from the off-diagonal entry (row 0, column 1) of the
    correlation matrix that NumPy computes for x and y.
    """
    correlation_matrix = np.corrcoef(x, y)

    # Row 0 holds the correlations of x with (x, y); pick the one with y
    x_row = correlation_matrix[0]
    return x_row[1]

## The np.random module:
A suite of functions based on pseudo-random number generation.

__np.random.seed()__ \
set the seed

__np.random.random(size= )__ \
draw a number between 0 and 1

__np.random.binomial(4, 0.5)__ \
__np.random.binomial(4, 0.5, 10)__ \

sampling from a Binomial distribution \
arguments: \
(4) = number of Bernoulli trials (coin flips) \
(0.5) = probability of success (50:50) \
(10) = how many times to repeat the (4 flip) experiment


__np.random.poisson(5, 10000)__ \
random.poisson(lam=1.0, size=None)

__np.random.normal(mean, std, size)__ \
np.random.normal(np.mean(height), np.std(height), size = 10000)

__random.exponential(scale=1.0, size=None)__  \
np.random.exponential(mean, 10000)

## Plots:

### Bee swarm plot:
sns.swarmplot() \
sns.swarmplot(x) \
sns.swarmplot(x, y, data) 

sns.swarmplot(*, x=None, y=None, hue=None, data=None, order=None, hue_order=None, dodge=False, orient=None, color=None, palette=None, size=5, edgecolor='gray', linewidth=0, ax=None, **kwargs)

## Other:

__np.arange()__ \
([start, ]stop, [step, ]dtype=None, *, like=None) \
Create an np array

__Simulating coin flips__:

In [None]:
# Estimate, by simulation, the probability that four fair coin flips
# all come up heads.
n_trials = 10000  # number of repeated experiments
n_flips = 4       # coin flips per experiment

# One row per experiment, one column per flip; a draw below 0.5 is a head.
# A single vectorized draw replaces the Python-level loop over experiments.
heads = np.random.random(size=(n_trials, n_flips)) < 0.5

# Number of heads observed in each experiment
n_heads = np.sum(heads, axis=1)

# Count the experiments in which every flip was a head
n_all_heads = int(np.sum(n_heads == n_flips))

# Fraction of experiments with all heads (displayed as the cell output)
n_all_heads / n_trials