In [1]:
import numpy as np
import pandas as pd
import altair as alt

In [79]:
# Fix the global NumPy seed so that a fresh Restart & Run All draws the same
# sample every time (this uniform draw and any later np.random.* calls).
SEED = 20240101
np.random.seed(SEED)

# number of simulated observations
n = 100

# predictor: n independent draws from Uniform[0, 1)
x = np.random.uniform(low = 0, high = 1, size = n)

# one-column frame holding the predictor; a response column is added to it later
sim_df = pd.DataFrame({'x': x})

In [105]:
## linear relationship
# Simulate y = a + b*x + noise and report the correlation between x and y.

# intercept, slope
a, b = 1, -2

# noise: one independent Normal(0, noise_sd) draw per observation.
# Sized from n rather than a literal 100 so the vector always matches x.
noise_sd = 0.5
noise = np.random.normal(loc = 0, scale = noise_sd, size = n)

# simulate y (overwrites any existing 'y' column in sim_df)
sim_df['y'] = a + b*x + noise

# plot
scatter = alt.Chart(
    sim_df
).mark_point().encode(
    x = 'x',
    y = 'y'
)

# compute correlation: pick the (x, y) entry of the pairwise correlation matrix
print('correlation: ', sim_df.corr().loc['x', 'y'])

# last expression of the cell, so the chart is rendered as the cell output
scatter

correlation:  -0.7751417785628631


In [106]:
## outlier sensitivity
# Show how a single extreme point changes the correlation of the current
# (x, y) data.

# One extra observation at (3, 3). Values are given positionally in sim_df's
# column order, which is how the row was specified before.
outlier = pd.DataFrame([[3.0, 3.0]], columns = sim_df.columns)

# Build a separate frame instead of appending to sim_df and slicing the row
# back off afterwards: sim_df is never modified, so the cell can be re-run
# safely and does not depend on hard-coded row labels (100 / 0:99).
outlier_df = pd.concat([sim_df, outlier], ignore_index = True)

# plot
scatter = alt.Chart(
    outlier_df
).mark_point().encode(
    x = 'x',
    y = 'y'
)

# compute correlation on the data including the outlier
print('correlation: ', outlier_df.corr().loc['x', 'y'])

# last expression of the cell, so the chart is rendered as the cell output
scatter


correlation:  -0.3019971006847511


In [107]:
## quadratic relationship
# Simulate y = c*(x - a)*(x - b) + noise and report the correlation of x and y.

# first root, second root, scale of the parabola c*(x - a)*(x - b).
# With a == b the two roots coincide, i.e. y = c*(x - a)**2 with its vertex
# at x = a and height 0.
# NOTE(review): b was previously labelled "center y", but the formula below
# uses it as an x-root, not a vertical offset -- confirm which was intended.
a, b, c = 0.5, 0.5, 3

# noise: one independent Normal(0, noise_sd) draw per observation.
# Sized from n rather than a literal 100 so the vector always matches x.
noise_sd = 0.1
noise = np.random.normal(loc = 0, scale = noise_sd, size = n)

# simulate y (overwrites any existing 'y' column in sim_df)
sim_df['y'] = c*(x - a)*(x - b) + noise

# plot
scatter = alt.Chart(
    sim_df
).mark_point().encode(
    x = 'x',
    y = 'y'
)

# compute correlation: pick the (x, y) entry of the pairwise correlation matrix
print('correlation: ', sim_df.corr().loc['x', 'y'])

# last expression of the cell, so the chart is rendered as the cell output
scatter

correlation:  -0.038293509416259405


In [108]:
## log relationship
# Simulate y = a + b*log(x) + noise and report the correlation of x and y.

# offset, scale
a, b = 0.5, 0.5

# noise: one independent Normal(0, noise_sd) draw per observation.
# Sized from n rather than a literal 100 so the vector always matches x.
noise_sd = 0.1
noise = np.random.normal(loc = 0, scale = noise_sd, size = n)

# simulate y (overwrites any existing 'y' column in sim_df);
# np.log is the natural logarithm, so y decreases steeply as x approaches 0
sim_df['y'] = a + b*np.log(x) + noise

# plot
scatter = alt.Chart(
    sim_df
).mark_point().encode(
    x = 'x',
    y = 'y'
)

# compute correlation: pick the (x, y) entry of the pairwise correlation matrix
print('correlation: ', sim_df.corr().loc['x', 'y'])

# last expression of the cell, so the chart is rendered as the cell output
scatter

correlation:  0.8699655736164486
