In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression

In [None]:
# Notebook-wide pseudo-random generator; the fixed seed makes the draws taken
# from `rng` repeatable when the notebook is run top to bottom.
rng = np.random.RandomState(seed=0)

In [None]:
# source: https://www.uio.no/studier/emner/matnat/math/STK1000/h17/normalfordelingen-tetthetskurver-ch-1.4-11.09.pdf
# Parameters of the simulated sample: `mean` and `sd` are used as the location
# and scale of the normal draw below, `n` is the number of observations.
mean = 168.7
sd = 5.8
n = 500

In [None]:
# Simulated heights recorded as whole numbers.
# Round to the nearest integer before casting: a bare .astype(int) truncates,
# which shifts every observation down and biases the sample mean by about
# -0.5 relative to `mean`. The histogram bins used below are centred on the
# integers, which matches rounded (not truncated) values.
data = np.round(rng.normal(loc=mean, scale=sd, size=n)).astype(int)

In [None]:
# Histogram of the simulated heights with a fitted normal density on top.
plt.figure(figsize=(10, 6))

# 60 edges from 140.5 to 199.5 -> 59 unit-wide bins centred on the integers.
# plt.hist(..., density=True, alpha=0.4) is the matplotlib equivalent of the
# deprecated sns.distplot(..., kde=False, norm_hist=True) call used before.
plt.hist(data, bins=np.linspace(140.5, 199.5, 60), density=True, alpha=0.4)

# Normal pdf evaluated on a fine grid, using the sample mean and the
# (population, ddof=0) sample standard deviation as parameters.
# `xx` and `yy` are reused by the following cells.
xx = np.arange(140.5, 199.5, 0.1)
yy = stats.norm.pdf(xx, loc=np.mean(data), scale=np.std(data))
# Draw the density on the same axes as the histogram.
plt.plot(xx, yy, 'blue')
plt.xlabel('Høyde (i cm)')
plt.ylabel('Tetthet')
plt.tight_layout()

plt.savefig('height.png')

In [None]:
# Height histogram + fitted density, with the sample mean marked in red.
plt.figure(figsize=(10, 6))

# plt.hist(..., density=True, alpha=0.4) replaces the deprecated
# sns.distplot(..., kde=False, norm_hist=True).
plt.hist(data, bins=np.linspace(140.5, 199.5, 60), density=True, alpha=0.4)
# `xx`/`yy` hold the fitted normal density computed in the first height plot cell.
plt.plot(xx, yy, 'blue')

# Vertical marker from 0 up to 0.08 at the sample mean (computed once).
height_mean = np.mean(data)
plt.plot([height_mean, height_mean], [0, 0.08], c='r')
plt.xlabel('Høyde (i cm)')
plt.ylabel('Tetthet')
plt.tight_layout()

plt.savefig('height_mean.png')

In [None]:
# Height histogram + fitted density, with the sample median marked in red.
plt.figure(figsize=(10, 6))

# plt.hist(..., density=True, alpha=0.4) replaces the deprecated
# sns.distplot(..., kde=False, norm_hist=True).
plt.hist(data, bins=np.linspace(140.5, 199.5, 60), density=True, alpha=0.4)
# `xx`/`yy` hold the fitted normal density computed in the first height plot cell.
plt.plot(xx, yy, 'blue')

# Vertical marker from 0 up to 0.08 at the sample median (computed once).
height_median = np.median(data)
plt.plot([height_median, height_median], [0, 0.08], c='r')
plt.xlabel('Høyde (i cm)')
plt.ylabel('Tetthet')
plt.tight_layout()

plt.savefig('height_median.png')

In [None]:
# Height histogram + fitted density, with the sample mode marked in red.
plt.figure(figsize=(10, 6))

# plt.hist(..., density=True, alpha=0.4) replaces the deprecated
# sns.distplot(..., kde=False, norm_hist=True).
plt.hist(data, bins=np.linspace(140.5, 199.5, 60), density=True, alpha=0.4)
# `xx`/`yy` hold the fitted normal density computed in the first height plot cell.
plt.plot(xx, yy, 'blue')

# stats.mode(...)[0] is a length-1 array in older SciPy releases and a scalar
# in newer ones; np.atleast_1d(...)[0] gives a plain scalar in both cases.
# Computing it once also avoids running the mode search twice.
height_mode = np.atleast_1d(stats.mode(data)[0])[0]
plt.plot([height_mode, height_mode], [0, 0.08], c='r')
plt.xlabel('Høyde (i cm)')
plt.ylabel('Tetthet')
plt.tight_layout()

plt.savefig('height_mode.png')

In [None]:
# Height histogram + fitted density, with mean, median and mode marked together.
plt.figure(figsize=(10, 6))

# plt.hist(..., density=True, alpha=0.4) replaces the deprecated
# sns.distplot(..., kde=False, norm_hist=True).
plt.hist(data, bins=np.linspace(140.5, 199.5, 60), density=True, alpha=0.4)
# `xx`/`yy` hold the fitted normal density computed in the first height plot cell.
plt.plot(xx, yy, 'blue')

# Each location statistic is computed once. np.atleast_1d(...)[0] normalises
# the SciPy-version-dependent return shape of stats.mode (array vs. scalar).
markers = [
    ('mean', np.mean(data)),
    ('median', np.median(data)),
    ('mode', np.atleast_1d(stats.mode(data)[0])[0]),
]
# One vertical line from 0 to 0.08 per statistic; colours come from the
# default cycle, the legend tells them apart.
for label, value in markers:
    plt.plot([value, value], [0, 0.08], label=label)
plt.xlabel('Høyde (i cm)')
plt.ylabel('Tetthet')
plt.legend()
plt.tight_layout()

plt.savefig('height_mmm.png')

In [None]:
# Bimodal example: an equal mixture of N(0, 3) and N(10, 3), recorded as
# whole numbers, with mean / median / mode marked.
n = 1000
components = [stats.norm(loc=0, scale=3), stats.norm(loc=10, scale=3)]

# Round to the nearest integer before casting. A bare .astype(int) truncates
# toward zero, so every draw in (-1, 1) would collapse onto 0 and that bin
# would be roughly twice as tall as it should be (and could become a spurious mode).
# random_state=rng ties the draws to the notebook's seeded generator so the
# figure is reproducible on a fresh run.
data = np.hstack([
    np.round(dist.rvs(size=n, random_state=rng)).astype(int)
    for dist in components
])
dmax = 0.15  # height of the vertical marker lines

# True mixture density: both components have weight 0.5.
xx = np.arange(-15, 25, .02)
yy = 0.5*(components[0].pdf(xx) + components[1].pdf(xx))

plt.figure(figsize=(10, 6))
# Unit-wide bins centred on the integers; plt.hist(..., density=True, alpha=0.4)
# replaces the deprecated sns.distplot(..., kde=False, norm_hist=True).
plt.hist(data, bins=np.arange(-15.5, 25.5, 1), density=True, alpha=0.4)
plt.plot(xx, yy, 'blue')

# np.atleast_1d(...)[0] normalises the SciPy-version-dependent return shape
# of stats.mode (length-1 array vs. scalar).
markers = [
    ('mean', np.mean(data)),
    ('median', np.median(data)),
    ('mode', np.atleast_1d(stats.mode(data)[0])[0]),
]
for label, value in markers:
    plt.plot([value, value], [0, dmax], label=label)
plt.ylabel('Tetthet')
plt.legend()
plt.tight_layout()

plt.savefig('bimodal.png')

In [None]:
# Skewed example: log-normal sample with mean / median / mode marked.
n = 10000
lognormal = stats.lognorm(s=1, loc=0, scale=0.5)
# random_state=rng ties the draws to the notebook's seeded generator so the
# figure is reproducible on a fresh run.
data = lognormal.rvs(size=n, random_state=rng)
dmax = 1.3  # height of the vertical marker lines

xx = np.arange(0, 5, .02)
yy = lognormal.pdf(xx)

bins = np.arange(0, 5, .1)

# The sample is continuous, so (almost surely) every value occurs exactly once
# and stats.mode would simply return the smallest observation rather than the
# peak of the distribution. Estimate the mode as the centre of the fullest
# histogram bin instead.
counts, edges = np.histogram(data, bins=bins)
fullest = np.argmax(counts)
data_mode = 0.5*(edges[fullest] + edges[fullest + 1])

plt.figure(figsize=(10, 6))
# plt.hist(..., density=True, alpha=0.4) replaces the deprecated
# sns.distplot(..., kde=False, norm_hist=True).
plt.hist(data, bins=bins, density=True, alpha=0.4)
plt.plot(xx, yy, 'blue')

markers = [
    ('mean', np.mean(data)),
    ('median', np.median(data)),
    ('mode', data_mode),
]
for label, value in markers:
    plt.plot([value, value], [0, dmax], label=label)
plt.ylabel('Tetthet')
plt.xlim(0, 5)
plt.legend()
plt.tight_layout()

plt.savefig('lognormal.png')

In [None]:
# Normal densities sharing the location 5 but with different standard deviations.
xx = np.arange(-15, 25, .02)
plt.figure(figsize=(10, 6))
# Start at i=1: i=0 would give scale=0, which is not a valid normal
# distribution (its pdf evaluates to NaN everywhere and nothing is drawn).
for i in range(1, 6):
    yy = stats.norm(loc=5, scale=2*i).pdf(xx)
    plt.plot(xx, yy)

# Vertical marker at the common location parameter.
plt.plot([5, 5], [0, 0.2], label='lokalisering')

plt.ylabel('Tetthet')
plt.legend()
plt.tight_layout()

plt.savefig('normal_sd.png')

In [None]:
# Central limit theorem illustration with Exponential(1) data:
# the first panel shows the exponential density, the remaining panels show
# histograms of sqrt(n) * (sample mean - 1) for growing n, each compared with
# the standard normal density.
xx_normal = np.arange(-5, 5, .01)
yy_normal = stats.norm(loc=0, scale=1).pdf(xx_normal)

expon = stats.expon(loc=0, scale=1)
xx = np.arange(-1, 10, .01)
yy = expon.pdf(xx)

fig, axes = plt.subplots(2, 3, figsize=(10, 6))

axes[0, 0].plot(xx, yy)
axes[0, 0].set_ylabel('Tetthet')
axes[0, 0].set_title('Tetthetsfunksjon')

clt = {}
ns = [1, 5, 10, 50, 100]
n_repetitions = 1000
# axes.ravel() walks the grid row by row; skipping the first entry leaves
# exactly one panel per sample size.
for ax, n in zip(axes.ravel()[1:], ns):
    # Draw all repetitions at once (one row per repetition) instead of
    # building a frozen distribution and sampling inside a Python loop.
    # random_state=rng makes the figure reproducible on a fresh run.
    samples = expon.rvs(size=(n_repetitions, n), random_state=rng)
    # Centre by the exponential mean (1) and scale by sqrt(n).
    clt[n] = np.sqrt(n)*(samples.mean(axis=1) - 1)

    # ax.hist(..., density=True, alpha=0.4) replaces the deprecated
    # sns.distplot(..., kde=False, norm_hist=True); bins='fd' keeps a
    # Freedman-Diaconis bin width as the automatic choice.
    ax.hist(clt[n], bins='fd', density=True, alpha=0.4)
    ax.plot(xx_normal, yy_normal)
    ax.set_title('n={}'.format(n))

plt.tight_layout()

plt.savefig('exponential_clt.png')

In [None]:
# Compare variance / standard deviation estimators on a tiny sample:
# NumPy's default (ddof=0), NumPy with ddof=1, and SciPy's tvar / tstd.
x = [1, 4, 5.5]
variances = (np.var(x), np.var(x, ddof=1), stats.tvar(x))
std_devs = (np.std(x), np.std(x, ddof=1), stats.tstd(x))
print(*variances)
print(*std_devs)

In [None]:
# Illustration of quartiles and the interquartile range (IQR) on 12 uniform draws.
X = rng.uniform(size=12)
q25, q75 = np.quantile(X, [0.25, 0.75])

# All observations stacked on the vertical line x = 0.
plt.scatter(np.repeat(0, len(X)), X, c='black')

# Horizontal reference lines for the median and the two quartiles.
levels = [
    (np.median(X), 'median'),
    (q25, '25% kvantil'),
    (q75, '75% kvantil'),
]
for level, label in levels:
    plt.plot([-1, 1], [level, level], label=label)
plt.ylim([0, 1])

# Double-headed arrow between the quartiles, drawn as two opposite arrows.
# Each shaft is shortened by 0.1, the same value as head_length.
arrow_style = dict(head_width=0.05, head_length=0.1, fc='k', ec='k')
shaft = q75 - q25 - 0.1
plt.arrow(-0.5, q25, 0, shaft, **arrow_style)
plt.arrow(-0.5, q75, 0, -shaft, **arrow_style)
plt.text(-0.75, 0.5, 'IQR')
plt.legend()
plt.savefig('quantiles.png')

In [None]:
# Re-seed the generator so this example does not depend on earlier draws.
rng = np.random.RandomState(42)

# Synthetic regression data: four uniform features, a noise-free linear
# signal y0 (intercept 1, coefficients 2, -3, 1, -1) and standard normal noise.
n = 100
X = rng.uniform(size=(n, 4))
true_coef = [2, -3, 1, -1]
y0 = np.dot(X, true_coef) + 1
noise = rng.normal(size=n)
y = y0 + noise

# Ordinary least squares fit; print the estimated intercept and coefficients.
lr = LinearRegression()
lr.fit(X, y)
print(lr.intercept_, lr.coef_)

In [None]:
# Loss (sum of squared residuals) of the noise-free signal y0 versus the
# fitted model, both measured against the noisy observations y.
loss_true = np.sum((y - y0)**2)
loss_fitted = np.sum((y - lr.predict(X))**2)
print('Tap med sanne koeffisienter : ', loss_true)
print('Tap med tilpassede koeffisienter: ', loss_fitted)

In [None]:
# Law of large numbers illustration: running mean of standard normal draws
# compared with the expected value 0.
data = rng.normal(loc=0, scale=1, size=10000)

# Running mean of the first k observations for k = 1..len(data).
# cumsum / k is a single O(n) pass; re-averaging every prefix with
# np.mean(data[:k]) is O(n^2).
ns = np.arange(1, len(data) + 1)
means = np.cumsum(data) / ns

plt.figure(figsize=(10, 6))

# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
sns.lineplot(x=ns, y=means, label='mean')
sns.lineplot(x=ns, y=np.zeros(len(ns)), label='forventningsverdi')

plt.xlabel('antall datapunkter')
plt.ylabel('mean')

plt.tight_layout()

plt.savefig('sample_lln.png')