In [1]:
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std

# Fix the legacy global NumPy RNG so the draws below are repeatable.
seed(1)

# Two Gaussian samples of 100 points each, both scaled by 5;
# only the additive offset differs (50 vs 51).
data1 = 50 + 5 * randn(100)
data2 = 51 + 5 * randn(100)

# Pack (mean, std) per sample, then report them.
# Note: numpy's std defaults to ddof=0, i.e. the population estimate.
stats1 = (mean(data1), std(data1))
stats2 = (mean(data2), std(data2))
print('data1: mean=%.3f stdv=%.3f' % stats1)
print('data2: mean=%.3f stdv=%.3f' % stats2)

data1: mean=50.303 stdv=4.426
data2: mean=51.764 stdv=4.660


* As can be seen, the means of the two samples differ by roughly 1 (about 1.46 here, because of sampling noise), which reflects the difference between the intercepts (51 - 50).
* In general, adding a number to the intercept increases the mean of a normal distribution by that same number, while the STD does not change. Multiplying the slope (scale) by a number, however, multiplies the STD by the same number (see the code below). There is some discrepancy because each call to randn draws new random values; if the same transformations were applied to exactly identical random draws, the relationship would hold exactly.

In [2]:
# Reset the RNG so data1 reproduces the sample from the previous cell.
seed(1)

# Same offset (50) for both samples; the second uses twice the scale (10 vs 5).
data1 = 50 + 5 * randn(100)
data2 = 50 + 10 * randn(100)

# Report mean and standard deviation (ddof=0) through the ndarray methods.
print('data1: mean=%.3f stdv=%.3f' % (data1.mean(), data1.std()))
print('data2: mean=%.3f stdv=%.3f' % (data2.mean(), data2.std()))

data1: mean=50.303 stdv=4.426
data2: mean=51.528 stdv=9.320


In [6]:
# Student's t-test
# H0 of this test is that the means of the two populations are equal; rejecting H0 indicates that the means differ
from numpy.random import seed
from numpy.random import randn
from scipy.stats import ttest_ind

# Seed the global RNG for reproducible samples.
seed(1)

# Two independently drawn Gaussian samples (100 points, scale 5)
# whose offsets differ by 1 (50 vs 51).
data1 = 50 + 5 * randn(100)
data2 = 51 + 5 * randn(100)

# Independent two-sample t-test: yields the t statistic and its p-value.
stat, p = ttest_ind(a=data1, b=data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# alpha is the significance level (5%), which corresponds to a 95% confidence level.
alpha = 0.05
print('Same distributions (fail to reject H0)' if p > alpha else 'Different distributions (reject H0)')
# With seed(1) the p-value falls below alpha, so H0 (equal population means) is rejected.

Statistics=-2.262, p=0.025
Different distributions (reject H0)


In [7]:
# Paired Student's t-test
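# We use the paired test when the two samples are dependent (paired), e.g. the same subjects measured twice; H0 is that the mean of the paired differences is zero
# Note: this test does not check whether the two samples are related; detecting related input variables (multicollinearity) requires a correlation analysis instead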
from numpy.random import seed
from numpy.random import randn
from scipy.stats import ttest_rel

# Seed the global RNG for reproducible samples.
seed(1)

# Two Gaussian samples (100 points, scale 5) with offsets 50 and 51.
# NOTE: they come from two separate randn calls, so the observations are
# paired only by index here; the paired test is applied for demonstration.
data1 = 50 + 5 * randn(100)
data2 = 51 + 5 * randn(100)

# Paired (related-samples) t-test: yields the t statistic and its p-value.
stat, p = ttest_rel(a=data1, b=data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Compare the p-value against the 5% significance level.
alpha = 0.05
print('Same distributions (fail to reject H0)' if p > alpha else 'Different distributions (reject H0)')

Statistics=-2.372, p=0.020
Different distributions (reject H0)


In [8]:
# Analysis of Variance test -> One-way ANOVA
# ANOVA is a statistical test whose null hypothesis (H0) is that the means of two or more data samples are equal. If the evidence (p-value <= alpha) leads us to reject H0 (equal means), then at least one data sample has a different mean.
# The assumptions of ANOVA are: 1. all data samples are normally distributed, 2. the samples are independent, and 3. all data samples have the same STD
from numpy.random import seed
from numpy.random import randn
from scipy.stats import f_oneway

# Seed the global RNG for reproducible samples.
seed(1)

# Three Gaussian samples (100 points, scale 5): the first two share an
# offset of 50, the third is shifted to 52, so the means are not all equal.
data1 = 50 + 5 * randn(100)
data2 = 50 + 5 * randn(100)
data3 = 52 + 5 * randn(100)

# One-way ANOVA across the three samples: yields the F statistic and its p-value.
samples = (data1, data2, data3)
stat, p = f_oneway(*samples)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Compare the p-value against the 5% significance level.
alpha = 0.05
print('Same distributions (fail to reject H0)' if p > alpha else 'Different distributions (reject H0)')

Statistics=3.655, p=0.027
Different distributions (reject H0)
