In [0]:
import numpy as np
import pandas as pd
import scipy.stats as ss
import math

from statsmodels.stats.power import TTestPower # There is no normal distribution here

In [47]:
from google.colab import drive
drive.mount('mydrive')

Drive already mounted at mydrive; to attempt to forcibly remount, call drive.mount("mydrive", force_remount=True).


In [48]:
gifted = pd.read_csv('/content/mydrive/My Drive/IE - Statistics and Data Analysis - DUAL - 2019/DataSets/gifted.csv')
gifted.head(5)

Unnamed: 0,score,fatheriq,motheriq,speak,count,read,edutv,cartoons
0,159,115,117,18,26,1.9,3.0,2.0
1,164,117,113,20,37,2.5,1.75,3.25
2,154,115,118,20,32,2.2,2.75,2.5
3,157,113,131,12,24,1.7,2.75,2.25
4,156,110,109,17,34,2.2,2.25,2.5


**Using the `count` variable let's answer the following: In a report you have read that the average age at which gifted children are able to count up to 20 is 31 months. However, a new research claims that the children should be denoted as gifted only if this age is actually 30 months. Find the probability that you can detect such an average age from your sample if it were actaully the case. Use a significance level of 1%. What would be the sample size needed in case you want that this probability is 99%. Assume that the population standard deviation is of 3 months.**

In [49]:
gifted['count'].mean()

30.694444444444443

The decision scheme is

\begin{equation}
H_0:\{\mu \geq 31\},\quad H_1:\{\mu < 31\}
\end{equation}

so it is a left-tailed test. Once we have the alternative value, this becomes

\begin{equation}
H_0:\{\mu = 31\},\quad H_1:\{\mu = 30\}
\end{equation}

In [0]:
# population means
mu0 = 31
mu1 = 30

# population standard deviation
sigma = 3
stdev = gifted['count'].std()

# sample size
n = len(gifted['count'])

# significance level
SL = 0.01

Let's find the **effect size** and the **z-value**

In [0]:
# Effect Size 
delta = mu0 - mu1

# z value
zval = ss.norm.isf(SL)

Now we can find the **beta** and the **power of the test**

In [52]:
# power of the test
power = ss.norm.cdf(delta/(sigma/np.sqrt(n)) - zval)
beta = 1 - power

print('The power of the test is: {:1.8f}\nThe beta is: {:1.8f}'.format(power, beta))

The power of the test is: 0.37208059
The beta is: 0.62791941


Let's find the **sample size** needed for a power of the test of 99%

In [55]:
power = 0.99

# B value
B = ss.norm.ppf(power)

# new sample sample size
new_n = ((B + zval) * sigma / delta)**2

# print the output
print('The sample size needed is {:1.2f}'.format(np.ceil(new_n)))

The sample size needed is 195.00


In [62]:
analysis = TTestPower()

# find the power
powerTest = analysis.solve_power(effect_size = (mu0 - mu1)/sigma, nobs = n, alpha = SL, alternative = 'larger')
new_n = analysis.solve_power(effect_size = (mu0 - mu1)/sigma, power = power, alpha = SL, alternative = 'larger')

# print the output
print('The probability of a Type II error is {:1.6f}\nThe power of the test is {:1.6f}'.format(1 - powerTest, powerTest))
print('The sample size for a 99% of power is: ', np.ceil(new_n))

The probability of a Type II error is 0.656650
The power of the test is 0.343350
The sample size for a 99% of power is:  198.0


**Using the `speak` variable let's answer the following: In a report you have read that the average age at which gifted children are able to speak is 17 months. However, a new research claims that the children should be denoted as gifted only if this age is actually 18.5 months. Find the probability that you can detect such an average age from your sample if it were actaully the case. Use a significance level of 5%. What would be the sample size needed in case you want that this probability is 95%**

In [5]:
gifted['speak'].mean()

18.0

The decision scheme is 

\begin{equation}
H_0:\{\mu\leq 17\},\quad H_1:\{\mu > 17\}
\end{equation}

then once we are given an alternative value, this becomes:

\begin{equation}
H_0:\{\mu = 17\},\quad H_1:\{\mu = 18.5\}
\end{equation}

In [0]:
# population means
mu0 = 17
mu1 = 18.5

# sample values
mean = gifted['speak'].mean()
stdev = gifted['speak'].std()
n = len(gifted['speak'])

# significance level
SL = 0.05

In [66]:
# critical t
tcrit = ss.t.isf(SL, n-1)

# effect size
delta = mu0 - mu1

# beta value
beta_val = delta / (stdev/(np.sqrt(n))) + tcrit

# power of the test
beta = ss.t.cdf(beta_val, n-1)
power = 1 - beta

# print the output
print('The probability of a Type II error is {:1.6f}\nThe power of the test is {:1.6f}'.format(beta, power))

The probability of a Type II error is 0.139161
The power of the test is 0.860839


The probability that we may detect that the average age at which gifted children speak is 18.5 months is 86.0.8% if that age is actually the true age.

Let's now use the `statsmodels` module to find the value of the power of the test

In [67]:
analysis = TTestPower()
powerTest = analysis.solve_power(effect_size = (mu0 - mu1)/stdev, nobs = n, alpha = SL, alternative = 'smaller')

print('The probability of a Type II error is {:1.6f}\nThe power of the test is {:1.6f}'.format(1 - powerTest, powerTest))

The probability of a Type II error is 0.137564
The power of the test is 0.862436


I have used the alternative as smaller because $\mu_1$ is greater than $\mu_0$ and then the argument for the effect size in `power()` is negative. If you want to respect the decision scheme structure you have to ALWAYS use a positive value of the `effect_size` argument

In order to find the sample size needed for a power of 95%, we are going to use the normal approximation (not needed)

In [68]:
A = ss.norm.ppf(0.05)

# critical z
zcrit = ss.norm.isf(SL)

# sample size needed
new_n = ((A - zcrit) * stdev / delta)**2

# print the output
print('The sample size needed is {:1.2f}'.format(new_n))

The sample size needed is 50.02


Let's find the sample size using `statsmodels` and the `solve_power` function

In [69]:
pwr = 0.95
sample_size = analysis.solve_power(effect_size = (mu0 - mu1)/stdev, power = pwr, alpha = SL, alternative = 'smaller')

# print the output
print('The sample size needed is {:1.2f}'.format(sample_size))

The sample size needed is 51.41


**Using the `score` variable let's answer the following: In a report you have read that the average IQ score of gifted children 155. However, you do not fully agree with this value and you claim that it is different to it. On a second approach you want to see what is the probability that you may find that the average IQ score is 160 for a 1% of significance level. What would be the sample size needed in case you want that this probability is 99.99%**

In [38]:
gifted['score'].mean()

159.13888888888889

In [0]:
# population means
mu0 = 155
mu1 = 160

# significance level
SL = 0.01
pwr = 0.99999

# sample information
n = len(gifted['score'])
stdev = gifted['score'].std()

# critical t
tcrit = ss.t.isf(SL/2, n-1)

In [40]:
# boundaries
delta = mu0 - mu1
std_delta = delta / (stdev/(np.sqrt(n)))

upp = std_delta + tcrit
low = std_delta - tcrit

# power of the test
beta = ss.t.cdf(upp, n-1) - ss.t.cdf(low, n-1)
power = 1 - beta

# print the output
print('The probability of a Type II error is {:1.6f}\nThe power of the test is {:1.6f}'.format(beta, power))

The probability of a Type II error is 0.000315
The power of the test is 0.999685


Let's find the power of the test using `statsmodels`. 

In [0]:
power_analysis = TTestPower()

In [34]:
powerTest = power_analysis.solve_power(effect_size = (mu1 - mu0)/stdev, nobs = n, alpha = SL, alternative = 'two-sided')

'''
The effect_size must be positive and it is delta/stdev
'''

# Print the output
print('The probability of a Type II error is {:1.6f}\nThe power of the test is {:1.6f}'.format(1-powerTest, powerTest))

The probability of a Type II error is 0.000169
The power of the test is 0.999831


Let's find the **sample size**

In [45]:
new_n = power_analysis.solve_power(effect_size = (mu1 - mu0)/stdev, power = pwr, alpha = SL, alternative = 'two-sided')

'''
The effect_size must be positive and it is delta/stdev
'''

# Print the output
print('The sample size needed is', np.ceil(new_n))

The sample size needed is 44.0
