In [10]:
import pandas as pd
import statistics as sts
from scipy import stats as st
import numpy as np

In [11]:
# Calculate the standard and relative standard deviation
# of LC (liquid chromatography) peak area data

In [12]:
df1 = pd.read_csv('data/SD_RSD.csv')
df1

Unnamed: 0,Data
0,2957398
1,3733127
2,2900811
3,3010190
4,2810196
5,2084063
6,2812196
7,2830106
8,2710116
9,2910196


In [13]:
#Standard deviation of the data set using the statistics module
sd1 = sts.stdev(df1['Data'])
sd1

398088.9166369884

In [14]:
#Now using the describe pandas function, I select the number from the returned dataframe
summ_df1 = df1.describe()
sd2 = df1.describe().loc['std'][0]
sd2

398088.9166369884

In [15]:
summ_df1

Unnamed: 0,Data
count,10.0
mean,2875840.0
std,398088.9
min,2084063.0
25%,2810696.0
50%,2865458.0
75%,2945598.0
max,3733127.0


In [16]:
#But what is the magnitude of the spread compared to the raw data?
#Let's calculate the RSD (relative standard deviation)
the_mean = summ_df1.loc['mean'][0]
the_RSD = round(sd1/the_mean * 100,1)
print(f'{the_RSD}%')

13.8%


In [17]:
#13.8%, not great for an LC method.  We probably need to do a bit more optimization or try and figure out the source of the variability.

In [18]:
#Now let's calculate some 95% confidence intervals using scipy.stats
#https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
#See the interval() method for the t continuous variable at:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.t.html#scipy.stats.t
the_data = df1['Data']
interval = st.t.interval(0.95, len(the_data)-1, loc=np.mean(the_data), scale=st.sem(the_data))
interval


(2591064.2443005526, 3160615.555699447)

In [None]:
#What does this tell us?  95% of repeated confidence intervals will include the population mean.  Or, we have 95% confidence the true mean is within the limits above.

In [None]:
#Now, what if we have a rather large RSD, 14%, but we want to be confident that our sample is closely approximating our real mean (the population mean).
#Put a different way, we want to decrease our relative error of the mean to ~5%.
#Then we'll need to increase our n.  But by how much?
#Let's write a function to help us

In [19]:
#First, give ourselves a starting point with an n=10
#Compute the t statistic for 9 degrees of freedom (n=10) at 0.05
the_t = st.t.ppf(1-0.025, 9)
#Calculate our relative error with this t statistic
starting_error = the_t*the_RSD / 10**(0.5)
starting_error

9.871925302145497

In [20]:
#Our starting error is about 10%
#The function below loops until our threshold is met
def findN(N,err,RSD):
    while err > 5:
        deg_free = N - 1
        t_stat = st.t.ppf(1-0.025, deg_free)
        err = t_stat*RSD / N**(0.5)
        N += 1
        print(f'Error:{err} -- N:{N}')
   

In [21]:
findN(10,starting_error,the_RSD)

Error:9.871925302145497 -- N:11
Error:9.270966148034265 -- N:12
Error:8.76810168512821 -- N:13
Error:8.339256538651178 -- N:14
Error:7.967882779579355 -- N:15
Error:7.642185273592054 -- N:16
Error:7.3535009321796645 -- N:17
Error:7.095305283348754 -- N:18
Error:6.862578549838739 -- N:19
Error:6.651387089000705 -- N:20
Error:6.45859880859475 -- N:21
Error:6.281684687490131 -- N:22
Error:6.118577227857963 -- N:23
Error:5.9675675006420015 -- N:24
Error:5.82722893775957 -- N:25
Error:5.696360030093337 -- N:26
Error:5.573940627650098 -- N:27
Error:5.459098183747245 -- N:28
Error:5.351081375073473 -- N:29
Error:5.249239265495394 -- N:30
Error:5.153004687261778 -- N:31
Error:5.061880866422271 -- N:32
Error:4.975430569524759 -- N:33


In [6]:
#Voila, at an N = 33, our relative error of the mean is ~5% with 95% confidence
#If you performed N = 33 measurements over and over, 95% of them would have a relative error less than or equal to 5%

3.9787559717132246