In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline
import seaborn as sns

In [1]:
#A purchase manager wants to check the diameter of aluminum die cast sourced from external supplier. The required diameter is 33 and is normally distributed. A sample was taken and is observed to have
#𝑛=8
#𝑥 ̅ =31.5
#𝑠=1.3
 #Should the purchase manager accept the lot at 90% confidence?


In [6]:
n = 8
xbar = 31.5
s = 1.3
cl = 0.9
mu = 33

In [5]:
# one sample t test
# S1: h1: mu != 33 , H0:  mu = 30 (two tailed test)
# S2: alpha = 1 - 0.9
# S3: T Stat = (xbar - mu)/se

In [12]:
se = s/np.sqrt(n)
t_stat = (xbar - mu)/se
t_crit = stats.t.ppf((1-cl)/2,n-1)
print(se,t_stat,t_crit)

0.4596194077712559 -3.263569759322527 -1.8945786050613054


In [15]:
pval = stats.t.cdf(t_stat,n-1)
ll = xbar - t_crit*se
ul = xbar + t_crit*se
print(ll,ul,pval)

32.37078509643437 30.629214903565632 0.006898110567187781


### You are given the daily sugar intake of 11 diabetic patients in the following Python code.

### Is there any evidence to the claim that the average daily sugar intake of the diabetic patients is 7600 mg.

In [24]:
daily_intake = np.array([5560, 5770, 7640, 5180, 5690, 6435, 6803, 7689, 6876, 8213, 8765])
cl = 0.95
mu = 7600
n = len(daily_intake)
s = daily_intake.std(ddof=1)
xbar = daily_intake.mean()
se = s/np.sqrt(n)
t_stat = (xbar - mu)/se
t_crit = stats.t.isf((1-cl)/2,n-1)
pvalue = stats.t.cdf(t_stat,n-1) * 2
print(se,t_stat,t_crit,pvalue)

356.04918176385513 -2.2925841964555027 2.2281388519649385 0.04481624562570511


In [25]:
ll = xbar - t_crit*se
ul = xbar + t_crit*se
print(ll,ul)

5990.400257628901 7577.054287825645


In [26]:
stats.ttest_1samp(daily_intake,mu)

Ttest_1sampResult(statistic=-2.2925841964555027, pvalue=0.04481624562570511)

In [27]:
#Check whether the difference in monthly salary is at least 5000 or more for Management trainees with MBA  from Premier Institutes.

In [5]:
n1 = 120
n2 = 45
x1bar = 67500
x2bar = 58950
sigma1 = 7200
sigma2 = 4600
cl = 0.95

In [6]:
# S1: H1: mu1 - mu2 > 5000 , H0: mu1-mu2<=5000

In [7]:
# two sample z test
se = np.sqrt(((sigma1)**2/n1)+((sigma2)**2/n2))
alpha = 0.05
z_stat = ((x1bar - x2bar) - 5000)/se
z_crit = stats.norm.isf(alpha)
print(z_stat,z_crit)

3.737417313779651 1.6448536269514729


In [8]:
pvalue = stats.norm.sf(z_stat)
pvalue

9.29601013157504e-05

In [9]:
ll = (x1bar-x2bar) - stats.norm.isf(alpha/2)*se
ul = (x1bar-x2bar) + stats.norm.isf(alpha/2)*se
print(ll,ul)

6688.320781181191 10411.67921881881


In [None]:
stats.norm.interval()

In [43]:
#Here weight of 25 people were recorded before they had a new therapy and then again 6 months later. Check if new therapy leads to a change in weight.

In [47]:
Weight_Female       =  [ 53.8, 54.4, 51.2, 52.5, 61.0, 50.6, 51.6, 70.0]
Weight_Male         =  [ 72.5, 80.3, 71.3, 67.7, 66.2, 73.4, 61.3, 76.8]

In [50]:
# S1: H1: mu_male - mu_female != 0 , H0: mu_male - mu_female = 0
# S2: alpha = 0.05

In [51]:
wr_m = np.array(Weight_Male)
wr_f = np.array(Weight_Female)

Assumptions in two samples independent test (parametric):
1. test of normalty ( shapiro test )
    H0: data = normal , H1: data != normal
    if pvalue is less than 0.05(alpha), data is not normal
2. test of variance equality ( Levene Test 
    H0: pop1_var = pop_var2 , pop1_var != pop_var2
    if pvalue is more than 0.05(alpha), variances of populations are equal

In [52]:
stats.shapiro(wr_m)

ShapiroResult(statistic=0.989508330821991, pvalue=0.9942821264266968)

In [53]:
stats.shapiro(wr_f)

ShapiroResult(statistic=0.7623502612113953, pvalue=0.011174780316650867)

In [54]:
## wr_m follows normal distribution but wr-f not follow normal distribution
## Hence we are not supposed to use parametric test. we have to use non parametric.

In [56]:
## test of variance equality
stats.levene(wr_f,wr_m)

LeveneResult(statistic=0.028395091071867308, pvalue=0.8685935536098155)

In [58]:
## population variance is equal
## assuming wr_f following normal

In [62]:
cl = 0.95
xm = wr_m.mean()
xf = wr_f.mean()
sm = wr_m.std(ddof=1)
sf = wr_f.std(ddof=1)
nm = len(wr_m)
nf = len(wr_f)
semf = np.sqrt((sm**2/nm) + (sf**2/nf))
tstatmf = ((xm-xf) - 0)/semf
tcritmf = stats.t.isf((1-cl)/2,nm+nf-2)
print(tstatmf,tcritmf)

4.886344172533443 2.1447866879169273


In [65]:
pvalue = stats.t.sf(tstatmf,nm+nf-2) * 2
pvalue

0.00024034957515992842

In [66]:
stats.ttest_ind(wr_m,wr_f)

Ttest_indResult(statistic=4.886344172533444, pvalue=0.00024034957515992796)