In [1]:
import numpy as np
import pandas as pd

In [2]:
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
ds_example = pd.read_csv("EXH_QC.csv", engine = "python")
df = ds_example[['Cabbage weight']].copy()

In [9]:
df.head()

Unnamed: 0,Cabbage weight
0,3.6
1,2.9
2,2.8
3,2.6
4,3.4


In [6]:
lower, upper = stats.norm.interval(0.95, loc=2.7, scale = 0.397/np.sqrt(40))

In [10]:
print(lower.round(2), upper.round(2))

2.58 2.82


In [11]:
lower, upper = stats.norm.interval(0.95, loc=df.mean(), scale = stats.sem(df))

In [12]:
print(lower.round(2), upper.round(2))

[2.57] [2.82]


In [15]:
df.mean()

Cabbage weight    2.695
dtype: float64

In [16]:
lower

array([2.57305379])

In [27]:
sample = {'Weight':[18, 18, 20, 21, 20, 23, 19, 18, 17, 21, 22, 20, 20, 21, 20, 19, 19, 18, 17, 19]}

df_potato = pd.DataFrame(sample)

In [28]:
df_potato.head()

Unnamed: 0,Weight
0,18
1,18
2,20
3,21
4,20


In [35]:
lowerPotato, upperPotato = stats.norm.interval(0.95, loc=df_potato.mean(), scale = 3.8/np.sqrt(20))

In [36]:
print(lowerPotato.round(2), upperPotato.round(2))

[17.83] [21.17]


In [39]:
lowerPotato, upperPotato = stats.norm.interval(0.95, loc=df_potato.mean(), scale =  stats.sem(df_potato))

In [40]:
print(lowerPotato.round(2), upperPotato.round(2))

[18.8] [20.2]


In [45]:
sample2 = {"strength" : [54.1, 53.3, 56.1, 55.7, 54.0, 54.1, 54.5, 57.1, 55.2, 53.8, 
                     54.1, 54.1, 56.1, 55.0, 55.9, 56.0, 54.9, 54.3, 53.9, 55.0]}

df_strength = pd.DataFrame(sample2)
lowerStrth, upperStrth = stats.t.interval(0.95, len(df)-1, loc=np.mean(df_strength), scale = stats.sem(df_strength))

In [46]:
print(lowerStrth.round(2), upperStrth.round(2))

[54.4] [55.32]


In [54]:
sample3 = {"Satisfaction": [74.5, 81.2, 73.8, 82.0, 76.3, 75.7, 80.2, 72.6, 77.9, 82.8]}
s3mean = 76.7
s3stddv = 3.66

df_satis = pd.DataFrame(sample3)

t_result = stats.ttest_1samp(df_satis, s3mean)


In [55]:
t, p = t_result.statistic.round(3),t_result.pvalue.round(3)

In [57]:
t

array([0.864])

In [58]:
p

array([0.41])

### 2-Sample t-test

In [81]:
dfA = pd.DataFrame({'A': [1.883, 1.715, 1.799, 1.768, 1.711, 1.832, 1.427, 1.344]})
dfB = pd.DataFrame({'B': [1.435, 1.572, 1.486, 1.511, 1.457, 1.548, 1.404, 1.883]})

In [87]:
t_result = stats.ttest_ind(dfA, dfB, equal_var = True)
t, p = t_result.statistic, t_result.pvalue
t_result2 = stats.ttest_ind(dfA, dfB, equal_var = False)
t2, p2 = t_result2.statistic, t_result2.pvalue

t_result = stats.ttest_rel(dfA, dfB)
t, p = t_result.statistic, t_result.pvalue
print (t, p)

In [88]:
t, p = t_result.statistic, t_result.pvalue

In [89]:
t_result2 = stats.ttest_ind(dfA, dfB, equal_var = False)

In [90]:
t2, p2 = t_result2.statistic, t_result2.pvalue

In [91]:
print (t, p)
print (t2, p2)

[1.69953848] [0.11131813]
[1.69953848] [0.11269619]


In [95]:
t_result = stats.ttest_rel(dfA, dfB)
t, p = t_result.statistic, t_result.pvalue
print (t, p)

[1.37631864] [0.21113503]


### ANOVA

In [98]:
a = {"A": [892, 623, 721, 678, 723, 790, 720, 670, 690, 771]}
b = {"B": [721, 821, 910, 678, 723, 790, 711, 790, 745, 891]}
c = {"C": [621, 915, 888, 721, 894, 834, 841, 912, 845, 889]}

In [99]:
dfANVa = pd.DataFrame(a)
dfANVb = pd.DataFrame(b)
dfANVc = pd.DataFrame(c)

In [100]:
f_resultANV = stats.f_oneway(dfANVa, dfANVb, dfANVc)

In [102]:
f, p  = f_resultANV.statistic.round(3), f_resultANV.pvalue.round(3)

In [103]:
print(f, p)

[4.263] [0.025]


### 1 proportion test

In [104]:
from statsmodels.stats.proportion import proportions_ztest

In [105]:
count = 15
nobs = 100
value = .1

In [107]:
stat, pval = proportions = proportions_ztest(count, nobs, value)

In [108]:
print (stat, pval)

1.4002800840280094 0.16142946236708322


In [109]:
count = 150
nobs = 1000
value = .1

In [110]:
stat, pval = proportions = proportions_ztest(count, nobs, value)

In [112]:
print (stat.round(3), pval.round(3))

4.428 0.0


### 2 proportion test

In [117]:
count = np.array([4, 1])
nobs = np.array([1000, 1200])

In [118]:
stat, pval = proportions = proportions_ztest(count, nobs)

In [119]:
print (stat.round(3), pval.round(3))

1.553 0.12


### chi-square test

In [141]:
s1 = {'HSG': [270, 260, 236, 234], 'SS': [228, 285, 225, 262], 'SPA': [277, 284, 231, 208]}

df_chi = pd.DataFrame(s1)

In [142]:
chi, pval, dof, expected = stats.chi2_contingency(df_chi.T)

In [152]:
chi, pval, dof, expected.round(3)

(13.36627391246374,
 0.03757449387917438,
 6,
 array([[258.333, 276.333, 230.667, 234.667],
        [258.333, 276.333, 230.667, 234.667],
        [258.333, 276.333, 230.667, 234.667]]))

## Correlation

In [153]:
df1 = [1, 2, 3, 4, 4, 5, 6, 6, 7, 8]
df2 = [23, 29, 49, 64, 74, 87, 96, 97, 109, 119]

corr, pval = stats.pearsonr(df1, df2)

In [154]:
print (corr.round(3), pval.round(3))

0.989 0.0


In [158]:
print ("Correlation Anlysis\ncorr: %.3f\npval: %.3f" %(corr, pval))

Correlation Anlysis
corr: 0.989
pval: 0.000


### simple linear regression

In [159]:
minutes = [1, 2, 3, 4, 4, 5, 6, 6, 7, 8]
units =  [23, 29, 49, 64, 74, 87, 96, 97, 109, 119]

mins = sm.add_constant(minutes)
model = sm.OLS(units, mins)
result = model.fit()

In [160]:
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.979
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     365.3
Date:                Fri, 12 Jul 2019   Prob (F-statistic):           5.82e-08
Time:                        13:51:26   Log-Likelihood:                -29.401
No. Observations:                  10   AIC:                             62.80
Df Residuals:                       8   BIC:                             63.41
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.1712      3.886      1.845      0.1

  "anyway, n=%i" % int(n))
