## Z test

#### Two Proportion Z - Test

In [3]:
from statsmodels.stats.proportion import proportions_ztest
import numpy as np

In [6]:
# can we assume anything from our sample
significance = 0.05

# our samples - 82% are good in one, and ~79% are good in the other
# note - the samples do not need to be the same size
sample_success_a, sample_size_a = (41, 195)
sample_success_b, sample_size_b = (351, 605)

# check our sample against Ho for Ha != Ho
successes = np.array([sample_success_a, sample_success_b])
samples = np.array([sample_size_b, sample_size_b])

In [7]:
# note, no need for a Ho value here - it's derived from the other parameters
stat, p_value = proportions_ztest(count=successes, nobs=samples,  alternative='two-sided')

In [9]:
stat,p_value

(-19.042970456737493, 7.51425891750284e-81)

In [10]:
# report
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))

if p_value > significance:
   print ("Fail to reject the null hypothesis - we have nothing else to say")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

z_stat: -19.043, p_value: 0.000
Reject the null hypothesis - suggest the alternative hypothesis is true


In [21]:
(sample_success_a/sample_size_a) - (sample_success_b/sample_size_b)

-0.3699088789997881

In [13]:
p = (sample_success_a + sample_success_b) / (sample_size_a + sample_size_b)
p

0.49

In [34]:
abs(((sample_success_a/sample_size_a) - (sample_success_b/sample_size_b))) / np.sqrt(((1/sample_size_a) + (1/sample_size_b)) * (1- p) * p)

8.985900954503084

In [26]:
(sample_success_a + sample_success_b) / (sample_size_a + sample_size_b)

0.49

In [32]:
abs((sample_success_a/sample_size_a) - (sample_success_b/sample_size_b))

0.3699088789997881

In [33]:
np.sqrt(((1/sample_size_a) + (1/sample_size_b)) * (1- p) * p)

0.041165474766825305

In [35]:
import pandas as pd

In [37]:
df = pd.read_csv('https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/007/356/original/percentiles.csv?1657807153')
df.drop(columns='Unnamed: 0', inplace=True)

In [41]:
df

Unnamed: 0,Dist1,Dist2
0,0.000483,0.014939
1,0.002950,0.041212
2,0.004308,0.051100
3,0.005412,0.058199
4,0.008340,0.074554
...,...,...
49995,99.992134,96.804505
49996,99.995023,97.306047
49997,99.996875,97.736447
49998,99.997964,98.072219


In [39]:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

In [44]:
#2-sample KS Test
tobs , p_value = stats.ks_2samp(df['Dist1'], df['Dist2'])

In [45]:
np.round(tobs,2)

0.34

Question: Suppose you're conducting a hypothesis test to determine if the mean weight of apples in a certain orchard is significantly different from 200 grams. You randomly sample 25 apples from the orchard and find that their mean weight is 195 grams with a standard deviation of 15 grams. At a significance level of 0.05, what is your conclusion regarding the null hypothesis that the mean weight of apples in the orchard is 200 grams?

In [1]:
# Ho = The mean weight of apple is same as 200 grams
# Ha = The mean weight of the apple is different from 200 grams


u = 200
x = 195
s = 15
n = 25
alpha = 0.05


In [7]:
from scipy.stats import ttest_ind
from scipy.stats import t
import numpy as np

In [5]:
t_statistic = (x - u)/(s/np.sqrt(n))

In [6]:
t_statistic

-1.6666666666666667

In [12]:
# We are multiplying it by 2 beacuse of the nature of our hypothesis. i.e Two sample t-test
p_value  = 2 * (1 - t.cdf(abs(t_statistic), df = 2 - 1))

In [32]:
if p_value < alpha:
    print(f"Reject the null Hypothesis, p-value: {np.round(p_value,2)}")
else: 
    print(f'Reject Alternate Hypotheisis, p-value: {np.round(p_value,2)}')

Reject Alternate Hypotheisis, p-value: 0.34


Question:

![Alt text](../images/Screenshot%202023-04-24%20at%206.38.57%20PM.png)

In [44]:
u = 150
x = 155
s = 8.5
n = 10


from scipy.stats import t,norm

In [40]:
t_statistic = (x-u)/(s/np.sqrt(n))
t_statistic

1.8601633295108115

In [46]:
p_value = (1 - norm.cdf(t_statistic))
p_value

0.031431210741779014

In [47]:
if p_value < alpha:
    print(f"Reject the null Hypothesis, p-value: {np.round(p_value,2)}")
else: 
    print(f'Reject Alternate Hypotheisis, p-value: {np.round(p_value,2)}')

Reject the null Hypothesis, p-value: 0.03


Question

![Alt text](../images/Screenshot%202023-04-24%20at%206.47.09%20PM.png)

In [54]:
#Ho = people in all state have the same mean height (x = u)
#Ha = people in his state are shorter than other states (x < u)

u = 65
s = 2.5
n = 20
x = 64.5
alpha = 0.05

In [49]:
from scipy.stats import norm

In [55]:
z_statistic = (x-u)/(s/np.sqrt(n))
z_statistic

-0.8944271909999159

In [56]:
p_value = norm.cdf(z_statistic)
p_value

0.18554668476134878

In [57]:
if p_value < alpha:
    print(f"Reject the null Hypothesis, p-value: {np.round(p_value,2)}")
else: 
    print(f'Reject Alternate Hypotheisis, p-value: {np.round(p_value,2)}')

Reject Alternate Hypotheisis, p-value: 0.19


### Question

Suppose you're testing the effectiveness of a new weight loss pill. You randomly divide 100 participants into two groups - the first group receives the pill, and the second group receives a placebo. After 30 days, you record the weight loss (in pounds) for each participant. The mean weight loss in the pill group is 6 pounds with a standard deviation of 2.5 pounds, while the mean weight loss in the placebo group is 3 pounds with a standard deviation of 1.5 pounds. At a significance level of 0.01, can you conclude that the weight loss pill is effective?

In [58]:
from scipy.stats import ttest_ind

In [72]:
# State the Null and Alternate Hypotehsis 

#Ho = There is no difference between the mean weights in two groups
#Ha = There is some difference between the mean weights in two groups

n1 = 100
n2 = 100
x1 = 6
x2 = 3
s1 = 2.5
s2 = 1.5


# Calculate the test statistic
SE = np.sqrt((s1**2/n1) + (s2**2/n2))
t_stat = (x1 - x2) / SE

t_stat

10.289915108550531

In [74]:
# Calculate the p-value
p_value = 2 * (1 - t.cdf(abs(t_stat), df=n1+n2-2))

# Define the significance level
alpha = 0.01

if p_value < alpha:
    print(f"Reject the null Hypothesis, p-value: {np.round(p_value,2)}")
else: 
    print(f'Reject Alternate Hypotheisis, p-value: {np.round(p_value,2)}')

Reject the null Hypothesis, p-value: 0.0


### Question.


A factory produces light bulbs, and the company claims that the average lifespan of its bulbs is at least 2000 hours. To test this claim, a sample of 50 bulbs is randomly selected, and the sample mean lifespan is found to be 1950 hours with a standard deviation of 150 hours. Using a significance level of 0.05, can you reject the company's claim?

In [75]:
u = 2000
n = 50
x = 1950
s = 150

alpha = 0.05

In [None]:
s