In [1]:
import pandas as pd
import numpy as np
import random 
 
 
# Build a synthetic sample used by every test below.
# Both random sources are seeded so that "Restart Kernel -> Run All"
# regenerates exactly the same frame (and therefore the same p-values).
SEED = 42
N_ROWS = 1000
random.seed(SEED)
rng = np.random.default_rng(SEED)

# Two categorical columns, drawn independently of each other with the given weights.
df = pd.DataFrame({
    'Gender': random.choices(["M", 'F'], weights=(0.4, 0.6), k=N_ROWS),
    'Age_Group': random.choices(["18-35", '35-45', '45-80'], weights=(0.2, 0.5, 0.3), k=N_ROWS),
})

# Weight and Height are normal draws whose mean depends on Gender only.
# A full-length array is drawn for each gender; np.where then picks per row.
is_female = df['Gender'] == "F"
df['Weight'] = np.where(is_female,
                        rng.normal(loc=55, scale=5, size=N_ROWS),
                        rng.normal(loc=70, scale=5, size=N_ROWS))
df['Height'] = np.where(is_female,
                        rng.normal(loc=160, scale=5, size=N_ROWS),
                        rng.normal(loc=172, scale=5, size=N_ROWS))

# NOTE: astype(int) truncates the fractional part; it does not round.
df['Weight'] = df['Weight'].astype(int)
df['Height'] = df['Height'].astype(int)

df.head()

Unnamed: 0,Gender,Age_Group,Weight,Height
0,F,35-45,63,151
1,F,35-45,59,158
2,M,35-45,59,169
3,M,45-80,70,170
4,F,45-80,56,161


In [5]:
# Sanity check: display the (rows, columns) dimensions of the generated frame.
df.shape

(1000, 4)

In [20]:
# Dimensions of the subset of rows that are both in the 18-35 age group and female.
df.query('Age_Group == "18-35" and Gender == "F"').shape

(145, 4)

In [33]:
# Number of Height observations for rows whose Gender is "M".
df.loc[df['Gender'] == "M", 'Height'].shape

(386,)

In [32]:
# Number of Height observations for rows whose Gender is "F".
df.loc[df['Gender'] == "F", 'Height'].shape

(614,)

### To check whether the values of a single categorical variable occur in different proportions, we use a one-proportion Z-test.

In [42]:
# Test About One Categorical Variable
# Sample Question: Is there a difference in the number of men and women in the population?

from statsmodels.stats.proportion import proportions_ztest

# Number of "successes": rows whose Gender is "F".
count = int((df['Gender'] == "F").sum())
# Number of trials: taken from the frame itself so that it can never drift
# out of sync with `count` if the sample size is changed.
nobs = len(df)
# Proportion under the null hypothesis: proportion of men = proportion of women = 0.5.
value = 0.5

# alternative='two-sided' because we are checking Pm != Pw.
# For Pw > Pm set it to "larger", and for Pw < Pm set it to "smaller"
# (the tested proportion is the female one, since `count` counts females).

stat, pval = proportions_ztest(count, nobs, value, alternative='two-sided')

print("p_value: ",round(pval,3))

p_value:  0.0


### Test About Two Categorical Variables
### Sample Question: Does the proportion of males and females differ across age groups?


If we want to check whether two categorical variables are independent, we use the Chi-Squared test of independence.

In [52]:
from scipy.stats import chi2_contingency

# A chi-squared test of independence works on a contingency table:
# one row per Gender, one column per Age_Group, each cell holding a row count.
contigency = pd.crosstab(index=df['Gender'], columns=df['Age_Group'])
contigency

Age_Group,18-35,35-45,45-80
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,145,278,191
M,78,199,109


In [22]:
# Row totals of the contingency table, i.e. the number of rows per Gender.
contigency.sum(axis="columns")

Gender
F    614
M    386
dtype: int64

In [21]:
# Chi-squared test of independence on the Gender x Age_Group table.
# The result unpacks into: test statistic, p-value, degrees of freedom,
# and the table of expected frequencies.
chi2_result = chi2_contingency(contigency)
c, p, dof, expected = chi2_result

print("p_value: ", round(p, 3))

p_value:  0.146


### Test About one Categorical and one Numeric Variable

### Sample Question: Is there a difference in height between men and women?


In [40]:

from scipy.stats import ttest_ind

# Split the Height column into the two independent samples being compared.
male_heights = df.loc[df['Gender'] == "M", 'Height']
female_heights = df.loc[df['Gender'] == "F", 'Height']

# ttest_ind is called with its defaults, which gives a two-sided p-value.
# Halving it gives the one-sided p-value only when t_stat has the sign
# predicted by the one-sided alternative.
t_stat, p = ttest_ind(male_heights, female_heights)

print("p_value: ", round(p, 3))

p_value:  0.0


### Test About one Categorical Variable with more than two unique values and one Numeric Variable
### Sample Question: Is there a difference in height between age groups?


In [51]:
import scipy.stats as stats

# One Height sample per age group, in the same fixed label order.
age_group_labels = ["18-35", "35-45", "45-80"]
heights_by_age_group = [
    df.loc[df['Age_Group'] == label, 'Height'] for label in age_group_labels
]

# f_oneway takes each group as a separate positional argument and returns
# the ANOVA F statistic together with its p-value.
fvalue, pvalue = stats.f_oneway(*heights_by_age_group)

print("p_value: ", round(pvalue, 3))

p_value:  0.616


### Test About Two Numeric Variables
### Sample Question: Is there a relationship between height and weight?


In [47]:
import scipy.stats as stats

# Pearson correlation between the two numeric columns; the result unpacks
# into the correlation coefficient and the p-value of the test.
weights, heights = df["Weight"], df["Height"]
pearson_result = stats.pearsonr(weights, heights)
pearson_coef, p_value = pearson_result

print("Pearson Correlation Coefficient: ", pearson_coef, "and a P-value of:", round(p_value, 3))

Pearson Correlation Coefficient:  0.6115254575404476 and a P-value of: 0.0
