In [2]:
import pandas as pd
import numpy as np

In [3]:
dataset = "https://drive.google.com/uc?export=download&id=1bS9yGe0HH4Kj0A6zMSqNOasqCxrMHrU9"
df = pd.read_csv(dataset)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Single Sample T-Test
- Let's draw a sample from the Age population and perform single sample t-test

In [4]:
population = df['Age'].dropna()

In [16]:
population.size

714

In [17]:
population_mean = population.mean()
population_mean

np.float64(29.69911764705882)

In [18]:
sample = population.sample(25).values
sample

array([25., 28., 64., 44., 51., 20., 61., 18., 29., 50., 25., 26., 10.,
       28., 17., 29., 30., 21., 25., 52., 30., 33.,  9., 28., 28.])

In [19]:
# Null Hypothese: H0 -> The mean age is 35
# Alternative Hypothesis: H1 -> The mean is less than 35

In [20]:
# We need to check the normality of the sample
from scipy.stats import shapiro

shapiro_age = shapiro(sample)

print(shapiro_age)

ShapiroResult(statistic=np.float64(0.8982267018059926), pvalue=np.float64(0.016789059847217066))


In [24]:
# As p-value is greater than 0.05 we can interpret that sample is normally distributed
import scipy.stats as stats
pop_mean = 35
t_statistic, p_value = stats.ttest_1samp(sample, pop_mean)

print("t-statistic:", t_statistic)
print("p-value:", p_value/2)

t-statistic: -1.295709760673125
p-value: 0.10369964699054325


In [25]:
alpha = 0.05

if p_value < alpha:
    print("Reject the null hypothesis.")
else:
    print("Fail to reject the null hypothesis.")

Fail to reject the null hypothesis.


## 2 - sample t-test

In [39]:
pop_male = df[df['Sex']=='male']['Age'].dropna()
pop_female = df[df['Sex']=='female']['Age'].dropna()

In [40]:
pop_male
pop_female.size

261

In [41]:
sample_male = pop_male.sample(25)
sample_female = pop_female.sample(25)

alpha = 0.05

In [42]:
# H0 - Mean age of male and female are similar
# H1 - Mean age of male is higher than female

In [44]:
# Performaing Shapiro-Wilk Test to check the normality of the samples
from scipy.stats import shapiro

# Perform the Shapiro-Wilk test for both desktop and mobile users
shapiro_male = shapiro(sample_male)
shapiro_female = shapiro(sample_female)

print("Shapiro-Wilk test for desktop users:", shapiro_male)
print("Shapiro-Wilk test for mobile users:", shapiro_female)

Shapiro-Wilk test for desktop users: ShapiroResult(statistic=np.float64(0.968852161475046), pvalue=np.float64(0.6161128625422334))
Shapiro-Wilk test for mobile users: ShapiroResult(statistic=np.float64(0.899559054575522), pvalue=np.float64(0.01794536900979945))


In [45]:
# Performing levlene test to check equality of std of sampples
from scipy.stats import levene

# Perform Levene's test
levene_test = levene(sample_male, sample_female)
print(levene_test)

LeveneResult(statistic=np.float64(1.5794555726759112), pvalue=np.float64(0.21492070476238412))


In [47]:

import scipy.stats as stats

t_statistic, p_value = stats.ttest_ind(sample_male, sample_female)

print("t-statistic:", t_statistic)
print("p-value:", p_value/2)

t-statistic: 1.0295774841220349
p-value: 0.15418432277041977


In [48]:
alpha = 0.05

if p_value < alpha:
    print("Reject the null hypothesis.")
else:
    print("Fail to reject the null hypothesis.")

Fail to reject the null hypothesis.
