In [58]:
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_1samp

# Fetch seaborn's example Titanic passenger table as a DataFrame.
titanic = sns.load_dataset('titanic')

# Plain-text preview of the first rows.
first_rows = titanic.head()
print(first_rows)


   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [60]:
# Rich table display of the first rows (same data as the print() preview above).
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [62]:
# (rows, columns) of the loaded table.
titanic.shape

(891, 15)

In [64]:
# Column labels available in the dataset.
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [66]:
# Keep only the passengers whose age is recorded.
ages = titanic['age'].dropna()

# Summarise the sample: its mean and how many observations remain.
mean_age = ages.mean()
sample_size = len(ages)
print(f"Mean Age: {mean_age:.2f}")
print(f"Sample Size: {sample_size}")


Mean Age: 29.70
Sample Size: 714


In [67]:
# Unrounded mean of the non-missing ages.
ages.mean()

29.69911764705882

In [68]:
# Mean taken straight from the column; the identical output shows missing ages are skipped here too.
titanic.age.mean()

29.69911764705882

In [69]:
# Number of non-missing ages; matches the sample size printed earlier.
titanic.age.count()

714

In [70]:
# One-sample t-test: does the mean passenger age differ from the reference value?
hypothesized_age = 60
t_stat, p_value = ttest_1samp(ages, hypothesized_age)

# Report the statistic and p-value to three decimals.
print("One-Sample T-Test:")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.3f}")


One-Sample T-Test:
T-Statistic: -55.737
P-Value: 0.000


### Two-Sample t-Test

In [74]:
import seaborn as sns
from scipy.stats import ttest_ind

# Reload the data and keep only rows with a recorded age.
titanic = sns.load_dataset('titanic').dropna(subset=['age'])

# Split the ages by survival outcome.
is_survivor = titanic['survived'] == 1
is_non_survivor = titanic['survived'] == 0
survived_ages = titanic.loc[is_survivor, 'age']
not_survived_ages = titanic.loc[is_non_survivor, 'age']

# Independent two-sample t-test comparing the two groups' mean ages.
t_stat, p_value = ttest_ind(survived_ages, not_survived_ages)

# Report the statistic and p-value to three decimals.
print("Two-Sample T-Test on Titanic Dataset:")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.3f}")


Two-Sample T-Test on Titanic Dataset:
T-Statistic: -2.067
P-Value: 0.039


Null Hypothesis (H₀): The mean age of survivors is equal to the mean age of non-survivors.
Alternative Hypothesis (H₁): The mean age of survivors is different from the mean age of non-survivors.
If p-value < 0.05, reject H₀ and conclude a significant difference between the two groups.


### Paired t-Test

In [78]:
import seaborn as sns
from scipy.stats import ttest_rel

# Load Titanic dataset
titanic = sns.load_dataset('titanic')

# Filter passengers who embarked at 'C' and drop missing 'fare'
cherbourg_passengers = titanic[titanic['embarked'] == 'C'].dropna(subset=['fare'])

# Multiplier applied to every fare to fabricate the "after" sample.
# Named so the stated percentage cannot drift away from the value actually used.
FARE_MULTIPLIER = 1.02  # 2% price increase

# Simulate "before" and "after" fare data
before_fares = cherbourg_passengers['fare']
after_fares = before_fares * FARE_MULTIPLIER

# Perform a paired t-test.
# NOTE: every paired difference equals before * (1 - FARE_MULTIPLIER), so the
# t-statistic reduces to -mean(before) / (std(before) / sqrt(n)) for any
# multiplier above 1. The size of the increase does not affect the result.
t_stat, p_value = ttest_rel(before_fares, after_fares)

# Print results
print("Paired T-Test on Fares (Before vs After):")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.3f}")


Paired T-Test on Fares (Before vs After):
T-Statistic: -9.261
P-Value: 0.000


Null Hypothesis (H₀): The mean of "before" fares is equal to the mean of "after" fares.
Alternative Hypothesis (H₁): The mean of "before" fares is not equal to the mean of "after" fares.
If p-value < 0.05, reject H₀, meaning there is a significant difference between the two means.


In [80]:
import seaborn as sns
from scipy.stats import ttest_rel

# Load Titanic dataset
titanic = sns.load_dataset('titanic')

# Filter passengers who embarked at 'C' and drop missing 'fare'
cherbourg_passengers = titanic[titanic['embarked'] == 'C'].dropna(subset=['fare'])

# Multiplier for a much smaller simulated change.
# Named so the stated percentage cannot drift away from the value actually used.
FARE_MULTIPLIER = 1.000002  # 0.0002% price increase

# Simulate "before" and "after" fare data
before_fares = cherbourg_passengers['fare']
after_fares = before_fares * FARE_MULTIPLIER

# Perform a paired t-test.
# NOTE: every paired difference equals before * (1 - FARE_MULTIPLIER), so the
# t-statistic reduces to -mean(before) / (std(before) / sqrt(n)) for any
# multiplier above 1. That is why even this tiny change reproduces the
# t-statistic of the previous cell.
t_stat, p_value = ttest_rel(before_fares, after_fares)

# Print results
print("Paired T-Test on Fares (Before vs After):")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.3f}")


Paired T-Test on Fares (Before vs After):
T-Statistic: -9.261
P-Value: 0.000


In [81]:
# Mean fare before the simulated increase.
before_fares.mean()


59.95414404761905

In [82]:
# Mean fare after the simulated increase.
after_fares.mean()

59.954263955907145

In [83]:
before_fares.describe(), after_fares.describe()  # Summary statistics for both the before and the after fares


(count    168.000000
 mean      59.954144
 std       83.912994
 min        4.012500
 25%       13.697950
 50%       29.700000
 75%       78.500025
 max      512.329200
 Name: fare, dtype: float64,
 count    168.000000
 mean      59.954264
 std       83.913162
 min        4.012508
 25%       13.697977
 50%       29.700059
 75%       78.500182
 max      512.330225
 Name: fare, dtype: float64)

In [85]:
import numpy as np
from scipy.stats import ttest_1samp

# Reference mean under H0 and the observed measurements.
hypothesized_mean = 500
sample_data = [480, 490, 475, 495, 485, 470, 460, 490, 500, 470]

# Significance level for interpreting the p-value.
alpha = 0.05

# One-sample t-test computed by SciPy.
t_stat, p_val = ttest_1samp(sample_data, hypothesized_mean)

(t_stat, p_val)

(-4.605055965729875, 0.0012812753428723797)

In [86]:
import math
from scipy.stats import t

hypothesized_mean = 500
sample_data = [480, 490, 475, 495, 485, 470, 460, 490, 500, 470]
n = len(sample_data)
dof = n - 1

# Sample mean and unbiased (n - 1) standard deviation.
sample_mean = sum(sample_data) / n
squared_deviations = [(value - sample_mean) ** 2 for value in sample_data]
sample_variance = sum(squared_deviations) / dof
sample_sd = math.sqrt(sample_variance)

# t = (sample mean - hypothesized mean) / standard error of the mean.
standard_error = sample_sd / math.sqrt(n)
t_stat = (sample_mean - hypothesized_mean) / standard_error

# Two-sided p-value: twice the lower-tail area at -|t|.
p_value = 2 * t.cdf(-abs(t_stat), dof)

alpha = 0.05

p_value

0.0012812753428723797

In [87]:
import math
from scipy.stats import ttest_ind # Runs the whole two-sample t-test (statistic and p-value)

# Sample data: one list of scores per class
class_a_scores = [85, 90, 78, 92, 88, 76, 95, 89]
class_b_scores = [80, 85, 83, 87, 82, 84, 81, 79]

In [88]:
# SciPy's two-sample t-test; its output matches the pooled-variance hand calculation further down.
t_stat, p_val = ttest_ind(class_a_scores,class_b_scores)
t_stat,p_val

(1.5825745726907101, 0.13584091935511972)

In [89]:
import math
from scipy.stats import t

# Same two score lists as in the earlier SciPy cell, repeated ahead of the manual calculation.
class_a_scores = [85, 90, 78, 92, 88, 76, 95, 89]
class_b_scores = [80, 85, 83, 87, 82, 84, 81, 79]

In [90]:
# Group sizes and means.
n1, n2 = len(class_a_scores), len(class_b_scores)
mean1 = sum(class_a_scores) / n1
mean2 = sum(class_b_scores) / n2

# Unbiased (n - 1) sample variance of each group.
var1 = sum([(score - mean1) ** 2 for score in class_a_scores]) / (n1 - 1)
var2 = sum([(score - mean2) ** 2 for score in class_b_scores]) / (n2 - 1)

# Degrees of freedom shared by the pooled variance and the t distribution.
df = n1 + n2 - 2

# Pooled variance: the group variances weighted by their degrees of freedom.
p_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / df

# t = difference in means / standard error of that difference.
standard_error = math.sqrt(p_var * (1 / n1 + 1 / n2))
t_stat = (mean1 - mean2) / standard_error

# Two-sided p-value: twice the lower-tail area at -|t|.
p_val = 2 * t.cdf(-abs(t_stat), df)

(t_stat, p_val)

(1.5825745726907101, 0.13584091935511972)

In [91]:
# Group means that form the numerator of the t-statistic.
mean1, mean2

(86.625, 82.625)

In [92]:
# Per-group sample variances.
# NOTE(review): they differ considerably (see output) while the pooled calculation treats them as equal; consider Welch's t-test.
var1, var2

(43.982142857142854, 7.125)

In [93]:
# Pooled variance used in the denominator of the t-statistic.
p_var

25.553571428571427

In [174]:
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency

data = sns.load_dataset('titanic')

# Contingency table of counts: one row per sex, one column per survival outcome.
c_table = pd.crosstab(index=data['sex'], columns=data['survived'])

# Chi-square test of independence on the table.
# NOTE(review): SciPy applies Yates' continuity correction to 2x2 tables by default; confirm that is intended.
chi2, p, dof, expected = chi2_contingency(c_table)
(chi2, p, dof, expected)

(260.71702016732104,
 1.1973570627755645e-58,
 1,
 array([[193.47474747, 120.52525253],
        [355.52525253, 221.47474747]]))

In [182]:
import numpy as np
from scipy.stats import f

# Three independent samples compared with a one-way ANOVA computed by hand.
group1 = [23, 21, 18, 25, 27]
group2 = [31, 35, 32, 36, 33]
group3 = [42, 40, 45, 41, 39]
groups = [group1, group2, group3]

all_data = group1+group2+group3
grand_mean = np.mean(all_data)

mean1 = np.mean(group1)
mean2 = np.mean(group2)
mean3 = np.mean(group3)

n1, n2, n3 = len(group1), len(group2), len(group3)

# Between-group sum of squares: spread of the group means around the grand mean.
SSB = n1 * (mean1 - grand_mean)**2 + n2 * (mean2 - grand_mean)**2 + n3 * (mean3 - grand_mean)**2

# Within-group sum of squares: spread of the observations around their own group mean.
SSW = sum((x - mean1)**2 for x in group1) + sum((x - mean2)**2 for x in group2) + sum((x - mean3)**2 for x in group3)

# Degrees of freedom derived from the data instead of a hard-coded group count.
df_btw = len(groups) - 1
df_with = len(all_data) - len(groups)

MSB = SSB/df_btw
MSW = SSW/df_with

F_stat = MSB / MSW

# Upper-tail probability via the survival function. `1 - f.cdf(...)` subtracts
# two nearly equal numbers when the p-value is tiny and loses precision.
p_val = f.sf(F_stat, df_btw, df_with)

F_stat, p_val

(59.89908256880733, 5.696804179500248e-07)

In [184]:
from scipy.stats import f_oneway

# Cross-check the manual ANOVA with SciPy's implementation.
# The results get their own names: binding the statistic to `f` would overwrite
# the scipy.stats.f distribution imported by the manual ANOVA cell, so any later
# f.cdf / f.sf call in the session would fail.
f_scipy, p_scipy = f_oneway(group1, group2, group3)

f_scipy, p_scipy

(59.89908256880731, 5.696804178985606e-07)