In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

# Column used to split the rows into a high-GDP and a low-GDP group.
GDP_COLUMN = 'GDP per Capita PPP Interpolated'


def welch_degrees_of_freedom(var_a, n_a, var_b, n_b):
    """Return the Welch-Satterthwaite degrees of freedom for two samples.

    Args:
        var_a: Sample variance (ddof=1) of the first group.
        n_a: Number of observations in the first group.
        var_b: Sample variance (ddof=1) of the second group.
        n_b: Number of observations in the second group.

    Returns:
        The approximate degrees of freedom of the unequal-variance
        (Welch) t-statistic.

    Raises:
        ValueError: If either group has fewer than two observations.
            Without this check the result is NaN, and a later
            ``p < alpha`` comparison silently evaluates to False.
    """
    if n_a < 2 or n_b < 2:
        raise ValueError(
            "Welch's t-test needs at least two observations per group "
            f"(got n_a={n_a}, n_b={n_b})."
        )
    # Squared standard error contributed by each group.
    se_a = var_a / n_a
    se_b = var_b / n_b
    return (se_a + se_b)**2 / (se_a**2/(n_a - 1) + se_b**2/(n_b - 1))


# The guard keeps the file importable without reading the CSV. When run as a
# script or notebook cell the names below are still module-level globals.
if __name__ == "__main__":
    data = pd.read_csv('PISA_Socio_Econ_combined_ver02.csv')

    median_gdp = data[GDP_COLUMN].median()

    # Rows with a missing GDP value satisfy neither comparison, so they are
    # left out of both groups.
    high_gdp = data[data[GDP_COLUMN] > median_gdp].copy()
    low_gdp = data[data[GDP_COLUMN] <= median_gdp].copy()

    # Gender gap = girls' reading score minus boys' reading score.
    high_gdp['gender_gap'] = high_gdp['Reading_Girls'] - high_gdp['Reading_Boys']
    low_gdp['gender_gap'] = low_gdp['Reading_Girls'] - low_gdp['Reading_Boys']

    # Drop missing gaps once and reuse the cleaned samples everywhere below.
    high_gap = high_gdp['gender_gap'].dropna()
    low_gap = low_gdp['gender_gap'].dropna()

    n1 = len(high_gap)
    n2 = len(low_gap)

    s1 = np.var(high_gap, ddof=1)
    s2 = np.var(low_gap, ddof=1)

    # Computed before anything is printed so that too-small groups abort the
    # analysis instead of reporting NaN statistics.
    df = welch_degrees_of_freedom(s1, n1, s2, n2)

    # equal_var=False selects Welch's unequal-variance t-test.
    t_stat, p_value_two_tailed = stats.ttest_ind(
        high_gap,
        low_gap,
        equal_var=False
    )

    print("Gender Gap t-statistic:", t_stat)
    print("Two-tailed p-value from t-test:", p_value_two_tailed)
    print("Degrees of Freedom:", df)

    alpha = 0.05

    # Two-sided rejection region: alpha / 2 in each tail.
    t_crit_low = stats.t.ppf(alpha / 2, df)
    t_crit_high = stats.t.ppf(1 - alpha / 2, df)
    print(f"Critical t-values at alpha = {alpha}:", t_crit_low, t_crit_high)

    # Manual two-tailed p-value from the t distribution, as a cross-check of
    # the value returned by ttest_ind.
    p_value_manual = 2 * stats.t.sf(np.abs(t_stat), df)
    print("Two-tailed p-value:", p_value_manual)

    if p_value_manual < alpha:
        print("Reject the null hypothesis: There is a significant difference in gender gap between high and low GDP countries.")
    else:
        print("Fail to reject the null hypothesis: There is no significant difference in gender gap between high and low GDP countries.")



"""
Output:
Gender Gap t-statistic: -0.7624633757860872
Two-tailed p-value from t-test: 0.4464296422109275
Degrees of Freedom: 278.1246710650789
Critical t-values at alpha = 0.05: -1.9685301166200546 1.9685301166200542
Two-tailed p-value: 0.4464296422109275
Fail to reject the null hypothesis: There is no significant difference in gender gap between high and low GDP countries.
"""

