In [1]:
%pip install pandas scipy numpy

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting scipy
  Downloading scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting numpy
  Downloading numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m152.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m142.5 MB/s[0m eta [36m0:00:00[

In [2]:
import pandas as pd
import scipy.stats as stats
import numpy as np

In [3]:
top_cs_schools = [
    "Massachusetts Institute of Technology",
    "University of California, Berkeley",
    "Stanford University",
    "Carnegie Mellon University",
    "Princeton University"
]

In [4]:
#z-score for 95% confidence interval
z_95 = stats.norm.ppf(1-0.05/2)

In [5]:
def count_top_schools(counts):
    sum = 0
    for school in top_cs_schools:
        sum += counts.get(school, default=0)

    return sum

In [6]:
def confidence_interval(data, level):
    counts = data[level].value_counts()

    top_schools_count = count_top_schools(counts)
    n = counts.sum()
    p_hat = top_schools_count/n
    se = np.sqrt(p_hat * (1-p_hat)/n)

    me = z_95 * se

    return (p_hat-me, p_hat+me)

In [7]:
caltech_doctoral = pd.read_csv("caltech_grad.csv")

In [8]:
low, high = confidence_interval(caltech_doctoral, "doctorate")
print(f"Caltech sends {low*100:.1f}% to {high*100:.1f}% to top CS doctorate programs (95% confidence)")

low, high = confidence_interval(caltech_doctoral, "institution")
print(f"Caltech sends {low*100:.1f}% to {high*100:.1f}% to be professors at top CS schools (95% confidence)")

Caltech sends 60.7% to 88.1% to top CS doctorate programs (95% confidence)
Caltech sends 7.8% to 33.2% to be professors at top CS schools (95% confidence)


In [9]:
ut_doctoral = pd.read_csv("ut_grad.csv")

In [10]:
low, high = confidence_interval(ut_doctoral, "doctorate")
print(f"UT sends {low*100:.1f}% to {high*100:.1f}% to top CS doctorate programs (95% confidence)")

low, high = confidence_interval(ut_doctoral, "institution")
print(f"UT sends {low*100:.1f}% to {high*100:.1f}% to be professors at top CS schools (95% confidence)")

UT sends 1.2% to 26.3% to top CS doctorate programs (95% confidence)
UT sends -0.7% to 21.4% to be professors at top CS schools (95% confidence)


In [11]:
def diff_proportions(data_1, data_2, level):
    counts = data_1[level].value_counts()

    top_schools_count = count_top_schools(counts)
    n1 = counts.sum()
    p_hat1 = top_schools_count/n1

    counts = data_2[level].value_counts()

    top_schools_count = count_top_schools(counts)
    n2 = counts.sum()
    p_hat2 = top_schools_count/n2

    se = np.sqrt(p_hat1*(1-p_hat1)/n1 + p_hat2*(1-p_hat2)/n2)
    
    z = (p_hat1 - p_hat2)/se

    p = 1 - stats.norm.cdf(z)

    diff_p_hat = p_hat1 - p_hat2
    me = z_95 * se

    return (p, diff_p_hat - me, diff_p_hat + me)

In [17]:
p, low, high = diff_proportions(caltech_doctoral, ut_doctoral, "doctorate")

if p < 0.05:
    print(f"Caltech undergrads have a higher chance of going to top PhD programs compared to UT undergrads (p={p}).")
    print(f"The difference in proportions is {low * 100:.1f} to {high * 100:.1f} pp (95% confidence).")
else:
    print(f"There is not a significant difference in PhD results (p={p}).")

Caltech undergrads have a higher chance of going to top PhD programs compared to UT undergrads (p=8.398115536323303e-11).
The difference in proportions is 42.0 to 79.1 pp (95% confidence).


In [18]:
stats.t.ppf(1-0.05/2, 10)

np.float64(2.2281388519649385)