In [22]:
%pip install pandas scipy numpy

Note: you may need to restart the kernel to use updated packages.


In [24]:
import pandas as pd
import scipy.stats as stats
import numpy as np

In [14]:
top_cs_schools = [
    "Massachusetts Institute of Technology",
    "University of California, Berkeley",
    "Stanford University",
    "Carnegie Mellon University",
    "Princeton University"
]

In [26]:
#z-score for 95% confidence interval
z_95 = stats.norm.ppf(1-0.05/2)

In [75]:
def count_top_schools(counts):
    sum = 0
    for school in top_cs_schools:
        sum += counts.get(school, default=0)

    return sum

In [74]:
def confidence_interval(data, level):
    counts = data[level].value_counts()

    top_schools_count = count_top_schools(counts)
    n = counts.sum()
    p_hat = top_schools_count/n
    se = np.sqrt(p_hat * (1-p_hat)/n)

    me = z_95 * se

    return (p_hat-me, p_hat+me)

In [36]:
caltech_doctoral = pd.read_csv("caltech_grad.csv")

In [76]:
low, high = confidence_interval(caltech_doctoral, "doctorate")
print(f"Caltech sends {low*100:.1f}% to {high*100:.1f}% to top CS doctorate programs (95% confidence)")

low, high = confidence_interval(caltech_doctoral, "institution")
print(f"Caltech sends {low*100:.1f}% to {high*100:.1f}% to be professors at top CS schools (95% confidence)")

Caltech sends 60.7% to 88.1% to top CS doctorate programs (95% confidence)
Caltech sends 7.8% to 33.2% to be professors at top CS schools (95% confidence)


In [40]:
ut_doctoral = pd.read_csv("ut_grad.csv")

In [77]:
low, high = confidence_interval(ut_doctoral, "doctorate")
print(f"UT sends {low*100:.1f}% to {high*100:.1f}% to top CS doctorate programs (95% confidence)")

low, high = confidence_interval(ut_doctoral, "institution")
print(f"UT sends {low*100:.1f}% to {high*100:.1f}% to be professors at top CS schools (95% confidence)")

UT sends 1.2% to 26.3% to top CS doctorate programs (95% confidence)
UT sends -0.7% to 21.4% to be professors at top CS schools (95% confidence)


In [80]:
def p_value(data_1, data_2, level):
    counts = data_1[level].value_counts()

    top_schools_count = count_top_schools(counts)
    n1 = counts.sum()
    p_hat1 = top_schools_count/n1

    counts = data_2[level].value_counts()

    top_schools_count = count_top_schools(counts)
    n2 = counts.sum()
    p_hat2 = top_schools_count/n2

    se = np.sqrt(p_hat1*(1-p_hat1)/n1 + p_hat2*(1-p_hat2)/n2)
    
    z = (p_hat1 - p_hat2)/se

    p = 1 - stats.norm.cdf(z)

    return p

In [85]:
p = p_value(caltech_doctoral, ut_doctoral, "doctorate")

if p < 0.05:
    print(f"Caltech undergrads have a higher chance of going to top PhD programs compared to UT undergrads (p={p}).")
else:
    print(f"There is not a significant difference in PhD results (p={p}).")

Caltech undergrads have a higher chance of going to top PhD programs compared to UT undergrads (p=8.398115536323303e-11).
