In [1]:
from statsmodels.stats.proportion import proportion_confint
import pandas as pd

from bokeh.io import output_notebook
from bokeh.plotting import figure, show

def _format_float(value):
    """Render a float with exactly five decimal places for DataFrame display."""
    return '%.5f' % value


# Route bokeh output to notebook cells so that later show() calls render inline.
output_notebook()

# pandas display settings: fixed five-decimal floats, and up to 1000 rows shown
# before a displayed frame is truncated.
pd.set_option('display.float_format', _format_float)
pd.set_option("display.max_rows", 1000)

In [2]:
# Load the recent-graduates table from a CSV path relative to the working directory.
# Later cells read its ShareWomen, Sample_size, Major_category and Major columns.
df = pd.read_csv("recent-grads.csv")

In [3]:
# Confidence interval (alpha=0.01) for the share of women in each major.
# The table only carries the share, so the number of women in each sample is
# reconstructed as ShareWomen * Sample_size before being passed as `count`.
lower, upper = proportion_confint(count=df.ShareWomen * df.Sample_size, nobs=df.Sample_size,
                                  alpha=0.01, method="jeffrey")

# DataFrame.from_items was removed in pandas 1.0; build the frame from a dict and
# pin the column order explicitly so it does not depend on dict ordering.
ci = pd.DataFrame({"lower": lower,
                   "point": df.ShareWomen,
                   "upper": upper,
                   "sample_size": df.Sample_size},
                  columns=["lower", "point", "upper", "sample_size"]) \
       .set_index(pd.MultiIndex.from_arrays([df.Major_category, df.Major],
                                            names=["category", "major"])) \
       .sort_values("point").sort_index(level="category", sort_remaining=False)
# Rows are sorted by point estimate first and then regrouped by category with
# sort_remaining=False, so majors are not re-sorted alphabetically within a category.
ci

Unnamed: 0_level_0,Unnamed: 1_level_0,lower,point,upper,sample_size
category,major,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Agriculture & Natural Resources,FOOD SCIENCE,0.08514,0.2227,0.4284,36
Agriculture & Natural Resources,GENERAL AGRICULTURE,0.41389,0.51554,0.61627,158
Agriculture & Natural Resources,NATURAL RESOURCES MANAGEMENT,0.46034,0.56464,0.66496,152
Agriculture & Natural Resources,AGRICULTURAL ECONOMICS,0.39736,0.58971,0.76373,44
Agriculture & Natural Resources,AGRICULTURE PRODUCTION AND MANAGEMENT,0.51655,0.59421,0.66861,273
Agriculture & Natural Resources,PLANT SCIENCE AND AGRONOMY,0.48441,0.60689,0.72033,110
Agriculture & Natural Resources,FORESTRY,0.5066,0.69037,0.83834,48
Agriculture & Natural Resources,MISCELLANEOUS AGRICULTURE,0.45997,0.71997,0.90151,24
Agriculture & Natural Resources,SOIL SCIENCE,0.18548,0.76443,0.99195,4
Agriculture & Natural Resources,ANIMAL SCIENCES,0.8574,0.91093,0.94933,255


In [11]:
# Plot the confidence intervals of the 30 majors with the largest sample sizes.
max_label_len = 30  # longest y-axis label allowed before it is shortened with "..."

# - Names that already fit in max_label_len are kept whole; longer ones are cut so
#   that the shortened label (including the ellipsis) is exactly max_label_len long.
# - .iloc[-30:] takes the last 30 rows after the ascending sort on sample_size
#   (the removed `.ix` indexer did the same positional slice on this string index).
plot_df = ci \
            .assign(name=lambda frame: [name if len(name) <= max_label_len
                                        else name[:max_label_len - 3] + "..."
                                        for name in frame.index.get_level_values(1)]) \
            .set_index("name") \
            .sort_values("sample_size").iloc[-30:] \
            .sort_values("point")

# One horizontal segment per major: x spans [lower, upper], y repeats the major's label.
intervals = [([lo, hi], [label, label])
             for label, lo, hi in zip(plot_df.index, plot_df["lower"], plot_df["upper"])]
xs, ys = tuple(zip(*intervals))

p = figure(y_range=list(plot_df.index), title="Gender Ratio for 30 Most Common Majors")
p.xaxis.axis_label = "Gender Ratio (% Female)"
p.multi_line(xs, ys, line_width=5, alpha=0.8)
# Mark the point estimate on top of each interval.
p.circle(plot_df.point, plot_df.index, size=6, line_color="black", fill_color="white")
show(p)

In [13]:
# Hand-picked majors to plot, in alphabetical order. The plotting cell looks these
# up by label with .loc, so each entry must match a major name exactly.
select = [
    "ACCOUNTING",
    "ANTHROPOLOGY AND ARCHEOLOGY",
    "ARCHITECTURE",
    "ART HISTORY AND CRITICISM",
    "BIOCHEMICAL SCIENCES",
    "BIOLOGICAL ENGINEERING",
    "BIOLOGY",
    "BUSINESS MANAGEMENT AND ADMINISTRATION",
    "CHEMICAL ENGINEERING",
    "CHEMISTRY",
    "CIVIL ENGINEERING",
    "COMPUTER ENGINEERING",
    "COMPUTER SCIENCE",
    "DRAMA AND THEATER ARTS",
    "ECONOMICS",
    "ELECTRICAL ENGINEERING",
    "ENGLISH LANGUAGE AND LITERATURE",
    "FILM VIDEO AND PHOTOGRAPHIC ARTS",
    "FINANCE",
    "GENERAL BUSINESS",
    "HISTORY",
    "JOURNALISM",
    "MATHEMATICS",
    "MECHANICAL ENGINEERING",
    "MUSIC",
    "NURSING",
    "PHILOSOPHY AND RELIGIOUS STUDIES",
    "PHYSICS",
    "POLITICAL SCIENCE AND GOVERNMENT",
    "PSYCHOLOGY",
    "SOCIOLOGY",
]

In [14]:
# Confidence intervals for the hand-picked majors in `select`, ordered by point estimate.
# The (category, major) index is replaced by the major name alone so that the
# majors can be looked up by label.
major_names = ci.index.get_level_values(1).rename("name")
plot_df = ci.set_index(major_names).loc[select].sort_values("point")

# One horizontal segment per major: x spans [lower, upper], y repeats the major's label.
intervals = [([lo, hi], [label, label])
             for label, lo, hi in zip(plot_df.index, plot_df["lower"], plot_df["upper"])]
xs, ys = tuple(zip(*intervals))

p = figure(y_range=list(plot_df.index), title="Gender Ratio for Selected Majors")
p.xaxis.axis_label = "Gender Ratio (% Female)"
p.multi_line(xs, ys, line_width=5, alpha=0.8)
# Mark the point estimate on top of each interval.
p.circle(plot_df.point, plot_df.index, size=6, line_color="black", fill_color="white")

show(p)