Skip to content

Commit

Permalink
Combine both sex related explorations into one
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed Oct 6, 2009
1 parent aa195cf commit 2f3abf2
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 88 deletions.
45 changes: 0 additions & 45 deletions 6-sex-dist.r

This file was deleted.

133 changes: 133 additions & 0 deletions 6-sex-exploration.r
@@ -0,0 +1,133 @@
options(stringsAsFactors = FALSE)
library(plyr)
library(ggplot2)

# Two basic tasks:
# * for names that are used for both boys and girls, how has usage changed?
# * can we use names that clearly have the incorrect sex to estimate error
# rates in this data?

bnames <- read.csv("baby-names.csv")

# The complete data set takes a long time to work with, so we want to limit it
# to names that plenty of data: i.e. that they've been in the top 1000 for
# both boys and girls, and there are a decent number of babies with those
# names.

# Your turn: create a summary that for each name, gives:
# * total proportion of babies with that name
# * the total proportion of boys
# * the total proportion of girls
# * the number of years the name was in the top 1000 as a girls name
# * the number of years the name was in the top 1000 as a boys name
#
# Hint: Start with a single name and figure out how to solve the problem
# Hint: Use summarise
times <- ddply(bnames, c("name"), summarise,
boys = sum(prop[sex == "boy"]),
boys_n = sum(sex == "boy"),
girls = sum(prop[sex == "girl"]),
girls_n = sum(sex == "girl"),
.progress = "text"
)

# ~7000 names in total - want to limit to under 200. In real-life would
# probably use more, but starting with a subset for easier comprehension isn't
# a bad idea.
nrow(times)

# At absolute minimum need at least 1 year each in top 1000. This cuts it
# down to 582 names.
times <- subset(times, boys_n > 1 & girls_n > 1)

# Use basic graphics to figure out good cut-offs
qplot(boys_n, girls_n, data = times)
qplot(log10(boys), log10(girls), data = times)

qplot(pmin(boys_n, girls_n), data = times, binwidth = 1)
times$both <- with(times, boys_n > 10 & girls_n > 10)

# Still a few too many names. Lets focus on names that have managed a certain
# level of popularity.
qplot(pmin(boys, girls), data = subset(times, both), binwidth = 0.01)
qplot(pmax(boys, girls), data = subset(times, both), binwidth = 0.1)
qplot(boys + girls, data = subset(times, both), binwidth = 0.1)

both_sexes <- subset(times, both & boys + girls > 0.4)
selected_names <- both_sexes$name

# Now that we have figured out which names we should focus on (at least to
# begin with), we need to calculate yearly summaries for each of those names.

selected <- subset(bnames, name %in% selected_names)
nrow(selected) / nrow(bnames)

# For each name in each year, figure out total boys and total girls
bysex <- ddply(selected, c("name", "year"), summarise,
boys = sum(prop[sex == "boy"]),
girls = sum(prop[sex == "girl"]),
.progress = "text"
)

# It's useful to have a symmetric means of comparing the relative abundance
# of boys and girls - the log ratio is good for this.
bysex$lratio <- log10(bysex$boys / bysex$girls)
bysex$lratio[!is.finite(bysex$lratio)] <- NA

# Explore the distribution of lratio for each name. This is why a smaller
# number of names helps - we can more easily see the name associated with the
# pattern and use our knowledge of names to determine whether or not it's
# plausible that the name is used for both sexes.
qplot(lratio, reorder(name, lratio, na.rm = T), data = bysex)
qplot(abs(lratio), reorder(name, lratio, na.rm = T), data = bysex)

qplot(abs(lratio), reorder(name, lratio, na.rm = T), data = bysex) +
geom_point(data = both_sexes, colour = "red")

qplot(year, lratio, data = bysex, group = name, geom = "line")

# Two things seem to be helpful in differentiating true dual-sex names from
# errors: the average position and the amount of spread.
#
# Brainstorm: how could we summarise these things quantitatively?
# Your turn: Compute the mean and range of lratio for each name
rng <- ddply(bysex, "name", summarise,
diff = diff(range(lratio, na.rm = T)),
mean = mean(lratio, na.rm = T)
)

qplot(diff, abs(mean), data = rng)
qplot(diff, abs(mean), data = rng, colour = abs(mean) < 1.75 | diff > 0.9)

shared_names <- subset(rng, abs(mean) < 1.75 | diff > 0.9)$name

qplot(abs(lratio), reorder(name, lratio, na.rm=T),
data = subset(bysex, name %in% shared_names))

# Dual-sex names -------------------------------------------------------------

shared <- subset(bysex, name %in% shared_names)

qplot(year, lratio, data = shared, geom = "line") + facet_wrap(~ name)

# lratio useful because it's symmetric, but probably easier to switch back to
# something more familiar.
qplot(year, boys / (boys + girls), data = shared, geom = "line") +
facet_wrap(~ name)

# What's going on with Carol? Let's go back to the raw data
qplot(year, prop, data = subset(bnames, name == "Carol"), colour = sex)
# How about Shirley?
qplot(year, prop, data = subset(bnames, name == "Shirley"), colour = sex)

# Your turn: what is alike about these patterns?

# Sex encoding errors --------------------------------------------------------

errors <- subset(bysex, !(name %in% shared_names))
qplot(year, lratio, data = errors, group = name, geom = "line")
qplot(year, abs(lratio), data = errors, group = name, geom = "line",
colour = lratio > 0)

# Your turn: What do you see in this plot? Do you think we can use this data
# to estimate sex-encoding errors? Why/Why not?
43 changes: 0 additions & 43 deletions 8-sex-errors.r

This file was deleted.

0 comments on commit 2f3abf2

Please sign in to comment.