In [None]:
# Examine the book "Little Women" to see what we can learn from its text

# usual imports
from datascience import *
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

from urllib.request import urlopen 
import re
def read_url(url):
    """Fetch `url` and return its text with whitespace runs collapsed.

    The response body is decoded with the default codec of `bytes.decode()`
    (UTF-8), and every run of whitespace (spaces, tabs, newlines) is replaced
    by a single space, so the result is one long line of text.

    The response is read inside a `with` block so that it is closed as soon
    as the body has been read, instead of being left open.
    """
    with urlopen(url) as response:
        raw_bytes = response.read()
    return re.sub(r'\s+', ' ', raw_bytes.decode())

In [None]:
# Download the whole novel as one string, then cut it at every 'CHAPTER '
# heading so that each list element holds the text of one chapter.
little_women_url = 'http://data8.org/materials-fa17/lec/little_women.txt'
little_women_text = read_url(little_women_url)
chapters = little_women_text.split('CHAPTER ')
del chapters[0]  # discard everything that precedes the first heading

In [None]:
# Show the chapters as a one-column table: one row per chapter, holding its text
Table().with_columns('Text', chapters)

In [None]:
# Sanity check: how many times does the word "Christmas" occur in each chapter?
np.array([chapter.count('Christmas') for chapter in chapters], dtype=int)

In [None]:
# Count how often each main character is named in each chapter and build a
# table with one column per character (one row per chapter).
def _count_whole_word(texts, word):
    """Count whole-word occurrences of `word` in each string of `texts`.

    A plain substring count (such as np.char.count) also counts "Jo" inside
    longer words like "John"; anchoring the pattern on word boundaries avoids
    that while still matching possessives such as "Jo's".

    Returns an integer array with one count per element of `texts`.
    """
    pattern = re.compile(r'\b' + re.escape(word) + r'\b')
    return np.array([len(pattern.findall(text)) for text in texts], dtype=int)

references = Table().with_columns(
    "Jo",     _count_whole_word(chapters, "Jo"),
    "Meg",    _count_whole_word(chapters, "Meg"),
    "Amy",    _count_whole_word(chapters, "Amy"),
    "Beth",   _count_whole_word(chapters, "Beth"),
    "Laurie", _count_whole_word(chapters, "Laurie"),
)
references

In [None]:
# Plot the per-chapter mention counts held in `references`:
# one curve per character (i.e. one per column of the table).
references.plot()

In [None]:
# The chapter-by-chapter plot is hard to interpret, so plot the cumulative
# sums of the `references` counts instead.
references.cumsum().plot()

In [None]:
# How would you see who's mentioned most often in each chapter?

In [None]:
# Heat map of the references per chapter (adapted from the Bikes example):
# one row per chapter, one column per character.
heatmap_labels = references.labels  # column names, in table order
# Size this figure only; do not overwrite the global rcParams default.
heatmap_fig, heatmap_ax = plots.subplots(figsize=(11., 11.))  # square plot
heatmap_ax.grid(False)
heatmap_ax.pcolor(references.to_df())    # data is already in pivot form
heatmap_ax.set_xlabel('Character')
heatmap_ax.set_ylabel('Chapter')
# pcolor draws column i across [i, i+1], so put each name at the cell
# centre (i + 0.5) rather than on the cell's left edge.
heatmap_ax.set_xticks(np.arange(len(heatmap_labels)) + 0.5)
heatmap_ax.set_xticklabels(heatmap_labels)
plots.show()

In [None]:
# New topic: scatter plot of chapter length (in characters) against the
# number of periods, used here as a rough measure of the number of sentences.
Table().with_columns(
    "Characters", list(map(len, chapters)),
    "Periods", np.char.count(chapters, "."),
).scatter('Periods')

In [None]:
# Which has the most sentences? 
# Which is longest?
# Which chapter has the longest sentences on average? 
# 
# (Don't just point at the chart; find the chapter number for each.
#  Note the suppressed zero: the plot's axes do not start at 0.)

In [None]:
# Rebuild the length / period-count table from the scatter plot, this time
# keeping it, and tag every row with its chapter number.
chapterNum = range(1, len(chapters)+1)  # chapter numbers, starting at 1
lexy = Table().with_columns(
    "Characters", list(map(len, chapters)),
    "Periods", np.char.count(chapters, "."),
    "Chapter", chapterNum,
)
# Largest period count found in any chapter, then the row(s) that reach it
maxSentences = lexy.column('Periods').max()
lexy.where(lexy.column('Periods') == maxSentences)

In [None]:
# Longest chapter by character count: take the maximum of the column,
# then display the row(s) of lexy that attain it.
maxCharacters = lexy.column('Characters').max()
lexy.where(lexy.column('Characters') == maxCharacters)

In [None]:
# Average sentence length = characters per period, stored as a new column;
# then display the row(s) where that average is largest.
lexy = lexy.with_column(
    'AvgSentence',
    lexy.column('Characters') / lexy.column('Periods'),
)
maxLength = lexy.column('AvgSentence').max()
lexy.where(lexy.column('AvgSentence') == maxLength)

In [None]:
# What else can you look for in a book? (That you likely haven't read!)