In [None]:
#from IPython.display import HTML, display
#display(HTML("<table><tr><td><img src='data/image2.jpg' width='700'></td><td><img src='data/image1.jpeg' width='240'></td></tr></table>"))

### Prep work

Uncomment and run the cell below to install libraries:

In [None]:
#!pip install -U spaCy
#!python -m spacy download en

Run the next cell to load libraries and pre-defined functions:

In [None]:
# load libraries and helper code
#from helper_code.book import *
import re
import urllib
import urllib.request

import numpy as np
import pandas as pd
import spacy
from matplotlib import pyplot as plt
from pylab import rcParams

# Load the English pipeline by its full package name: the 'en' shortcut
# was removed in spaCy v3, while the package name works in v2 and v3.
nlp = spacy.load('en_core_web_sm')
%matplotlib inline


def get_book_df(chapters):
    """Tokenize every chapter with the module-level ``nlp`` pipeline and
    collect its content words into a single DataFrame.

    Only tokens whose part-of-speech tag is VERB, NOUN, ADJ or PROPN are kept.

    Parameters
    ----------
    chapters : sequence of str
        Chapter texts, in reading order.

    Returns
    -------
    pandas.DataFrame
        One row per kept token with the columns "text", "part-of-speech",
        "lemma" (stripped and lower-cased) and "chapter" (1-based position
        of the chapter inside ``chapters``).
    """
    columns = ["text", "part-of-speech", "lemma", "chapter"]
    kept_pos = {"VERB", "NOUN", "ADJ", "PROPN"}

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and calling it per token
    # copied the whole frame every time (quadratic in the number of tokens).
    records = []
    for chapter_number, chapter_text in enumerate(chapters, start=1):
        for token in nlp(chapter_text):
            if token.pos_ in kept_pos:
                records.append({"text": token.text,
                                "part-of-speech": token.pos_,
                                "lemma": token.lemma_.strip().lower(),
                                "chapter": chapter_number})

    # Passing ``columns`` keeps the expected schema even when nothing matched.
    return pd.DataFrame(records, columns=columns)


def get_speechparts_by_chapter(book_df):
    """Count tokens per chapter and part of speech.

    Returns a DataFrame indexed by "chapter" with one column per
    "part-of-speech" value found in ``book_df``; each cell holds the number
    of rows for that (chapter, part-of-speech) pair.
    """
    # long format: one row per (chapter, part-of-speech) pair with its count
    pair_sizes = book_df.groupby(["chapter", "part-of-speech"]).size()
    long_table = pair_sizes.reset_index(name="count")

    # wide format: chapters as rows, parts of speech as columns
    wide_table = long_table.pivot(index="chapter", columns='part-of-speech', values="count")

    # clear the leftover label on the column axis, then index by chapter again
    wide_table = wide_table.reset_index().rename_axis(None, axis="columns")
    return wide_table.set_index("chapter")

def get_counts(book_df, value):
    """Count how many rows of ``book_df`` share each entry of column ``value``.

    The result is indexed by ``value`` and has a single "count" column,
    ordered from the highest count to the lowest.
    """
    group_sizes = book_df.groupby(value).size()
    count_table = group_sizes.reset_index(name='count')
    count_table = count_table.set_index(value)
    return count_table.sort_values(['count'], ascending=False)


def get_counts_by_chapters(book_df):
    """Count how often each lemma occurs in each chapter.

    Returns a DataFrame indexed by "chapter" with one column per lemma found
    in ``book_df``; each cell holds the number of rows for that
    (chapter, lemma) pair.
    """
    # long format: one row per (chapter, lemma) pair with its count
    pair_sizes = book_df.groupby(["chapter", "lemma"]).size()
    long_table = pair_sizes.reset_index(name="count")

    # wide format: chapters as rows, lemmas as columns
    wide_table = long_table.pivot(index="chapter", columns='lemma', values="count")

    # clear the leftover label on the column axis, then index by chapter again
    wide_table = wide_table.reset_index().rename_axis(None, axis="columns")
    return wide_table.set_index("chapter")

# Group goal

 
Go through the "Alice's Adventures in Wonderland" analysis below, work on challenges, and try modifying the code.

**Extra challenge**:

Explore the "Adventures of Tom Sawyer" book to show interesting results and visualizations.


### Download the book from the Project Gutenberg website

This book was downloaded from the Project Gutenberg website.

**Project Gutenberg** is a library of over 60,000 free eBooks

[This link](http://www.gutenberg.org/ebooks/search/?sort_order=downloads) shows the most popular books. 


In this notebook we are going to look at "Alice's Adventures in Wonderland" book.  
"The Adventures of Tom Sawyer" is downloaded as well for extra challenge.

In [None]:
alice_filename = "alice.txt"
tom_filename = "tom.txt"

In [None]:
#if reading from cloud object storage
alice_url="https://swift-yeg.cloud.cybera.ca:8080/v1/AUTH_d22d1e3f28be45209ba8f660295c84cf/hackaton/alice.txt"
tom_url="https://swift-yeg.cloud.cybera.ca:8080/v1/AUTH_d22d1e3f28be45209ba8f660295c84cf/hackaton/tom.txt"

# download each book into its local file
# (urllib.request must be imported explicitly; a bare `import urllib`
# does not load the `request` submodule)
for book_url, book_filename in ((alice_url, alice_filename), (tom_url, tom_filename)):
    urllib.request.urlretrieve(book_url, book_filename)

In [None]:
# read the whole book into one string
# (explicit encoding instead of the platform default; assumes the downloaded
# file is UTF-8 or plain ASCII -- TODO confirm)
with open(alice_filename, 'r', encoding='utf-8') as text_file:
    book = text_file.read()

In [None]:
#print the entire book on the screen
print(book)

In [None]:
# how many characters are in the book?
len(book)

In [None]:
# split the book by chapter
# (raw string, so the \s reaches the regex engine as written instead of
# triggering Python's invalid-escape-sequence warning; the pattern is unchanged)
chapters = re.split(r"CHAPTER\s+[IVXLCDM]+.", book)

# strip off any whitespace at the very beginning and very end of each chapter.
chapters = [chapter.strip() for chapter in chapters]

# replace line breaks with spaces
chapters = [re.sub("\n", " ", c) for c in chapters]

#select only chapters that have more than 1000 characters (to exclude table of contents, title, etc.)
chapters = [c for c in chapters if len(c)>1000]
 
# number of chapters
print(len(chapters))

### Create dataframe selecting only nouns, proper nouns, verbs, and adjectives per chapter

- **text**: actual word
- **part-of-speech**:  ADJ, PROPN, VERB, or NOUN
- **lemma**: headword
- **chapter**: chapter number

In [None]:
#This cell will run 3-5 mins!!!

#create a dataframe from the book
book_df = get_book_df(chapters)

In [None]:
# show first 5 rows of the dataframe
book_df.head()

## try experimenting with "head" function, like head(20), head(5), etc.

In [None]:
## how many rows (individual words) and columns do we have?
book_df.shape

### Number of adjectives, nouns, proper nouns, and verbs 

In [None]:
#we group by "part-of-speech" column and count the number of rows
book_df.groupby("part-of-speech").size()

In [None]:
#figure size 5 by 5 inches
rcParams['figure.figsize'] = 5, 5

#create a pie chart
book_df.groupby("part-of-speech").size().plot.pie()

### Challenge: 
 - Try grouping by a different column: if you change `groupby("part-of-speech")` to `groupby("chapter")` what will it give you?
 - Experiment with different kinds of plots: try  changing `plot.pie()` to `plot()` or `plot.bar()` or `plot.barh()`. Which of these better represents the data?  

### Number of adjective/nouns/proper nouns and verbs  per chapter

In [None]:
#we call a function to get total number of all parts of speech per chapter
speech_parts_by_chapter = get_speechparts_by_chapter(book_df)

In [None]:
#print data on the screen
speech_parts_by_chapter

In [None]:
#figure size 18 by 5 inches
rcParams['figure.figsize'] = 18, 5

#different kind of plot - area
speech_parts_by_chapter.plot.area()

### Challenge
Experiment with plots: try changing `plot.area()` to `plot()` or `plot.bar()` or even `plot.bar(stacked=True)`. 

What kind of plot can better visually demonstrate which chapter has the largest number of verbs?

An alternative way to find the chapter with the largest number of verbs is **sorting**:

In [None]:
#sort_values() function - sorts by a column or set of columns
speech_parts_by_chapter.sort_values("VERB",ascending=False)

### Challenges
 - find the  chapter that has the most **NOUN**s
 - find the chapter that has the **fewest** adjectives
 - try plotting the results
 - try two new kinds of plots - [histogram](https://www.mathsisfun.com/data/histograms.html) and [boxplot](https://www.mathsisfun.com/definitions/box-and-whisker-plot.html). Can you figure out how to interpret them?
 - add `.boxplot()`
 - add `.plot.hist(alpha=0.4)` (try changing alpha)

### Top 10 most common words

In [None]:
#call function to count the number of rows  for every "lemma"
word_counts = get_counts(book_df, "lemma")

In [None]:
#print top 10 most frequent words on the screen
word_counts.head(10)

### Challenges
 - try using "text" column instead of "lemma" - why do you get different results?
 - plot the results using your choice of plot

###  The top 10 most common adjectives 

In [None]:
## subset only to adjectives
adjectives = book_df[book_df["part-of-speech"]=="ADJ"]

adjectives.head()

In [None]:
#call function to count the number of adjectives
adjective_counts = get_counts(adjectives, "lemma")

adjective_counts.head()

In [None]:
#figure size 18 by 8 inches
rcParams['figure.figsize'] = 18, 8

#visualize the top 10 adjectives:
adjective_counts.head(10).plot.bar()

### Challenges
 - find the top 10 most common nouns and verbs
 - plot the results

### For the top 15 most common  proper nouns, how does the number vary from chapter to chapter?

In [None]:
## subset only to proper nouns
propnouns = book_df[book_df["part-of-speech"]=="PROPN"]

propnouns.head()

In [None]:
#how many most frequent proper nouns do we want to analyse
num_words = 15

#count every proper noun, keep the num_words most frequent ones,
#and take only their names (the index, without the counts) as a plain list
top_propnouns = list(get_counts(propnouns, "lemma").head(num_words).index)

#print on the screen
top_propnouns

In [None]:
## subset to the rows whose lemma is one of the top proper nouns (whatever part of speech the row was tagged with)
character_by_chapter = book_df[book_df["lemma"].isin(top_propnouns)]

character_by_chapter.head()

In [None]:
#what is the distribution of top proper nouns per chapter?
# call function to form resulting dataframe
counts_by_chapter = get_counts_by_chapters(character_by_chapter)

#display on the screen
counts_by_chapter.head()

In [None]:
#figure size 18 by 8 inches
rcParams['figure.figsize'] = 18, 8

#what are the main characters in every chapter?
#we use colormap "tab20" to extend the default number of colors
counts_by_chapter.plot.bar(stacked = True, cmap="tab20")

### Challenges
 - Try experimenting with the number of proper nouns (change `num_words`)
 - Try doing the same thing with adjectives, nouns, and/or verbs - can you guess what's going on in each chapter based on these plots?

### Extra
Now let's try doing the same thing but using **percentage** instead

In [None]:
#total number of counted words per chapter (axis=1 means: sum across each row)
chapter_totals = counts_by_chapter.sum(axis=1)

#divide every column by the chapter total and multiply by 100 to get percentages.
#This is computed directly from counts_by_chapter, so no helper "sum" column
#has to be sliced off by position afterwards; that slice (first num_words
#columns) kept the "sum" column whenever the table had fewer than num_words columns.
counts_percent = counts_by_chapter.divide(chapter_totals, axis=0).multiply(100)

#figure size 18 by 8 inches
rcParams['figure.figsize'] = 18, 8

#we choose area plot this time
counts_percent.plot.area(cmap="tab20")