In [284]:
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import spacy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

nltk.download('gutenberg')
spacy.load('en')

[nltk_data] Downloading package gutenberg to /Users/brien/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


<spacy.lang.en.English at 0x1a244fb048>

## Functions

In [97]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

def amount_punc(sentence):
    #selecting only the punctuation in the sentence
    all_punc = [token
                for token in sentence
                if token.is_punct]
    #returning the amount of punctuation
    return len(all_punc)

# Utility function to create a list of the 2000 most common words.
def bag_of_pos(text):
    
    # Filter out punctuation and stop words.
    allpos = [token.pos_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allpos).most_common(100)]

def pos_features(sentences, common_pos):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_pos)
    df['text_sentence'] = sentences
    df.loc[:, common_pos] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        pos = [token.pos_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.pos_ in common_pos
                 )]
        
        # Populate the row with word counts.
        for word in pos:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

In [2]:
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice[:int(len(alice)/10)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/10)])

In [3]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [4]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [5]:
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [6]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400


Unnamed: 0,suffering,shrubbery,depend,practice,raise,alas,tenant,dispose,fortitude,accurately,...,society,Rabbit,tempt,garden,effect,maintenance,tiny,grove,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll


In [9]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.981203007518797

Test set score: 0.8314606741573034




In [10]:
lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(266, 1612) (266,)
Training set score: 0.9774436090225563

Test set score: 0.8707865168539326




In [11]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9774436090225563

Test set score: 0.8258426966292135


In [12]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma[:int(len(emma)/60)])
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [13]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [37]:
#for chunk in emma_doc.noun_chunks:
#    print(chunk.text)
for token in emma_doc:
    if token.pos_ =='VERB':
        print(token.lemma_)
        

seem
unite
have
live
vex
be
have
be
have
die
have
have
be
supply
have
fall
have
be
be
have
cease
hold
have
allow
impose
be
pass
have
be
live
attach
do
like
esteem
direct
be
have
think
be
threaten
be
do
come
marry
be
bring
be
sit
go
be
leave
cheer
compose
sleep
have
sit
think
have
lose
have
be
be
consider
deny
have
wish
promote
be
would
be
feel
recall
have
teach
have
play
have
devote
attach
amuse
nurse
be
owe
have
follow
be
leave
be
have
be
possess
inform
know
could
speak
arise
have
could
find
be
bear
be
be
go
be
must
be
be
suffer
love
be
could
meet
have
marry
be
increase
have
be
be
could
have
recommend
remove
be
settle
be
must
be
struggle
bring
fill
give
amount
do
belong
afford
be
look
have
be
could
be
accept
be
could
sigh
wish
awake
make
be
require
be
be
use
hate
part
hate
be
be
reconcile
could
speak
have
be
be
oblige
part
be
suppose
could
feel
be
think
have
do
would
have
be
have
spend
smile
chat
could
keep
come
be
say
have
say
wish
be
be
think
agree
papa
know
be
deserve
would
have
ha

In [14]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

In [15]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Processing row 50
Processing row 100
Processing row 150
done


In [24]:
emma_bow.columns

Index(['suffering', 'shrubbery', 'depend', 'practice', 'raise', 'alas',
       'tenant', 'dispose', 'fortitude', 'accurately',
       ...
       'society', 'Rabbit', 'tempt', 'garden', 'effect', 'maintenance', 'tiny',
       'grove', 'text_sentence', 'text_source'],
      dtype='object', length=1614)

In [16]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.7073170731707317


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,155,15
Carroll,57,19


Challenge 0:
See if can improve test set model accuracy to over 90% by making new features, using different modeling techniques, anything else that I can think of.

In [143]:
emma_bow.loc[:, 'is_emma'] = 1

In [147]:
all_books = pd.concat([emma_bow, word_counts])
all_books.loc[all_books.is_emma.isna(), 'is_emma'] = 0

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [148]:
all_books.is_emma.value_counts()

0.0    444
1.0    170
Name: is_emma, dtype: int64

In [134]:
all_books['punctuation_count'] = pd.Series(amount_punc(i) for i in all_books.text_sentence)

In [136]:
# Set up the bags.
alicepunc = bag_o_punc(alice_doc)
persuasionpunc = bag_o_punc(persuasion_doc)
emmapunc = bag_o_punc(emma_doc)

# Combine bags to create a set of unique punctuation
common_punc = set(alicepunc + persuasionpunc + emmapunc)

In [137]:
emmapos = bag_of_pos(emma_doc)
persuasionpos = bag_of_pos(persuasion_doc)
alicepos = bag_of_pos(alice_doc)

print('emma\n', emmapos)
print('persuasion\n', persuasionpos)
print('alice\n', alicepos)

common_pos = set(emmapos + alicepos + persuasionpos)

emma
 ['NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV', 'NUM', 'INTJ', 'DET', 'ADP', 'PART', 'PUNCT']
persuasion
 ['NOUN', 'VERB', 'PROPN', 'ADJ', 'ADV', 'NUM', 'ADP', 'PUNCT', 'INTJ', 'DET', 'PART']
alice
 ['NOUN', 'VERB', 'ADJ', 'PROPN', 'ADV', 'INTJ', 'ADP', 'AUX', 'NUM']


In [138]:
all_books = all_books.reset_index(drop=True)
pos = pos_features(all_books['text_sentence'], common_pos)
all_books = pd.concat([pos, all_books])

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [141]:
all_books['text_source'].value_counts()

Austen     485
Carroll    129
Name: text_source, dtype: int64

In [152]:
X_emma_test = all_books.loc[(all_books['text_source'] == 'Carroll') | 
                            (all_books['is_emma'] == 1.0)
                           ]

Y_emma_test = X_emma_test.loc[:, 'text_source']

X_emma_test = X_emma_test.drop(['text_sentence', 'text_source', 'is_emma'], axis=1)

print(lr.score(X_emma_test, Y_emma_test))
lr_emma_pred = lr.predict(X_emma_test)
pd.crosstab(Y_emma_test, lr_emma_pred)

0.5618729096989966


col_0,Austen,Carroll
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,166,4
Carroll,127,2


Adding in features for the amount of punctuation in each sentence and a count of the different parts of speech in each sentence shot our Austen recall through the roof to 98%. However, our precision is pretty bad at 57%. The model didn't necessarily predict Austen correctly, because it predicted everything the be Austen. 

Challenge 1:
Find out if new model is good at identifying Alice in Wonderland vs. any other work. 

I'm going with Alice in Wonderland vs. Moby Dick
'melville-moby_dick.txt'

In [182]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [244]:
burg = gutenberg.raw('burgess-busterbrown.txt')
pattern = r'\b[A-Z]+\b'
burg = re.sub(pattern, '', burg)
burg = text_cleaner(burg[:int(len(burg)/8)])
print(burg[:100])

Buster Bear yawned as he lay on his comfortable bed of leaves and watched the first early morning su


In [245]:
burg_doc = nlp(burg)

In [246]:
burg_sents = [[sent, 'Burgess'] for sent in burg_doc.sents]

In [247]:
burg_sentences = pd.DataFrame(burg_sents)
burg_bow = bow_features(burg_sentences, common_words)

Processing row 0
Processing row 50
Processing row 100
Processing row 150


In [253]:
all_books = pd.concat([all_books, burg_bow])
all_books = all_books.reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [256]:
all_books['punctuation_count'] = pd.Series(amount_punc(i) for i in all_books.text_sentence)

In [257]:
burgpos = bag_of_pos(burg_doc)
print('burgess\n', burgpos)

burgess
 ['VERB', 'NOUN', 'PROPN', 'ADJ', 'ADV', 'ADP', 'INTJ']


In [258]:
pos = pos_features(all_books['text_sentence'], common_pos)
all_books = pd.concat([pos, all_books])

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [285]:
all_books = all_books.fillna(0)
X = all_books.loc[(all_books['text_source'] == 'Carroll') | 
                  (all_books['text_source'] == 'Burgess')
                 ]
Y = X.loc[:, 'text_source']
X = X.drop(['text_sentence', 'text_source', 'is_emma'], axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=2974)

lr.fit(X_train, Y_train)
print(lr.score(X_train, Y_train))
Y_train_pred = lr.predict(X_train)
print(classification_report(Y_train, Y_train_pred))

print('\n', lr.score(X_test, Y_test))
Y_test_pred = lr.predict(X_test)
print(classification_report(Y_test, Y_test_pred))

0.9534883720930233
              precision    recall  f1-score   support

     Burgess       0.92      1.00      0.96        96
     Carroll       1.00      0.89      0.94        76

   micro avg       0.95      0.95      0.95       172
   macro avg       0.96      0.95      0.95       172
weighted avg       0.96      0.95      0.95       172


 0.75
              precision    recall  f1-score   support

     Burgess       0.69      0.97      0.81        63
     Carroll       0.93      0.49      0.64        53

   micro avg       0.75      0.75      0.75       116
   macro avg       0.81      0.73      0.72       116
weighted avg       0.80      0.75      0.73       116





Wow, look at that overfit! The model does a good job of identifying Burgess, but like the previous model, it has a tough time identifying the Carroll sentences. However, it does do better at identifying the Carroll sentences than when comparing to Jane Austen novels.