In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [3]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [4]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [5]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [6]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000


Unnamed: 0,undoubtedly,shoe,comprehend,heart,insult,convenient,benwick,chimney,stolen,belmont,...,distance,mere,performance,i'll,Norman,afternoon,byron,husband,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


# BoW with SVM

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

svc = SVC()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = svc.fit(X_train, y_train)

print('Training set score:', svc.score(X_train, y_train))
print('\nTest set score:', svc.score(X_test, y_test))

Training set score: 0.682445141066

Test set score: 0.691729323308


# Add POS to BoW Features

Parts of Speech

In [8]:
list_pos = []
for token in nlp(word_counts['text_sentence'][0].string):
    list_pos.append(token.pos_)
list_pos = set(list_pos)
list_pos

{'ADJ',
 'ADP',
 'ADV',
 'CCONJ',
 'DET',
 'NOUN',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'VERB'}

In [19]:
def count_pos(sent, pos):
    sentenc = nlp(sent.string)
    i = 0
    for token in sentenc:
        if token.pos_ == pos:
            i+=1
    return i

def speech_part(sent, part):
    print('doing {}s'.format(part))
    return [count_pos(word_counts['text_sentence'][i], part)
        for i in range (len(word_counts['text_sentence']))]

In [20]:
adj_count = speech_part(word_counts['text_sentence'], 'ADJ')
adp_count = speech_part(word_counts['text_sentence'], 'ADP')
adv_count = speech_part(word_counts['text_sentence'], 'ADV')
cconj_count = speech_part(word_counts['text_sentence'], 'CCONJ')
det_count = speech_part(word_counts['text_sentence'], 'DET')
noun_count = speech_part(word_counts['text_sentence'], 'NOUN')
part_count = speech_part(word_counts['text_sentence'], 'PART')
pron_count = speech_part(word_counts['text_sentence'], 'PRON')
verb_count = speech_part(word_counts['text_sentence'], 'VERB')

doing ADJs
doing ADPs
doing ADVs
doing CCONJs
doing DETs
doing NOUNs
doing PARTs
doing PRONs
doing VERBs


In [21]:
features['adj_count']=adj_count
features['adp_count']=adp_count
features['adv_count']=adv_count
features['cconj_count']=cconj_count
features['det_count']=det_count
features['noun_count']=noun_count
features['part_count']=part_count
features['pron_count']=pron_count
features['verb_count']=verb_count

Average Word Length

In [24]:
avg_word_len = [len(word_counts['text_sentence'][i].string) / 
            len(word_counts['text_sentence'][i].string.split())
                for i in range (len(word_counts['text_sentence']))]

In [25]:
features['avg_wrd_len'] = avg_word_len

Average Sentence Length

In [27]:
wrdspersent = [len(word_counts['text_sentence'][i].string.split()) 
               for i in range (len(word_counts['text_sentence']))]

In [28]:
features['avg_sent_len'] = wrdspersent

Entities

In [10]:
sent_ents = []
for i in range (0, 5317):
    sent_ents.append(nlp(word_counts['text_sentence'][i].string).ents) 

In [11]:
ent_labels = []
for i in range (0, 5317):
    for entity in sent_ents[i]:
        ent_labels.append(entity.label_)

In [12]:
set_entlabels = set(ent_labels)
set_entlabels

{'CARDINAL',
 'DATE',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LOC',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [13]:
df_entlabels = pd.DataFrame(columns=set_entlabels)

In [14]:
def entity_lab(sent, ent):
    print('doing {}s'.format(ent))
    return [count_ent(word_counts['text_sentence'][i], ent)
        for i in range (len(word_counts['text_sentence']))]

In [15]:
def count_ent(sent, ent):
    sentent = nlp(sent.string).ents
    i = 0
    for entity in sentent:
        if entity.label_ == ent:
            i+=1
    return i

In [16]:
cardinal_count = entity_lab(word_counts['text_sentence'], 'CARDINAL')
date_count = entity_lab(word_counts['text_sentence'], 'DATE')
fac_count = entity_lab(word_counts['text_sentence'], 'FAC')
gpe_count = entity_lab(word_counts['text_sentence'], 'GPE')
language_count = entity_lab(word_counts['text_sentence'], 'LANGUAGE')
loc_count = entity_lab(word_counts['text_sentence'], 'LOC')
norp_count = entity_lab(word_counts['text_sentence'], 'NORP')
ordinal_count = entity_lab(word_counts['text_sentence'], 'ORDINAL')
org_count = entity_lab(word_counts['text_sentence'], 'ORG')
person_count = entity_lab(word_counts['text_sentence'], 'PERSON')
product_count = entity_lab(word_counts['text_sentence'], 'PRODUCT')
quantity_count = entity_lab(word_counts['text_sentence'], 'QUANTITY')
time_count = entity_lab(word_counts['text_sentence'], 'TIME')
work_of_art_count = entity_lab(word_counts['text_sentence'], 'WORK_OF_ART')

doing CARDINALs
doing DATEs
doing FACs
doing GPEs
doing LANGUAGEs
doing LOCs
doing NORPs
doing ORDINALs
doing ORGs
doing PERSONs
doing PRODUCTs
doing QUANTITYs
doing TIMEs
doing WORK_OF_ARTs


In [17]:
features = word_counts
features['CARDINAL'] = cardinal_count
features['DATE'] = date_count
features['FAC'] = fac_count
features['GPE'] = gpe_count
features['LANGUAGE'] = language_count
features['LOC'] = loc_count
features['NORP'] = norp_count
features['ORDINAL'] = ordinal_count
features['ORG'] = org_count
features['PERSON'] = person_count
features['PRODUCT'] = product_count
features['QUANTITY'] = quantity_count
features['TIME'] = time_count
features['WORK_OF_ART'] = work_of_art_count

In [29]:
features.head(2)

Unnamed: 0,undoubtedly,shoe,comprehend,heart,insult,convenient,benwick,chimney,stolen,belmont,...,adp_count,adv_count,cconj_count,det_count,noun_count,part_count,pron_count,verb_count,avg_wrd_len,avg_sent_len
0,0,0,0,0,0,0,0,0,0,0,...,8,3,6,5,12,2,3,13,5.298246,57
1,0,0,0,0,0,0,0,0,0,0,...,8,7,2,6,8,1,4,11,5.272727,55


Test the Accuracy of the New Features:

In [34]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

Y = features['text_source']
X = np.array(features.drop(['text_sentence','text_source'], 1))
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4)

In [35]:
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3027) (3190,)
Training set score: 0.962382445141

Test set score: 0.910714285714


In [36]:
# Random Forest Classifier

from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.992163009404

Test set score: 0.845394736842


In [37]:
# Gradient Boosting Classifier

clf = ensemble.GradientBoostingClassifier()

train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8934169279

Test set score: 0.857142857143


In [38]:
# SVC with new features

train = svc.fit(X_train, y_train)

print('Training set score:', svc.score(X_train, y_train))
print('\nTest set score:', svc.score(X_test, y_test))

Training set score: 0.689028213166

Test set score: 0.681860902256


# Challenge 1

In [39]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [44]:
# Preprocess the chosen text--Hamlet.

hamlet = gutenberg.raw('shakespeare-hamlet.txt')
# Print the first 100 characters of Alice in Wonderland.
print('\nRaw:\n', hamlet[0:100])


Raw:
 [The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo a


In [47]:
hamlet = re.sub(r'Actus .*', '', hamlet)

hamlet = text_cleaner(hamlet)
print('\nRaw:\n', hamlet[0:100])


Raw:
 Enter Barnardo and Francisco two Centinels. Barnardo. Who's there? Fran. Nay answer me: Stand & vnfo


In [48]:
# Parse the cleaned text
hamlet_doc = nlp(hamlet)

In [51]:
# Group into sentences.
hamlet_sents = [[sent, "Shakespeare"] for sent in hamlet_doc.sents]

# Combine the sentences of Hamlet with those of Carroll and Austen in separate DataFames
sentences_C_S = pd.DataFrame(alice_sents + hamlet_sents)

sentences_A_S = pd.DataFrame(hamlet_sents + persuasion_sents)

In [52]:
# Set up a bag of words for Hamlet
hamlet_words = bag_of_words(hamlet_doc)

# Make list of the common words between Hamlet & Alice and Hamlet & Persuasion
common_words_C_S = set(alicewords + hamlet_words)

common_words_A_S = set(hamlet_words + persuasionwords)

In [53]:
# Create our data frames with features. 
features_C_S = bow_features(sentences_C_S, common_words_C_S)

features_A_S = bow_features(sentences_A_S, common_words_A_S)

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500


In [58]:
print(len(features_C_S))
print(len(features_A_S))

4705
6685


In [59]:
sent_ents_C_S = []
for i in range (0, 4704):
    sent_ents_C_S.append(nlp(features_C_S['text_sentence'][i].string).ents) 
    
sent_ents_A_S = []
for i in range (0, 6684):
    sent_ents_A_S.append(nlp(features_A_S['text_sentence'][i].string).ents) 

In [61]:
ent_labels_C_S = []
for i in range (0, 4704):
    for entity in sent_ents_C_S[i]:
        ent_labels_C_S.append(entity.label_)
        
ent_labels_A_S = []
for i in range (0, 6684):
    for entity in sent_ents_A_S[i]:
        ent_labels_A_S.append(entity.label_)

In [64]:
set_entlabels_C_S = set(ent_labels_C_S)
print('Entity Labels for Carroll and Shakespeare Dataset:\n {}'.format(set_entlabels_C_S))

Entity Labels for Carroll and Shakespeare Dataset:
 {'CARDINAL', 'LOC', 'LAW', 'ORDINAL', 'ORG', 'GPE', 'WORK_OF_ART', 'EVENT', 'FAC', 'NORP', 'PRODUCT', 'TIME', 'LANGUAGE', 'DATE', 'PERSON', 'QUANTITY'}


In [65]:
set_entlabels_A_S = set(ent_labels_A_S)
print('Entity Labels for Carroll and Shakespeare Dataset:\n {}'.format(set_entlabels_A_S))

Entity Labels for Carroll and Shakespeare Dataset:
 {'CARDINAL', 'LANGUAGE', 'LOC', 'WORK_OF_ART', 'ORG', 'ORDINAL', 'EVENT', 'FAC', 'NORP', 'DATE', 'TIME', 'PRODUCT', 'QUANTITY', 'GPE', 'PERSON', 'LAW'}


In [67]:
cardinal_count_C_S = entity_lab(features_C_S['text_sentence'], 'CARDINAL')
cardinal_count_A_S = entity_lab(features_A_S['text_sentence'], 'CARDINAL')
date_count_C_S = entity_lab(features_C_S['text_sentence'], 'DATE')
date_count_A_S = entity_lab(features_A_S['text_sentence'], 'DATE')
fac_count_C_S = entity_lab(features_C_S['text_sentence'], 'FAC')
fac_count_A_S = entity_lab(features_A_S['text_sentence'], 'FAC')
gpe_count_C_S = entity_lab(features_C_S['text_sentence'], 'GPE')
gpe_count_A_S = entity_lab(features_A_S['text_sentence'], 'GPE')
language_count_C_S = entity_lab(features_C_S['text_sentence'], 'LANGUAGE')
language_count_A_S = entity_lab(features_A_S['text_sentence'], 'LANGUAGE')
loc_count_C_S = entity_lab(features_C_S['text_sentence'], 'LOC')
loc_count_A_S = entity_lab(features_A_S['text_sentence'], 'LOC')
norp_count_C_S = entity_lab(features_C_S['text_sentence'], 'NORP')
norp_count_A_S = entity_lab(features_A_S['text_sentence'], 'NORP')
ordinal_count_C_S = entity_lab(features_C_S['text_sentence'], 'ORDINAL')
ordinal_count_A_S = entity_lab(features_A_S['text_sentence'], 'ORDINAL')
org_count_C_S = entity_lab(features_C_S['text_sentence'], 'ORG')
org_count_A_S = entity_lab(features_A_S['text_sentence'], 'ORG')
person_count_C_S = entity_lab(features_C_S['text_sentence'], 'PERSON')
person_count_A_S = entity_lab(features_A_S['text_sentence'], 'PERSON')
product_count_C_S = entity_lab(features_C_S['text_sentence'], 'PRODUCT')
product_count_A_S = entity_lab(features_A_S['text_sentence'], 'PRODUCT')
quantity_count_C_S = entity_lab(features_C_S['text_sentence'], 'QUANTITY')
quantity_count_A_S = entity_lab(features_A_S['text_sentence'], 'QUANTITY')
time_count_C_S = entity_lab(features_C_S['text_sentence'], 'TIME')
time_count_A_S = entity_lab(features_A_S['text_sentence'], 'TIME')
work_of_art_count_C_S = entity_lab(features_C_S['text_sentence'], 'WORK_OF_ART')
work_of_art_count_A_S = entity_lab(features_A_S['text_sentence'], 'WORK_OF_ART')
law_count_C_S = entity_lab(features_C_S['text_sentence'], 'LAW')
law_count_A_S = entity_lab(features_A_S['text_sentence'], 'LAW')
event_count_C_S = entity_lab(features_C_S['text_sentence'], 'EVENT')
event_count_A_S = entity_lab(features_A_S['text_sentence'], 'EVENT')

doing CARDINALs
doing CARDINALs
doing DATEs
doing DATEs
doing FACs
doing FACs
doing GPEs
doing GPEs
doing LANGUAGEs
doing LANGUAGEs
doing LOCs
doing LOCs
doing NORPs
doing NORPs
doing ORDINALs
doing ORDINALs
doing ORGs
doing ORGs
doing PERSONs
doing PERSONs
doing PRODUCTs
doing PRODUCTs
doing QUANTITYs
doing QUANTITYs
doing TIMEs
doing TIMEs
doing WORK_OF_ARTs
doing WORK_OF_ARTs
doing LAWs
doing LAWs
doing EVENTs
doing EVENTs


In [70]:
avg_word_len_C_S = [len(features_C_S['text_sentence'][i].string) / 
            len(features_C_S['text_sentence'][i].string.split())
                for i in range (len(features_C_S['text_sentence']))]

avg_word_len_A_S = [len(features_A_S['text_sentence'][i].string) / 
            len(features_A_S['text_sentence'][i].string.split())
                for i in range (len(features_A_S['text_sentence']))]

In [71]:
features_C_S['avg_wrd_len'] = avg_word_len_C_S

features_A_S['avg_wrd_len'] = avg_word_len_A_S

In [72]:
wrdspersent_C_S = [len(features_C_S['text_sentence'][i].string.split()) 
               for i in range (len(features_C_S['text_sentence']))]

wrdspersent_A_S = [len(features_A_S['text_sentence'][i].string.split()) 
               for i in range (len(features_A_S['text_sentence']))]

In [73]:
features_C_S['avg_sent_len'] = wrdspersent_C_S

features_A_S['avg_sent_len'] = wrdspersent_A_S

In [79]:
features_C_S.head(3)

Unnamed: 0,cou'nant,shoe,ambitious,heart,insult,most,chimney,stolen,contagion,peepe,...,grone,forfeite,poleak,weepe,e'ene,husband,text_sentence,text_source,avg_wrd_len,avg_sent_len
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,5.298246,57
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll,5.272727,55
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,4.862069,29


In [80]:
features_A_S.head(3)

Unnamed: 0,undoubtedly,cou'nant,ambitious,heart,comprehend,convenient,insult,most,benwick,contagion,...,forfeite,poleak,afternoon,weepe,e'ene,husband,text_sentence,text_source,avg_wrd_len,avg_sent_len
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Enter, Barnardo, and, Francisco, two, Centine...",Shakespeare,7.333333,6
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Barnardo, .)",Shakespeare,10.0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Who, 's, there, ?)",Shakespeare,6.5,2


In [74]:
Y = features_C_S['text_source']
X = np.array(features_C_S.drop(['text_sentence','text_source'], 1))
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4)

In [75]:
# Logistic Regression: C_S

train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(2823, 3289) (2823,)
Training set score: 0.964222458378

Test set score: 0.917640807651


In [76]:
# Random Forest Classifier: C_S

train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.995040736805

Test set score: 0.88788522848


In [77]:
# Gradient Boosting Classifier: C_S

train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.879206517889

Test set score: 0.848565356004


In [78]:
# SVCP: C_S

train = svc.fit(X_train, y_train)

print('Training set score:', svc.score(X_train, y_train))
print('\nTest set score:', svc.score(X_test, y_test))

Training set score: 0.669854764435

Test set score: 0.675345377258
