<a href="https://colab.research.google.com/github/conditg/dsprojects/blob/master/U4_L4_A2v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import nltk
nltk.download()
from nltk.corpus import gutenberg, stopwords

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [0]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    

In [0]:
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [0]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [5]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
print(sentences.shape)
sentences.head()

(5318, 2)


Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [6]:
sentences.groupby(1).agg('count')

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
Austen,3649
Carroll,1669


In [0]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
   
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [8]:
list(common_words)[0]

'occasionally'

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(vocabulary=common_words)

In [0]:
df = pd.DataFrame()
df['text_sentence'] = sentences[0]
df['text_source'] = sentences[1]
#String version of the spacy objects
df['str_sentence'] = [_.text for _ in df.text_sentence]

In [12]:
type(df.loc[5,'str_sentence'])

str

In [0]:
X = vectorizer.fit_transform(df['str_sentence'])

In [0]:
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [15]:
word_counts = pd.concat([df,df2], axis=1)
print(word_counts.shape)
word_counts.head()

(5318, 3065)


Unnamed: 0,text_sentence,text_source,str_sentence,'s,-PRON-,1,`,a,abide,ability,...,you,you'd,young,your,yours,youth,zeal,zealand,zealous,zigzag
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,Alice was beginning to get very tired of sitti...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"(So, she, was, considering, in, her, own, mind...",Carroll,So she was considering in her own mind (as wel...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,There was nothing so VERY remarkable in that; ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"(Oh, dear, !)",Carroll,Oh dear!,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"(I, shall, be, late, !, ')",Carroll,I shall be late!',0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#CHECK
word_counts.loc[4,'str_sentence']

"I shall be late!'"

In [17]:
word_counts['shall'].head()

0    0
1    0
2    0
3    0
4    1
Name: shall, dtype: int64

In [18]:
word_counts['late'].head()

0    0
1    0
2    0
3    0
4    1
Name: late, dtype: int64

# Random Forest Attempt

In [19]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source', 'str_sentence'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.9893416927899686

Test set score: 0.8477443609022557


# Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3062) (3190,)
Training set score: 0.9545454545454546

Test set score: 0.8843984962406015




# Gradient Booster

In [21]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8642633228840125

Test set score: 0.8529135338345865


#New Inputs

In [22]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [0]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [0]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

In [25]:
emma_sentences = pd.DataFrame(emma_sents)
emma_sentences.head()

Unnamed: 0,0,1
0,"(Emma, Woodhouse, ,, handsome, ,, clever, ,, a...",Austen
1,"(She, was, the, youngest, of, the, two, daught...",Austen
2,"(Her, mother, had, died, too, long, ago, for, ...",Austen
3,"(Sixteen, years, had, Miss, Taylor, been, in, ...",Austen
4,"(Between, _, them)",Austen


In [0]:
emma = pd.DataFrame()
emma['text_sentence'] = emma_sentences[0]
emma['text_source'] = emma_sentences[1]
#String version of the spacy objects
emma['str_sentence'] = [_.text for _ in emma.text_sentence]

In [0]:
X = vectorizer.fit_transform(emma['str_sentence'])

In [0]:
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [29]:
emma_bow = pd.concat([emma,df2], axis=1)
print(emma_bow.shape)
emma_bow.head()

(1669, 3065)


Unnamed: 0,text_sentence,text_source,str_sentence,'s,-PRON-,1,`,a,abide,ability,...,you,you'd,young,your,yours,youth,zeal,zealand,zealous,zigzag
0,"(Emma, Woodhouse, ,, handsome, ,, clever, ,, a...",Austen,"Emma Woodhouse, handsome, clever, and rich, wi...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"(She, was, the, youngest, of, the, two, daught...",Austen,She was the youngest of the two daughters of a...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"(Her, mother, had, died, too, long, ago, for, ...",Austen,Her mother had died too long ago for her to ha...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"(Sixteen, years, had, Miss, Taylor, been, in, ...",Austen,Sixteen years had Miss Taylor been in Mr. Wood...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"(Between, _, them)",Austen,Between _them,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source', 'str_sentence'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6923937360178971


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1536,133
Carroll,692,321


# Challenge 0:

Recall that the logistic regression model's best performance on the test set was 93%.  See what you can do to improve performance.  Suggested avenues of investigation include: Other modeling techniques (SVM?), making more features that take advantage of the spaCy information (include grammar, phrases, POS, etc), making sentence-level features (number of words, amount of punctuation), or including contextual information (length of previous and next sentences, words repeated from one sentence to the next, etc), and anything else your heart desires.  Make sure to design your models on the test set, or use cross_validation with multiple folds, and see if you can get accuracy above 90%.  

In [31]:
#Baseline
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3062) (3190,)




Training set score: 0.9545454545454546

Test set score: 0.8843984962406015


In [32]:
len(lr.coef_[0])

3062

In [33]:
#How many of those are actually over zero?
len([x for x in lr.coef_[0] if x > 0])

1015

In [34]:
print(word_counts.loc[0,'text_sentence'])
print([x.pos_ for x in word_counts.loc[0,'text_sentence']])

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
['PROPN', 'VERB', 'VERB', 'PART', 'VERB', 'ADV', 'ADJ', 'ADP', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'CCONJ', 'ADP', 'VERB', 'NOUN', 'PART', 'VERB', 'PUNCT', 'ADV', 'CCONJ', 'ADV', 'PRON', 'VERB', 'VERB', 'ADP', 'DET', 'NOUN', 'ADJ', 'NOUN', 'VERB', 'VERB', 'PUNCT', 'CCONJ', 'PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'PRON', 'PUNCT', 'PUNCT', 'CCONJ', 'NOUN', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'PROPN', 'PUNCT', 'ADP', 'NOUN', 'CCONJ', 'NOUN', 'PUNCT', 'PUNCT']


In [0]:
#Add counts of key types
word_counts['sent_len'] = [len(x) for x in word_counts.str_sentence]
word_counts['cnt_verbs'] = [sum([1 for x in j if x.pos_ == 'VERB']) for j in word_counts.text_sentence]
word_counts['cnt_adj'] = [sum([1 for x in j if x.pos_ == 'ADJ']) for j in word_counts.text_sentence]
word_counts['cnt_prop'] = [sum([1 for x in j if x.pos_ == 'PROPN']) for j in word_counts.text_sentence]
word_counts['cnt_punct'] = [sum([1 for x in j if x.pos_ == 'PUNCT']) for j in word_counts.text_sentence]
word_counts['cnt_adv'] = [sum([1 for x in j if x.pos_ == 'ADV']) for j in word_counts.text_sentence]
word_counts['cnt_nouns'] = [sum([1 for x in j if x.pos_ == 'NOUN']) for j in word_counts.text_sentence]
word_counts['crude_sentiment'] = [j.sentiment for j in word_counts.text_sentence]




In [0]:
#Redefine train/test splits
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source',
                               'str_sentence'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

In [37]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3070) (3190,)
Training set score: 0.9554858934169279

Test set score: 0.8956766917293233




In [38]:
#Interestingly, th added columns reduces how many features are used by Lasso Regression
len([x for x in lr.coef_[0] if x > 0])

1004

In [39]:
len(lr.coef_[0])

3070

In [40]:
len(word_counts.columns[3:])

3070

In [41]:
#Remove useless columns, then try one of the other models
colsToUse = []

for i, col in enumerate(word_counts.columns[3:]):
  if lr.coef_[0][i] >0:
    colsToUse.append(col)
#confirm:
len(colsToUse)

1004

In [0]:
#Redefine train/test splits
Y = word_counts['text_source']
X = np.array(word_counts[colsToUse])

X_shorttrain, X_shorttest, y_shorttrain, y_shorttest = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

In [43]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_shorttrain, y_shorttrain)

print('Training set score:', clf.score(X_shorttrain, y_shorttrain))
print('\nTest set score:', clf.score(X_shorttest, y_shorttest))

Training set score: 0.8677115987460815

Test set score: 0.8491541353383458


#Analysis 0:
By adding features related to the types of tokens in each span, Logistic Regression with Lasso regularization was able to get 89.6% accuracy on a test set, continuing to outperform other models types with this dataset

# Challenge 1:
Find out whether your new model is good at identifying Alice in Wonderland vs any other work, Persuasion vs any other work, or Austen vs any other work.  This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it.

In [44]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [45]:
#Can this differentiate Paradise from Alice in Wonderland?

md = gutenberg.raw('milton-paradise.txt')
md = re.sub(r'VOLUME \w+', '', md)
md = re.sub(r'CHAPTER \w+', '', md)
md = text_cleaner(md)
print(type(md))
print(md[:100])

<class 'str'>
Book I Of Man's first disobedience, and the fruit Of that forbidden tree whose mortal taste Brought 


In [0]:
# Parse our cleaned data

# This book  is quite long, let's cut it down to the same length as Alice.
mdabb = md[0:len(alice)]
md_doc = nlp(mdabb)

In [0]:
md_sents = [[sent, "Milton"] for sent in md_doc.sents]

In [0]:
mdf = pd.DataFrame(md_sents)
mdf2 = pd.DataFrame()
mdf2['text_sentence'] = mdf[0]
mdf2['text_source'] = mdf[1]


In [0]:
#String version of the spacy objects
mdf2['str_sentence'] = [_.text for _ in mdf2.text_sentence]

In [0]:
X = vectorizer.fit_transform(mdf2['str_sentence'])
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [55]:
para_bow = pd.concat([mdf2,df2], axis=1)
print(para_bow.shape)
para_bow.head()

(852, 3065)


Unnamed: 0,text_sentence,text_source,str_sentence,'s,-PRON-,1,`,a,abide,ability,...,you,you'd,young,your,yours,youth,zeal,zealand,zealous,zigzag
0,"(Book, I, Of, Man, 's, first, disobedience, ,,...",Milton,"Book I Of Man's first disobedience, and the fr...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"(And, chiefly, thou, ,, O, Spirit, ,, that, do...",Milton,"And chiefly thou, O Spirit, that dost prefer B...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"(Say, first, for, Heaven, hides, nothing, from...",Milton,Say first for Heaven hides nothing from thy vi...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"(Who, first, seduced, them, to, that, foul, re...",Milton,Who first seduced them to that foul revolt?,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"(Th, ', infernal, Serpent, ;, he, it, was, who...",Milton,"Th' infernal Serpent; he it was whose guile, S...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
df = pd.DataFrame()
df['text_sentence'] = sentences[0]
df['text_source'] = sentences[1]
#String version of the spacy objects
df['str_sentence'] = [_.text for _ in df.text_sentence]
X = vectorizer.fit_transform(df['str_sentence'])
df2 =  pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
word_counts = pd.concat([df,df2], axis=1)
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source',
                               'str_sentence'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
lr = LogisticRegression()
train = lr.fit(X_train, y_train)



In [62]:
# Combine the paradise sentence data with the Alice data from the test set.
X_para_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    para_bow.drop(['text_sentence','text_source', 'str_sentence'], 1)
), axis=0)
y_para_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Milton'] * para_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_para_test, y_para_test))
lr_para_predicted = lr.predict(X_para_test)
pd.crosstab(y_para_test, lr_para_predicted)


Test set score: 0.17211796246648794


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,692,321
Milton,693,159


This classifier appears to be better at identifying Austen than it is at differentiating Carroll.