In [20]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
import matplotlib

nltk.download('gutenberg')
!python -m spacy download 'en'

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/amyscott/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!



[93m    Linking successful[0m
    /anaconda3/lib/python3.7/site-packages/en_core_web_sm -->
    /anaconda3/lib/python3.7/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [21]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)

/anaconda3/bin/python
3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
sys.version_info(major=3, minor=7, micro=3, releaselevel='final', serial=0)


In [22]:
from nltk.corpus import gutenberg, stopwords
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [23]:
poem = gutenberg.raw('blake-poems.txt')
thursday= gutenberg.raw('chesterton-thursday.txt')

pattern = "[\[].*?[\]]"
poem = re.sub(pattern, "", poem)
thursday = re.sub(pattern, "", thursday)



In [24]:
poem = re.sub(r'Chapter \d+', '', poem)
thursday = re.sub(r'CHAPTER .*', '', thursday)

poem = ' '.join(poem.split())
thursday = ' '.join(thursday.split())


In [25]:
print('Extra whitespace removed:\n', poem[0:100])
print('Extra whitespace removed:\n', thursday[0:100])

Extra whitespace removed:
 SONGS OF INNOCENCE AND OF EXPERIENCE and THE BOOK of THEL SONGS OF INNOCENCE INTRODUCTION Piping dow
Extra whitespace removed:
 To Edmund Clerihew Bentley A cloud was on the mind of men, and wailing went the weather, Yea, a sick


In [26]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:
import spacy
nlp = spacy.load('en')
nlp.max_length

poem_doc = nlp(poem)
thursday_doc = nlp(thursday)

In [28]:
from collections import Counter

def word_frequencies(text, include_stop=True):
    
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
  
    return Counter(words)
    
poem_freq = word_frequencies(poem_doc).most_common(10)
thursday_freq = word_frequencies(thursday_doc).most_common(10)
print('Poem:', poem_freq)
print('Thursday:', thursday_freq)

Poem: [('the', 351), ('And', 176), ('and', 169), ('of', 131), ('I', 130), ('in', 116), ('a', 108), ('to', 92), ('my', 72), ('The', 61)]
Thursday: [('the', 3290), ('a', 1712), ('of', 1710), ('and', 1568), ('to', 1044), ('in', 887), ('I', 880), ('he', 858), ('that', 840), ('his', 765)]


In [29]:
poem_freq = word_frequencies(poem_doc, include_stop=False).most_common(10)
thursday_freq = word_frequencies(thursday_doc, include_stop=False).most_common(10)
print('Poem:', poem_freq)
print('Thursday:', thursday_freq)

Poem: [('And', 176), ('I', 130), ('The', 61), ("'s", 43), ('thee', 42), ('like', 29), ('thy', 28), ('thou', 28), ('THE', 27), ('little', 26)]
Thursday: [('I', 880), ('Syme', 512), ('said', 495), ('The', 325), ('man', 272), ('He', 268), ('like', 260), ("'s", 223), ('But', 161), ('It', 152)]


In [30]:
poem_common = [pair[0] for pair in poem_freq]
thursday_common = [pair[0] for pair in thursday_freq]

print('Unique to Poem:', set(poem_common) - set(thursday_common))
print('Unique to Thursday:', set(thursday_common) - set(poem_common))

Unique to Poem: {'thou', 'thee', 'thy', 'And', 'THE', 'little'}
Unique to Thursday: {'But', 'Syme', 'said', 'man', 'It', 'He'}


In [33]:
def lemma_frequencies(text, include_stop=True):
    
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
            
    return Counter(lemmas)


poem_lemma_freq = lemma_frequencies(poem_doc, include_stop=False).most_common(10)
thursday_lemma_freq = lemma_frequencies(thursday_doc, include_stop=False).most_common(10)
print('Poem:', poem_lemma_freq)
print('Thursday:', thursday_lemma_freq)

poem_lemma_common = [pair[0] for pair in poem_lemma_freq]
thursday_lemma_common = [pair[0] for pair in thursday_lemma_freq]
print('Unique to Poem:', set(poem_lemma_common) - set(thursday_lemma_common))
print('Unique to Thursday:', set(thursday_lemma_common) - set(poem_lemma_common))

Poem: [('-PRON-', 204), ('and', 179), ('the', 88), ("'s", 45), ('little', 45), ('thee', 42), ('weep', 35), ('like', 35), ('thou', 35), ('hear', 33)]
Thursday: [('-PRON-', 1712), ('syme', 516), ('say', 510), ('man', 365), ('the', 344), ('like', 268), ("'s", 164), ('look', 163), ('but', 161), ('come', 161)]
Unique to Poem: {'weep', 'thou', 'and', 'thee', 'little', 'hear'}
Unique to Thursday: {'say', 'but', 'syme', 'come', 'look', 'man'}


In [34]:
sentences = list(poem_doc.sents)
print("Poem has {} sentences.".format(len(sentences)))

example_sentence = sentences[2]
print("Here is an example: \n{}\n".format(example_sentence))

Poem has 498 sentences.
Here is an example: 
So I piped with merry cheer. "



In [35]:
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

print(("There are {} words in this sentence, and {} of them are"
       " unique.").format(len(example_words), len(unique_words)))

There are 6 words in this sentence, and 6 of them are unique.


In [36]:
def bag_of_words(text):
    
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

def bow_features(sentences, common_words):
    
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    

    for i, sentence in enumerate(df['text_sentence']):
        
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        for word in words:
            df.loc[i, word] += 1
        
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df


poemwords = bag_of_words(poem_doc)
thursdaywords = bag_of_words(thursday_doc)


common_words = set(poemwords + thursdaywords)

In [37]:
word_counts = bow_features(sentences, common_words)
word_counts.head()

ValueError: Length of values does not match length of index

In [39]:
thursday=gutenberg.paras('chesterton-thursday.txt')
#processing
thursday_paras=[]
for paragraph in thursday:
    para=paragraph[0]
    
    para=[re.sub(r'--','',word) for word in para]
    
    thursday_paras.append(' '.join(para))

print(thursday_paras[0:4])

['[ The Man Who Was Thursday by G . K . Chesterton 1908 ]', 'To Edmund Clerihew Bentley', 'A cloud was on the mind of men , and wailing went the weather , Yea , a sick cloud upon the soul when we were boys together .', 'Not all unhelped we held the fort , our tiny flags unfurled ; Some giants laboured in that cloud to lift it from the world .']


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(thursday_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=2, 
                             stop_words='english', 
                             lowercase=True, 
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )



thursday_paras_tfidf=vectorizer.fit_transform(thursday_paras)
print("Number of features: %d" % thursday_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(thursday_paras_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

Number of features: 1114
Original sentence: " I tell you it can ' t be !"
Tf_idf vector: {'tell': 1.0}
