In [31]:
import time
import re
import spacy
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import nltk
#nltk.download()
from nltk.corpus import gutenberg, stopwords

from collections import Counter

In [32]:
def text_cleaner(text):
    text = re.sub(r"--", "", text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = " ".join(text.split())
    return text

In [33]:
persuasion = gutenberg.raw("austen-persuasion.txt")
alice = gutenberg.raw("carroll-alice.txt")

In [34]:
persuasion = re.sub(r"Chapter \d+", "", persuasion)
alice = re.sub(r"CHAPTER .*", "", alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [35]:
nlp = spacy.load("en")
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [36]:
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

In [37]:
sentences = pd.DataFrame(alice_sents + persuasion_sents)

In [38]:
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [39]:
sentences.tail()

Unnamed: 0,0,1
5298,"(Her, spring, of, felicity, was, in, the, glow...",Austen
5299,"(Anne, was, tenderness, itself, ,, and, she, h...",Austen
5300,"(His, profession, was, all, that, could, ever,...",Austen
5301,"(She, gloried, in, being, a, sailor, 's, wife,...",Austen
5302,(Finis),Austen


In [40]:
nlp("running")[0].lemma_

'run'

In [43]:
def bag_of_words(text):
    allwords = [token.lemma_ for token in text if not token.is_punct and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(2000)]

def bow_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df["text_sentence"] = sentences[0]
    df["text_source"] = sentences[1]
    df.loc[:, common_words] = 0
    
    for i, sentence in enumerate(df["text_sentence"]):
        words = [token.lemma_ for token in sentence
        if ( not token.is_punct and not token.is_stop
             and token.lemma_ in common_words
        )]
        for word in words:
            df.loc[i, word] += 1
    
        if i % 500 == 0:
            print("Processing row {}".format(i))
    
    return df

In [44]:
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [45]:
alice_words = bag_of_words(alice_doc)
persuasion_words = bag_of_words(persuasion_doc)

common_words = set(alice_words + persuasion_words)

In [46]:
word_counts = bow_features(sentences, common_words)

Processing row 0


KeyboardInterrupt: 

In [None]:
word_counts.head()