In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [118]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
  
    # Get rid of extra whitespace.
    text = ' '.join(text.split())

    
    return text

In [3]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

## Data Cleaning & Setup

In [119]:
ball = gutenberg.raw('chesterton-ball.txt')
brown = gutenberg.raw('chesterton-brown.txt')
thursday = gutenberg.raw('chesterton-thursday.txt')

ball = text_cleaner(ball)
brown = text_cleaner(brown)
thursday = text_cleaner(thursday)

nlp = spacy.load('en')

ball_doc = nlp(ball)
brown_doc = nlp(brown)
thursday_doc = nlp(thursday)

In [120]:
ball_sents = [[sent, "Ball"] for sent in ball_doc.sents]
brown_sents = [[sent, "Brown"] for sent in brown_doc.sents]
thursday_sents = [[sent, "Thursday"] for sent in thursday_doc.sents]

sentences = pd.DataFrame(ball_sents + brown_sents + thursday_sents)
sentences.head()

Unnamed: 0,0,1
0,"(The, flying, ship, of, Professor, Lucifer, sa...",Ball
1,"(That, it, was, far, above, the, earth, was, n...",Ball
2,"(The, professor, had, himself, invented, the, ...",Ball
3,"(Every, sort, of, tool, or, apparatus, had, ,,...",Ball
4,"(For, the, world, of, science, and, evolution,...",Ball


In [121]:
len(sentences)

10859

In [8]:
labels = ['ball', 'brown', 'thursday']
text = ''

for label in labels:
    text += gutenberg.raw('chesterton-' + label + '.txt')
    
clean_text = text_cleaner(text)

In [10]:
len(clean_text)

1138076

In [9]:
# Parse the data.
nlp = spacy.load('en')
nlp.max_length = 10000000

chesterton_doc = nlp(clean_text)

# Make sure process didn't die
print('Done processing')

Done processing


In [11]:
chesterton_doc[:250]

The flying ship of Professor Lucifer sang through the skies like a silver arrow; the bleak white steel of it, gleaming in the bleak blue emptiness of the evening. That it was far above the earth was no expression for it; to the two men in it, it seemed to be far above the stars. The professor had himself invented the flying machine, and had also invented nearly everything in it. Every sort of tool or apparatus had, in consequence, to the full, that fantastic and distorted look which belongs to the miracles of science. For the world of science and evolution is far more nameless and elusive and like a dream than the world of poetry and religion; since in the latter images and ideas remain themselves eternally, while it is the whole idea of evolution that identities melt into each other as they do in a nightmare. All the tools of Professor Lucifer were the ancient human tools gone mad, grown into unrecognizable shapes, forgetful of their origin, forgetful of their names. That thing which 

In [15]:
# Create sentences; convert to lower case; exclude stop and punctuation
bow_sentences = []
for sentence in chesterton_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    bow_sentences.append(sentence)

print('There are {} sentences and {} tokens'.format(len(sentences), len(chesterton_doc)))

There are 10860 sentences and 244872 tokens


In [127]:
#remove sentence to match
bow_sentences.remove(['to', 'edmund', 'clerihew', 'bentley'])

In [131]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    bow_sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')



done!


## Tf-idf

In [None]:
#td-idf, pass param into vectorizer
X_train, X_test = train_test_split(chesterton_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

sent_tfidf = vectorizer.fit_transform(chesterton_paras)
print ('Number of features: %d' % sent_tfidf.get_shape()[1])

# split into train/test set
X_train_tf, X_test_tf = train_test_split(sent_tfidf, test_size=0.4, random_state=0)

#reshape vectorizer
X_train_tf_csr = X_train_tf.tocsr()

n = X_train_tf_csr.shape[0]
#list of dictionaries

tfidf_sent = [{} for _ in range(0, n)]

#list by features
terms = vectorizer.get_feature_names()

# for each paragraph, list feature words and tf-idf score
for i, j in zip(*X_train_tf_csr.nonzero()):
    tfidf_sent[i][terms[j]] = X_train_tf_csr[i,j]
    
print('Original sentence:', X_train[5])
print('Tf-idf vector:'), tfidf_sent[5]

In [None]:
# reduce feature set
svd = TruncatedSVD(330)
lsa = make_pipeline(svd, Normalizer(copy=False))

# run SVD on training data then project on training data
X_train_lsa = lsa.fit_transform(X_train_tf)

variance_explained = svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:", total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])