In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
import gensim



In [2]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub(r'Chapter \w+','',text)
    text = re.sub(r'CHAPTER \w+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
  
    # Get rid of extra whitespace.
    text = ' '.join(text.split())

    
    return text

# returns list of documents with NLP
def load_spacy(list_of_docs):
    
    # load spacy
    print('Running Spacy...')
    nlp = spacy.load('en')
    nlp.max_length = 2000000
    
    # set empty list; holds processed list of docs
    nlp_docs = []
    
    for i, doc in enumerate(list_of_docs):
      print('Processing {}'.format(doc_names[i]))
      nlp_docs.append(nlp(doc))
    print('Done processing')
    return nlp_docs

In [3]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
# load documents
doc_names, docs = [], []
for name in gutenberg.fileids():
  doc_names.append(str(name))
  
print('Getting documents..')
for name in doc_names:
  clean_doc = text_cleaner(gutenberg.raw(name))
  if len(clean_doc) >= 2000000:
    print('length of {} ({}) is too long, trimming.'.format(name, len(clean_doc)))
    clean_doc = clean_doc[:1500000]
  docs.append(clean_doc)

Getting documents..
length of bible-kjv.txt (4227119) is too long, trimming.


In [5]:
# # run spacy
# print('Running Spacy...')
# nlp = spacy.load('en')
# nlp.max_length = 2000000
# for i, doc in enumerate(docs):
#   print('Processing {}'.format(doc_names[i]))
#   nlp_docs.append(nlp(doc))
# print('Done processing')

nlp_docs = load_spacy(docs)

Running Spacy...
Processing austen-emma.txt
Processing austen-persuasion.txt
Processing austen-sense.txt
Processing bible-kjv.txt
Processing blake-poems.txt
Processing bryant-stories.txt
Processing burgess-busterbrown.txt
Processing carroll-alice.txt
Processing chesterton-ball.txt
Processing chesterton-brown.txt
Processing chesterton-thursday.txt
Processing edgeworth-parents.txt
Processing melville-moby_dick.txt
Processing milton-paradise.txt
Processing shakespeare-caesar.txt
Processing shakespeare-hamlet.txt
Processing shakespeare-macbeth.txt
Processing whitman-leaves.txt
Done processing


In [6]:
# put the tokens of a doc in a list
def convert_to_tokens(doc):
  token_list = []
  for token in doc:
    if not token.is_stop and not token.is_punct:
      token_list.append(token.lemma_.lower())
  return token_list

In [7]:
# list of documents tokens and sentences
all_docs = []

for doc in nlp_docs:
    all_docs.append(convert_to_tokens(doc))

## Generate Dictionary

In [8]:
# create dictionary from list of document
dic = gensim.corpora.Dictionary(all_docs)
dic.filter_n_most_frequent(3)
# don't want words that exist in almost all documents
dic.filter_extremes(no_above=0.95)

# create bag of words representation for each document
corpus = [dic.doc2bow(doc) for doc in all_docs]
for doc in corpus:
  print(doc[:15])

[(0, 2), (1, 1), (2, 3), (3, 72), (4, 1), (5, 7), (6, 3), (7, 8), (8, 3), (9, 5), (10, 1), (11, 16), (12, 12), (13, 4), (14, 33)]
[(1, 1), (2, 3), (3, 30), (4, 1), (5, 1), (6, 1), (7, 5), (11, 9), (12, 5), (13, 3), (14, 5), (16, 6), (17, 1), (18, 1), (21, 3)]
[(0, 3), (2, 12), (3, 47), (4, 5), (6, 8), (7, 9), (9, 3), (10, 1), (11, 11), (12, 3), (13, 3), (14, 10), (16, 2), (17, 4), (18, 2)]
[(0, 15), (1, 31), (2, 1), (3, 55), (4, 38), (5, 5), (6, 3), (7, 33), (12, 1), (18, 9), (19, 1), (20, 7), (21, 2), (23, 21), (24, 2)]
[(7, 1), (40, 1), (63, 1), (96, 1), (98, 7), (102, 6), (107, 1), (110, 1), (117, 1), (136, 1), (137, 3), (138, 11), (145, 1), (146, 4), (151, 1)]
[(1, 1), (3, 13), (6, 2), (12, 1), (15, 1), (16, 1), (19, 1), (37, 2), (40, 1), (42, 1), (48, 5), (49, 3), (55, 1), (58, 2), (66, 2)]
[(2, 1), (3, 1), (23, 1), (34, 1), (43, 2), (53, 4), (65, 3), (71, 2), (73, 6), (74, 1), (88, 40), (89, 2), (94, 3), (96, 4), (98, 3)]
[(1, 1), (3, 1), (11, 1), (16, 2), (25, 1), (26, 2), (28, 

In [9]:
columns = [value[1] for value in dic.items()]
len(columns)

5384

In [10]:
rows_list = []

# name to add name to source column in row
for i, doc in enumerate(nlp_docs):
    print('Processing document {}'.format(doc_names[i]))
    # document level, searching by sentence
    for sentence in doc.sents:
        sentence_list = []
        for word in columns:
            if word in str(sentence):
                sentence_list.append(1)
            else:
                sentence_list.append(0)
        
        # now append source since columns are done
        sentence_list.append(doc_names[i])
        
        # append sentence_list as a row
        rows_list.append(list(sentence_list))
        

Processing document austen-emma.txt
Processing document austen-persuasion.txt
Processing document austen-sense.txt
Processing document bible-kjv.txt
Processing document blake-poems.txt
Processing document bryant-stories.txt
Processing document burgess-busterbrown.txt
Processing document carroll-alice.txt
Processing document chesterton-ball.txt
Processing document chesterton-brown.txt
Processing document chesterton-thursday.txt
Processing document edgeworth-parents.txt
Processing document melville-moby_dick.txt
Processing document milton-paradise.txt
Processing document shakespeare-caesar.txt
Processing document shakespeare-hamlet.txt
Processing document shakespeare-macbeth.txt
Processing document whitman-leaves.txt


In [None]:
len(row)

In [11]:
df = pd.DataFrame(sentence_list, columns=[columns, 'source'])
df.head()

ValueError: Shape of passed values is (1, 5385), indices imply (2, 5385)

## TF-IDF

In [None]:
# tf-idf
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

# store tf-idf
sims = gensim.similarities.Similarity('/usr/workdir/', tf_idf[corpus], num_features=len(dic))
print(sims)