In [4]:
import glob
import nltk
import numpy as np
import os
import pandas as pd
import random
import seaborn as sns
import string
from itertools import islice

def printNDict(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list.

    Despite the name, nothing is printed; the leading items are returned.
    """
    leading_items = islice(iterable, n)
    return [item for item in leading_items]

def extract_content(path, encoding=None):
    """Read a text file and return its entire content as one string.

    Parameters
    ----------
    path : str
        Location of the file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform-dependent default encoding; pass e.g. ``'utf-8'`` to make
        the result independent of the machine's locale.

    Returns
    -------
    str
        The full content of the file.
    """
    with open(path, 'r', encoding=encoding) as book:
        return book.read()

def remove_gutenberg_text(content):
    """Strip the Project Gutenberg header and footer from a book's text.

    The text is split on newlines and empty lines are dropped. Only the
    lines found strictly between a start-marker line and the following
    end-marker line are kept; the marker lines themselves are excluded.

    Marker lines are recognised whether or not whitespace separates the
    leading asterisks from the keyword, so both ``***START OF ...`` and
    ``*** START OF ...`` (and likewise for ``END OF``) are handled.

    Parameters
    ----------
    content : str
        Full text of a book, including the Gutenberg boilerplate.

    Returns
    -------
    str
        The retained lines joined with ``'\\n'``. An empty string is
        returned when no start marker is present.
    """
    START_PREFIX = '***START OF'
    END_PREFIX = '***END OF'

    def _has_prefix(paragraph, prefix):
        # Collapse any whitespace right after the leading asterisks so that
        # '*** START OF' is compared in the same form as '***START OF'.
        if not paragraph.startswith('***'):
            return False
        return ('***' + paragraph[3:].lstrip()).startswith(prefix)

    include = False
    non_gutenberg_paragraphs = []

    for paragraph in content.split('\n'):
        if paragraph == '':
            continue

        # Check the end marker first so the marker line itself is not kept.
        if _has_prefix(paragraph, END_PREFIX):
            include = False

        if include:
            non_gutenberg_paragraphs.append(paragraph)

        # Check the start marker last so the marker line itself is not kept.
        if _has_prefix(paragraph, START_PREFIX):
            include = True

    return '\n'.join(non_gutenberg_paragraphs)

def extract_sentences(book):
    """Split a book's text into sentences.

    Delegates to ``nltk.tokenize.sent_tokenize`` and returns its result
    unchanged.
    """
    sentence_list = nltk.tokenize.sent_tokenize(book)
    return sentence_list

def tokenize(sentence):
    """Split a sentence into word and punctuation tokens.

    Delegates to ``nltk.tokenize.word_tokenize`` and returns its result
    unchanged.
    """
    token_list = nltk.tokenize.word_tokenize(sentence)
    return token_list

def lookup_token(token, vocabulary):
    """Map a token to its vocabulary entry, case-insensitively.

    The token is lower-cased before the lookup. When the lower-cased token
    is missing (or maps to ``None``), the entry stored under the
    module-level ``OOV`` key is returned instead.
    """
    index = vocabulary.get(token.lower())
    # Compare against None explicitly: index 0 is a valid, falsy hit.
    return vocabulary[OOV] if index is None else index

def lookup_index(index, indexed_vocabulary):
    """Return the entry stored at position ``index`` of ``indexed_vocabulary``."""
    token = indexed_vocabulary[index]
    return token

def encode_document(document, vocabulary):
    """Encode each token of ``document`` via ``lookup_token``.

    Returns a list with one looked-up value per token, in document order.
    """
    encoded = []
    for token in document:
        encoded.append(lookup_token(token, vocabulary))
    return encoded

def decode_document(encoded_document, indexed_vocabulary):
    """Decode each index of ``encoded_document`` via ``lookup_index``.

    Returns a list with one looked-up entry per index, in document order.
    """
    decoded = []
    for index in encoded_document:
        decoded.append(lookup_index(index, indexed_vocabulary))
    return decoded

def calculate_frequencies(encoded_document, vocabulary):
    """Count how often each vocabulary index occurs in ``encoded_document``.

    The result of ``np.bincount`` is padded so that it has at least
    ``len(vocabulary)`` entries.
    """
    vocabulary_size = len(vocabulary)
    counts = np.bincount(encoded_document, minlength=vocabulary_size)
    return counts



# Word list from the NLTK `words` corpus, folded to lower case.
english_words = nltk.corpus.words.words()
lower_english_words = set(entry.lower() for entry in english_words)

# Non-word tokens: every ASCII punctuation character plus the
# out-of-vocabulary placeholder.
punctuation_tokens = set(string.punctuation)
OOV = '<oov>'
non_word_tokens = punctuation_tokens | {OOV}

# Sorted token list: a token's position in this list is its index.
indexed_base_vocabulary = sorted(lower_english_words | non_word_tokens)

# Reverse mapping from token to its position in the sorted list.
base_vocabulary = {
    token: position
    for position, token in enumerate(indexed_base_vocabulary)
}

# First ten (token, index) pairs, kept for inspection.
n_items = printNDict(10, base_vocabulary.items())


In [23]:
# Directories, relative to the working directory, that hold each author's
# books and a `metadata.tsv` file.
DARWIN_DIR = 'darwin'
DICKENS_DIR = 'dickens'

# Load each corpus's tab-separated metadata. The frames are bound to names
# so they can be inspected or reused instead of being read and discarded.
darwin_metadata = pd.read_csv(os.path.join(DARWIN_DIR, 'metadata.tsv'), delimiter='\t')
dickens_metadata = pd.read_csv(os.path.join(DICKENS_DIR, 'metadata.tsv'), delimiter='\t')

def build_corpora(vocabulary, directories=None):
    """Build the encoded sentence corpus for each labelled directory.

    For every ``(label, directory)`` pair, each ``*.txt`` file in the
    directory is read, stripped of its Project Gutenberg boilerplate, split
    into sentences, tokenized, and encoded against ``vocabulary``.

    Parameters
    ----------
    vocabulary : dict
        Mapping handed to ``encode_document`` for every sentence.
    directories : iterable of (str, str), optional
        ``(label, directory)`` pairs to process. Defaults to the Darwin and
        Dickens directories named by ``DARWIN_DIR`` and ``DICKENS_DIR``.

    Returns
    -------
    dict
        Maps each label to a list of encoded sentences, one entry per
        sentence, ordered by book path and then by position in the book.
    """
    if directories is None:
        directories = [('darwin', DARWIN_DIR), ('dickens', DICKENS_DIR)]

    corpora = {}
    for label, directory in directories:
        # glob.glob yields paths in arbitrary, OS-dependent order; sorting
        # makes the sentence order reproducible across runs and machines.
        book_paths = sorted(glob.glob(os.path.join(directory, '*.txt')))

        encoded_sentences = []
        for path in book_paths:
            book = remove_gutenberg_text(extract_content(path))
            for sentence in extract_sentences(book):
                document = tokenize(sentence)
                encoded_sentences.append(encode_document(document, vocabulary))

        corpora[label] = encoded_sentences

    return corpora

# Scratch re-run of the first stages of `build_corpora` at module level.
directories = [('darwin', DARWIN_DIR), ('dickens', DICKENS_DIR)]

# (label, list of *.txt paths) for each corpus directory.
book_paths = [
    (label, glob.glob(os.path.join(directory, '*.txt')))
    for label, directory in directories
]

# (label, lazy generator of cleaned book texts); nothing is read from disk
# until a generator is iterated.
books = [
    (label, (remove_gutenberg_text(extract_content(path)) for path in paths))
    for label, paths in book_paths
]

# Kept as a lazy `map`: each item pairs a label with a generator of sentences.
sentences = map(
    lambda labelled: (
        labelled[0],
        (sentence for book in labelled[1] for sentence in extract_sentences(book)),
    ),
    books,
)

# Prints only the map object's repr; the pipeline is not consumed here.
print(sentences)

<map object at 0x11dcbb908>
