In [1]:
import pandas as pd
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

In [9]:
import nltk
nltk.download('punkt')
from tidytext import unnest_tokens
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
BOOK_ID = 74
CHAPTER_RE = '^CHAPTER [DIVCXL]*$'
book_text = strip_headers(load_etext(BOOK_ID, mirror = 'http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/')).strip()

In [6]:
import re
book_lines = book_text.splitlines()
book_lines

['THE ADVENTURES OF TOM SAWYER',
 '',
 '',
 'By Mark Twain',
 '',
 '(Samuel Langhorne Clemens)',
 '',
 '',
 '',
 '',
 'CONTENTS',
 '',
 '',
 'CHAPTER I. Y-o-u-u Tom-Aunt Polly Decides Upon her Duty—Tom Practices',
 'Music—The Challenge—A Private Entrance',
 '',
 'CHAPTER II. Strong Temptations—Strategic Movements—The Innocents',
 'Beguiled',
 '',
 'CHAPTER III. Tom as a General—Triumph and Reward—Dismal',
 'Felicity—Commission and Omission',
 '',
 'CHAPTER IV. Mental Acrobatics—Attending Sunday—School—The',
 'Superintendent—“Showing off”—Tom Lionized',
 '',
 'CHAPTER V. A Useful Minister—In Church—The Climax',
 '',
 'CHAPTER VI. Self-Examination—Dentistry—The Midnight Charm—Witches and',
 'Devils—Cautious Approaches—Happy Hours',
 '',
 'CHAPTER VII. A Treaty Entered Into—Early Lessons—A Mistake Made',
 '',
 'CHAPTER VIII. Tom Decides on his Course—Old Scenes Re-enacted',
 '',
 'CHAPTER IX. A Solemn Situation—Grave Subjects Introduced—Injun Joe',
 'Explains',
 '',
 'CHAPTER X. The Solem

In [10]:
class DataCleaner:
    def __init__(self, book_id, chapter_re):
        self.book_id = book_id
        self.chapter_re = chapter_re
        self.book_text = strip_headers(load_etext(BOOK_ID, mirror = 'http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/')).strip()
    def init_book_lines(self):
        self.book_lines = self.book_text.splitlines()
    def get_chapter_for_lines(self, dataframe):
        chapters = []
        curr_chapter = 0
        for index, row in dataframe.iterrows():
            if re.search(CHAPTER_RE, row['content'], re.IGNORECASE):
                curr_chapter += 1
            chapters.append(curr_chapter)
        return chapters
    def init_line_df(self):
        self.line_df = pd.DataFrame({
            "content": self.book_lines,
            "line": list(range(len(self.book_lines)))
        })
        self.line_df['chapter'] = self.get_chapter_for_lines(self.line_df)
        
    def init_word_df(self):
        self.words_df = (unnest_tokens(self.lines_df, "word", "content"))
        self.words_df = self.words_df.reset_index()
        self.words_df = self.words_df.drop(columns=['index'])
        self.words_df = self.words_df[words_df.word.notnull()]
        self.words_df = self.words_df.reset_index()
        self.words_df = self.words_df.drop(columns=['index'])
        self.words_df = self.words_df[self.words_df.word.notnull()]
        self.words_df = self.words_df.reset_index()
        self.words_df = self.words_df.drop(columns=['index'])
        self.words_df = filter(self.words_df, ~_.word.isin(stopwords.words('english')))
        punctuation_signs = ['\'', '"', ',', '.', '!', '?', '/', '[', ']', '(',')', '“', '”', '’', '‘']
        self.words_df = filter(self.words_df, ~_.word.isin(punctuation_signs))
        

In [11]:
cleaner = DataCleaner(74, '^CHAPTER [DIVCXL]*$')

In [13]:
cleaner.init_book_lines()
cleaner.book_lines

['THE ADVENTURES OF TOM SAWYER',
 '',
 '',
 'By Mark Twain',
 '',
 '(Samuel Langhorne Clemens)',
 '',
 '',
 '',
 '',
 'CONTENTS',
 '',
 '',
 'CHAPTER I. Y-o-u-u Tom-Aunt Polly Decides Upon her Duty—Tom Practices',
 'Music—The Challenge—A Private Entrance',
 '',
 'CHAPTER II. Strong Temptations—Strategic Movements—The Innocents',
 'Beguiled',
 '',
 'CHAPTER III. Tom as a General—Triumph and Reward—Dismal',
 'Felicity—Commission and Omission',
 '',
 'CHAPTER IV. Mental Acrobatics—Attending Sunday—School—The',
 'Superintendent—“Showing off”—Tom Lionized',
 '',
 'CHAPTER V. A Useful Minister—In Church—The Climax',
 '',
 'CHAPTER VI. Self-Examination—Dentistry—The Midnight Charm—Witches and',
 'Devils—Cautious Approaches—Happy Hours',
 '',
 'CHAPTER VII. A Treaty Entered Into—Early Lessons—A Mistake Made',
 '',
 'CHAPTER VIII. Tom Decides on his Course—Old Scenes Re-enacted',
 '',
 'CHAPTER IX. A Solemn Situation—Grave Subjects Introduced—Injun Joe',
 'Explains',
 '',
 'CHAPTER X. The Solem