In [1]:
import aiohttp
import asyncio
from io import BytesIO, TextIOWrapper
import zipfile
import re

A reading-friendly version of the novel parsed below (George Orwell's *Nineteen Eighty-Four*) is available at <https://ebooks.adelaide.edu.au/o/orwell/george/o79n/>.

In [2]:
# URL of the zip archive (hosted on gutenberg.net.au) that the cells below
# download and then search for .txt members.
zip_1984_url = 'http://gutenberg.net.au/ebooks01/0100021.zip'

In [3]:
@asyncio.coroutine
def get_zip(url):
    """Download ``url`` and return the response body as raw bytes.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.

    Returns
    -------
    bytes
        The full response body, returned only when the status is 200.

    Raises
    ------
    ValueError
        If the server answers with any status other than 200. The response
        is closed before the exception is raised.
    """
    r = yield from aiohttp.request('get', url)
    if r.status != 200:
        reason = r.reason
        # read_and_close() is never reached on this path, so close the
        # response explicitly instead of leaving the connection open.
        r.close()
        raise ValueError('Fetching url %s failed because %s'
                         % (url, reason))
    zip_bytes = yield from r.read_and_close()
    return zip_bytes

def read_zip_txt(zip_bytes, encoding=None):
    """Yield the decoded text of every ``.txt`` member of an in-memory zip.

    Parameters
    ----------
    zip_bytes : bytes
        Complete contents of a zip archive.
    encoding : str, optional
        Text encoding used to decode each member. The default ``None``
        leaves the choice to ``TextIOWrapper`` (the locale's preferred
        encoding), which is platform dependent; pass an explicit value
        for reproducible decoding.

    Yields
    ------
    str
        Full text of one ``.txt`` member, in ``namelist()`` order.
    """
    with zipfile.ZipFile(BytesIO(zip_bytes), 'r') as myzip:
        txt_files = [pth for pth in myzip.namelist() if pth.endswith('.txt')]
        for txt_pth in txt_files:
            # Wrap the binary member stream so it is decoded to str; closing
            # the wrapper also closes the underlying member stream.
            with TextIOWrapper(myzip.open(txt_pth, 'r'),
                               encoding=encoding) as txt_f:
                yield txt_f.read()

In [4]:
# Run the get_zip coroutine to completion on the event loop; `r` receives
# the value it returns, i.e. the raw bytes of the downloaded archive.
loop = asyncio.get_event_loop()
r = loop.run_until_complete(get_zip(zip_1984_url))

In [5]:
# Keep only the first .txt member yielded from the archive; next() raises
# StopIteration if the archive contains no .txt member at all.
txt_1984 = next(read_zip_txt(r))

In [6]:
# Bound `match` methods used to classify a block of text. `match` anchors at
# the start of the string and each pattern ends with `$`, so a block only
# matches when the heading (plus optional surrounding whitespace) is all it
# contains.
# Part heading: "PART" followed by an upper-case word.
start_of_part = re.compile(r'^\s*PART [A-Z]+\s*$').match
# Chapter heading: "Chapter" followed by a decimal number.
start_of_chapter = re.compile(r'\s*Chapter [0-9]+\s*$').match
# Marker that stops the parsing loop.
end_of_content = re.compile(r'\s*THE END\s*$').match

In [7]:
# Split the book text into chapters.
#
# The text is cut on blank lines; each resulting block is either a part
# heading, a chapter heading, the end marker, or a paragraph. The result is
# `chapters`, a dict mapping (part number, chapter number) to the list of
# that chapter's paragraphs, each joined onto a single line.
cur_part = 0
# Both are initialised up front so that a chapter heading or end marker seen
# before the first part heading cannot hit an undefined name.
cur_chapter = 0
chapter_paragraphs = []
chapters = {}

for line in txt_1984.split('\n\n'):
    if start_of_part(line):
        if cur_part == 0:
            print('Start reading the book')
        else:
            # close last chapter of the last part
            chapters[(cur_part, cur_chapter)] = chapter_paragraphs
            print('Closing part %d with %d chapters' % (cur_part, cur_chapter))
        chapter_paragraphs = []
        cur_part += 1
        cur_chapter = 0
    elif cur_part == 0:
        # Everything before the first part heading is front matter; ignore
        # it entirely, including anything that looks like a chapter heading.
        continue
    elif start_of_chapter(line):
        if cur_chapter > 0:
            # store the previous chapter's paragraphs
            chapters[(cur_part, cur_chapter)] = chapter_paragraphs
            chapter_paragraphs = []
        cur_chapter += 1
    elif end_of_content(line):
        # the final chapter of the final part
        # Yes, we skipped the APPENDIX
        chapters[(cur_part, cur_chapter)] = chapter_paragraphs
        print('Closing part %d with %d chpaters' % (cur_part, cur_chapter))
        print('Book ends')
        break
    elif line:
        # Ordinary paragraph: undo the hard line wrapping inside it.
        chapter_paragraphs.append(line.replace('\n', ' '))

Start reading the book
Closing part 1 with 8 chapters
Closing part 2 with 9 chapters
Closing part 3 with 6 chpaters
Book ends


In [8]:
# Number of (part, chapter) entries collected by the parsing loop.
len(chapters)

23

In [9]:
# Display the (part, chapter) keys in ascending order; sorted() already
# returns a new list.
sorted(chapters)

[(1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (2, 1),
 (2, 2),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (3, 1),
 (3, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (3, 6)]

In [14]:
# Sum the paragraph counts of every stored chapter.
'Total paragraphs: %d' % sum(len(paragraphs) for paragraphs in chapters.values())

'Total paragraphs: 1290'