In [1]:
import requests

# When truthy, the download / convert / clean steps print a status line per volume.
verbose = True

# Templates below take the volume number through str.format in the functions that use them.
# Remote location of one volume's PDF.
base_url = 'http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-{}.pdf'
# Local path the downloaded PDF is saved to (and later handed to pdftotext).
base_gandhi_file = '../corpus/gandhi/pdf/volume_{}.pdf'
# Local path of the plain-text conversion of one volume.
base_gandhi_text = '../corpus/gandhi/text/volume_{}.txt'
# Local path of the cleaned text of one volume.
base_clean_file = '../corpus/gandhi/text/clean_volume_{}.txt'
# Single file that all cleaned volumes are concatenated into.
gandhi = '../corpus/gandhi/gandhi.txt'

def donwload_pdf(url, save_name, timeout=60):
    """
    Download the resource at `url` and write its raw bytes to `save_name`.

    The misspelled function name is kept as-is because callers use it.

    :param url: Address of the file to fetch.
    :param save_name: Path of the local file to write; opened in binary mode and overwritten.
    :param timeout: Seconds to wait on the server before giving up (forwarded to requests.get).
    :raises requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)

    # Without this check an HTML error page would be silently saved under a .pdf name
    response.raise_for_status()

    with open(save_name, 'wb') as f:
        f.write(response.content)

In [91]:
# utils
import time

def progress_bar(progress, total, message='', size=20):
    """
    Prints a progress bar. Overwrites the last progress bar written by starting
    the output with a carriage return and not ending it with a newline.
    :param progress: The current progress, i.e. the number of things done so far.
        Values below 0 are treated as 0 and values above `total` as `total`.
    :param total: The total number of things you have to do to reach 100.00%
    :param message: An optional argument: the extra message to display inline.
    :param size: Optionally, the size of the progress bar to display, in characters.
    """
    # Nothing is printed when the inputs cannot describe a bar
    if total <= 0 or size < 0:
        return

    # Clamp so the drawn bar and the printed percentage always agree
    progress = min(max(progress, 0), total)

    # Fraction done, and how many bar cells that fraction fills
    perc = float(progress) / float(total)
    cur = int(perc * size)

    # Filled part followed by padding up to the requested width
    bar = '-' * cur + ' ' * (size - cur)

    # flush=True because no newline is written, so a line-buffered stdout
    # would otherwise hold the bar back
    print("\r" + '[' + bar + '] {:.2f}% complete'.format(perc * 100), message, end="", flush=True)

    # Output:
    # [-----------------------------           ] 73.00% complete

In [2]:
def download_gandhi():
    """
    Fetch every volume's PDF and store it locally.

    Iterates over volume numbers 1 through 98, filling the number into
    `base_url` for the remote address and into `base_gandhi_file` for the
    destination path. Prints one status line per volume when `verbose` is set.
    """
    # The site http://www.gandhiashramsevagram.org/ hosts 98 pdfs, numbered from 1
    volume_numbers = range(1, 99)

    for volume in volume_numbers:
        source_url = base_url.format(volume)
        destination = base_gandhi_file.format(volume)

        if verbose:
            print('Downloading', destination + '...')

        donwload_pdf(source_url, destination)

# download_gandhi()

In [3]:
import subprocess

def pdf_to_text():
    """
    Convert each downloaded volume (1 through 98) from PDF to plain text.

    Runs the external `pdftotext` command once per volume, reading
    base_gandhi_file.format(i) and writing base_gandhi_text.format(i).
    A failed conversion is reported and the loop moves on to the next volume.
    """
    for i in range(1, 99):

        gandhi_pdf_file = base_gandhi_file.format(i)
        gandhi_text_file = base_gandhi_text.format(i)

        if verbose:
            print('Converting PDF to', gandhi_text_file + '...')

        # Convert the ith pdf into a txt file and save it in the txt/ directory
        result = subprocess.run(['pdftotext', gandhi_pdf_file, gandhi_text_file])

        if result.returncode != 0:
            # Always surface failures, even in quiet mode, so a missing or
            # broken volume is not overlooked
            print('pdftotext exited with code', result.returncode, 'for', gandhi_pdf_file)
        elif verbose:
            print(result)
        
# pdf_to_text()

In [99]:
import re, codecs

# Quiet mode: clean_text() draws a progress bar instead of printing per-volume details.
verbose = False
# Exclusive end of range(1, total), i.e. volumes 1 through 98 are processed.
total = 99

def check_regex(regex, text):
    """
    Debugging aid: print every match of `regex` found in `text`.

    Each matched string is printed on its own line, followed by a '---'
    separator line.
    """
    for found in re.finditer(regex, text):
        print(found.group(0), '---', sep='\n')
        
def line(pattern, br=r'[\n]+[\s]*', end=''):
    """
    Compile a regex that matches `pattern` together with the line breaks around it.

    The compiled expression is `br`, then `pattern`, then `br` again, then `end`.
    With the default `br` (one or more newlines followed by any whitespace) the
    pattern therefore has to be preceded and followed by a line break, and a
    substitution removes those breaks along with the pattern.

    :param pattern: Regex source for the content of the line.
    :param br: Regex source used on both sides of `pattern`.
    :param end: Optional regex source appended after the trailing `br`.
    :return: The compiled regular expression.
    """
    # The default for `br` is a raw string so the regex escapes reach the re
    # module untouched instead of relying on Python passing unknown escapes through
    return re.compile(br + pattern + br + end)

        
def clean_text():
    """
    Remove page furniture from every converted volume and write the cleaned text.

    For each volume number i in range(1, total) the file
    base_gandhi_text.format(i) is read (decoded as ISO-8859-1), the removal
    patterns defined below are applied in order, and the text is written to
    base_clean_file.format(i).

    Relies on the module-level names `total`, `verbose`, `base_gandhi_text`,
    `base_clean_file`, `line` and `progress_bar`. When `verbose` is set the
    deletion counts of every pass are printed; otherwise a progress bar is drawn.
    """

    # Define the patterns we will match that don't depend on the volume.
    # Raw strings keep regex escapes such as \s out of Python's own escape handling.
    collected_works_page_break = line(r'[0-9]*[\n\s]*THE COLLECTED WORKS OF.*[\n\s]*', end='\f')
    headers_and_footers = re.compile(r'\n[a-z|0-9].*[\n\s]+')
    description_of_text = line('.*photostat.*')
    only_uppercase_and_num = line('[A-Z0-9]+')
    short = line('.{,20}')

    # `total` is the exclusive end of the range, so one fewer volume is processed
    volume_count = total - 1

    for i in range(1, total):

        # Read the ith text file; the with-block closes the handle right away
        with codecs.open(base_gandhi_text.format(i), 'r', 'iso-8859-1') as gandhi_text_file:
            text = gandhi_text_file.read()

        # Define the regex that depend on the volume number
        vol_page_break = re.compile(r'VOL\.\s*{}.*[\n\s]*[0-9]*[\n\s]*\f'.format(i))

        # Successively remove all of the text matched by our patterns
        result1, n1 = re.subn(collected_works_page_break, '', text)
        result2, n2 = re.subn(vol_page_break, '', result1)
        result3, n3 = re.subn(headers_and_footers, '', result2)
        result4, n4 = re.subn(description_of_text, '', result3)
        result5, n5 = re.subn(only_uppercase_and_num, '', result4)
        result6, n6 = re.subn(short, '', result5)

        # For debugging
#         check_regex(only_uppercase_and_num, text)

        if verbose:
            print('\rdeletions:', n1, n2, n3, n4, n5, n6, end=" ")

        # NOTE(review): only the first four passes reach the output file; passes
        # 5 and 6 contribute nothing but their counts to the verbose report.
        # Confirm whether writing result4 rather than result6 is intended.
        cleaned = result4

        with open(base_clean_file.format(i), 'w') as clean_file:
            clean_file.write(cleaned)

        if verbose:
            print('wrote clean file to', base_clean_file.format(i), end="")
        else:
            # Measure against the number of volumes so the last one shows 100%
            progress_bar(i, volume_count, message='wrote clean file to ' + base_clean_file.format(i))

# Clean every volume. The leading '\r' sends 'Done' back to the start of the
# current line, where the progress bar (printed without a newline) was drawn.
print('Cleaning text files')
clean_text()
print('\rDone')

Cleaning text files
Done


In [93]:
import time
# Scratch cell: show the current time as seconds since the epoch.
time.time()

1531678827.3045828

In [102]:
# put everything into one big clean file
import os

# Exclusive end of range(1, total): volumes 1 through 98 are merged
total = 99

# Delete the previous one big clean file
try:
    os.remove(gandhi)
except FileNotFoundError:
    print('No previous {} file'.format(gandhi))

print('Cleaning text files')
clean_text()
print('\rDone')

# Append every cleaned volume, in order, to the single corpus file.
# The with-blocks close each handle even if a read or write fails.
with open(gandhi, 'w') as gandhi_file:
    for i in range(1, total):
        clean_path = base_clean_file.format(i)

        # Named volume_text so the clean_text() function is not overwritten,
        # which would make this cell fail when run a second time
        with open(clean_path, 'r') as clean_file:
            volume_text = clean_file.read()

        # Show the first few characters as a quick sanity check
        print(volume_text[:10], end=' ')
        gandhi_file.write(volume_text)

        print('Added all of', clean_path, 'to', gandhi)

# The per-volume clean files were only intermediates
for i in range(1, total):
    os.remove(base_clean_file.format(i))

print('Removed temp clean files.')

No previous ../corpus/gandhi/gandhi.txt file
Cleaning text files
Done
1. A CONFE Added all of ../corpus/gandhi/text/clean_volume_1.txt to ../corpus/gandhi/gandhi.txt
1. INTERVI Added all of ../corpus/gandhi/text/clean_volume_2.txt to ../corpus/gandhi/gandhi.txt
1. LETTER  Added all of ../corpus/gandhi/text/clean_volume_3.txt to ../corpus/gandhi/gandhi.txt
1. LETTER  Added all of ../corpus/gandhi/text/clean_volume_4.txt to ../corpus/gandhi/gandhi.txt
1. LETTER  Added all of ../corpus/gandhi/text/clean_volume_5.txt to ../corpus/gandhi/gandhi.txt
1. CIRCULA Added all of ../corpus/gandhi/text/clean_volume_6.txt to ../corpus/gandhi/gandhi.txt
1. ROYAL A Added all of ../corpus/gandhi/text/clean_volume_7.txt to ../corpus/gandhi/gandhi.txt
1. LATE MR Added all of ../corpus/gandhi/text/clean_volume_8.txt to ../corpus/gandhi/gandhi.txt
1. SPEECH  Added all of ../corpus/gandhi/text/clean_volume_9.txt to ../corpus/gandhi/gandhi.txt
1. LETTER  Added all of ../corpus/gandhi/text/clean_volume_10.txt 