In [46]:
import requests

# Gates the progress messages in the loops below that check it.
verbose = True

# Path/URL templates: '{}' is replaced with the volume number via str.format().
# Remote location of one volume's PDF.
base_url = 'http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-{}.pdf'
# Local path where the downloaded PDF for a volume is saved.
base_gandhi_file = '../corpus/gandhi/pdf/volume_{}.pdf'
# Local path of the pdftotext output for a volume.
base_gandhi_text = '../corpus/gandhi/text/volume_{}.txt'
# Local path of the regex-cleaned text for a volume.
base_clean_file = '../corpus/gandhi/text/clean_volume_{}.txt'
# Single file into which all cleaned volumes are concatenated.
gandhi = '../corpus/gandhi/gandhi.txt'

def donwload_pdf(url, save_name, timeout=60):
    """Download ``url`` and write the response body to ``save_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    save_name : str
        Path of the file to create; it is opened in binary write mode and
        overwritten if it already exists.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a server that stops responding
        cannot block the notebook indefinitely. Defaults to 60 seconds.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    """
    response = requests.get(url, timeout=timeout)

    # Fail before touching the disk: otherwise an HTML error page would be
    # saved under a .pdf name and only surface later, during conversion.
    response.raise_for_status()

    with open(save_name, 'wb') as f:
        f.write(response.content)

In [3]:
# Fetch volumes 1 through 98 (98 PDFs) from http://www.gandhiashramsevagram.org/
for volume in range(1, 99):
    pdf_path = base_gandhi_file.format(volume)
    if verbose:
        print('Downloading', pdf_path + '...')
    donwload_pdf(base_url.format(volume), pdf_path)

Downloading ../corpus/gandhi/pdf/volume_1.pdf...
Downloading ../corpus/gandhi/pdf/volume_2.pdf...


KeyboardInterrupt: 

In [2]:
import subprocess

# Run pdftotext once per volume: the downloaded PDF is the input and the
# matching file under the text/ directory is the output.
for volume in range(1, 99):
    pdf_path = base_gandhi_file.format(volume)
    text_path = base_gandhi_text.format(volume)

    if verbose:
        print('Converting PDF to', text_path + '...')

    completed = subprocess.run(['pdftotext', pdf_path, text_path])
    # The CompletedProcess repr includes the return code of each conversion.
    print(completed)

Converting PDF to ../corpus/gandhi/text/volume_1.txt...
CompletedProcess(args=['pdftotext', '../corpus/gandhi/pdf/volume_1.pdf', '../corpus/gandhi/text/volume_1.txt'], returncode=0)
Converting PDF to ../corpus/gandhi/text/volume_2.txt...
CompletedProcess(args=['pdftotext', '../corpus/gandhi/pdf/volume_2.pdf', '../corpus/gandhi/text/volume_2.txt'], returncode=0)
Converting PDF to ../corpus/gandhi/text/volume_3.txt...
CompletedProcess(args=['pdftotext', '../corpus/gandhi/pdf/volume_3.pdf', '../corpus/gandhi/text/volume_3.txt'], returncode=0)
Converting PDF to ../corpus/gandhi/text/volume_4.txt...
CompletedProcess(args=['pdftotext', '../corpus/gandhi/pdf/volume_4.pdf', '../corpus/gandhi/text/volume_4.txt'], returncode=0)
Converting PDF to ../corpus/gandhi/text/volume_5.txt...
CompletedProcess(args=['pdftotext', '../corpus/gandhi/pdf/volume_5.pdf', '../corpus/gandhi/text/volume_5.txt'], returncode=0)
Converting PDF to ../corpus/gandhi/text/volume_6.txt...
CompletedProcess(args=['pdftotext'

In [39]:
import re, codecs

def check_regex(regex, text):
    """Debugging aid: print each match of ``regex`` in ``text``.

    Every matched substring is printed on its own line and followed by a
    line containing ``---``. Nothing is returned.
    """
    for found in re.finditer(regex, text):
        print(found.group(), '---', sep='\n')

        
def clean_text(volumes=range(1, 99)):
    """Strip repeated page furniture from each converted volume.

    For every volume number ``i`` in ``volumes`` the text at
    ``base_gandhi_text.format(i)`` is read as ISO-8859-1, four regex
    substitutions are applied in sequence (each replacing its matches with
    the empty string), and the remaining text is written to
    ``base_clean_file.format(i)``. When ``verbose`` is true, the number of
    substitutions made by each pattern and the output path are printed.

    Parameters
    ----------
    volumes : iterable of int, optional
        Volume numbers to process. Defaults to 1 through 98.
    """
    # Patterns that do not depend on the volume number. Raw strings are used
    # so that \s, \. and \f are handed to the regex engine unchanged instead
    # of being treated as (invalid) Python string escapes.
    # Optional digits, a "THE COLLECTED WORKS OF..." line, then a form feed.
    collected_works_page_break = re.compile(r'[\n\s]*[0-9]*[\n\s]*THE COLLECTED WORKS OF.*[\n\s]*\f')
    # A newline followed by a line that starts with a lowercase letter, a
    # digit, or '|' (inside a character class '|' is a literal, not "or").
    headers_and_footers = re.compile(r'\n[a-z|0-9].*[\n\s]+')
    # Any line containing "photostat", with the whitespace around it.
    description_of_text = re.compile(r'[\n\s]*.*photostat.*[\n\s]*')

    for i in volumes:

        # Read the ith text file; the with-block closes the handle promptly.
        with codecs.open(base_gandhi_text.format(i), 'r', 'iso-8859-1') as gandhi_text_file:
            text = gandhi_text_file.read()

        # Pattern that depends on the volume number: a "VOL. <i>..." line,
        # optional digits, then a form feed.
        vol_page_break = re.compile(r'VOL\.\s*{}.*[\n\s]*[0-9]*[\n\s]*\f'.format(i))

        # Successively remove all of the text matched by our patterns.
        # To inspect what a pattern matches, call check_regex(pattern, text).
        result1, n1 = collected_works_page_break.subn('', text)
        result2, n2 = vol_page_break.subn('', result1)
        result3, n3 = headers_and_footers.subn('', result2)
        result4, n4 = description_of_text.subn('', result3)

        if verbose:
            print('deletions:', n1, n2, n3, n4)

        clean_path = base_clean_file.format(i)

        # NOTE(review): the output is written with the platform default
        # encoding, unlike the ISO-8859-1 read above - confirm this is intended.
        with open(clean_path, 'w') as clean_file:
            clean_file.write(result4)

        if verbose:
            print('wrote clean file to', clean_path)

# Run the cleaning pass, then print a completion marker.
clean_text()
print('Done')

deletions: 235 236 1009 25
wrote clean file to ../corpus/gandhi/text/clean_volume_1.txt
deletions: 242 42 903 112
wrote clean file to ../corpus/gandhi/text/clean_volume_2.txt
deletions: 252 253 834 21
wrote clean file to ../corpus/gandhi/text/clean_volume_3.txt
deletions: 240 241 900 22
wrote clean file to ../corpus/gandhi/text/clean_volume_4.txt
deletions: 246 247 1021 91
wrote clean file to ../corpus/gandhi/text/clean_volume_5.txt
deletions: 268 269 911 122
wrote clean file to ../corpus/gandhi/text/clean_volume_6.txt
deletions: 224 224 739 4
wrote clean file to ../corpus/gandhi/text/clean_volume_7.txt
deletions: 247 248 738 16
wrote clean file to ../corpus/gandhi/text/clean_volume_8.txt
deletions: 252 252 851 45
wrote clean file to ../corpus/gandhi/text/clean_volume_9.txt
deletions: 255 256 896 90
wrote clean file to ../corpus/gandhi/text/clean_volume_10.txt
deletions: 255 255 1036 165
wrote clean file to ../corpus/gandhi/text/clean_volume_11.txt
deletions: 243 243 1332 58
wrote clea

In [38]:
# Scratch cell: displays the current Unix timestamp. The value is not stored
# and no other visible cell reads it.
import time
time.time()

1530737909.627785

In [47]:
# Put everything into one big clean file.
#
# The per-volume text is bound to `volume_text` rather than `clean_text`, so
# running this cell does not overwrite the clean_text() function defined
# earlier in the notebook. Both files are opened in with-blocks so their
# handles are closed even if a read or write fails part-way through.
with open(gandhi, 'w') as gandhi_file:
    for i in range(1, 99):
        clean_path = base_clean_file.format(i)

        with open(clean_path, 'r') as clean_file:
            volume_text = clean_file.read()

        # Show the first few characters as a quick sanity check.
        print(volume_text[:10])
        gandhi_file.write(volume_text)

        print('Wrote', clean_path, 'to', gandhi)

1. A CONFE
Wrote ../corpus/gandhi/text/clean_volume_1.txt to ../corpus/gandhi/gandhi.txt
1. INTERVI
Wrote ../corpus/gandhi/text/clean_volume_2.txt to ../corpus/gandhi/gandhi.txt
1. LETTER 
Wrote ../corpus/gandhi/text/clean_volume_3.txt to ../corpus/gandhi/gandhi.txt
1. LETTER 
Wrote ../corpus/gandhi/text/clean_volume_4.txt to ../corpus/gandhi/gandhi.txt
1. LETTER 
Wrote ../corpus/gandhi/text/clean_volume_5.txt to ../corpus/gandhi/gandhi.txt
1. CIRCULA
Wrote ../corpus/gandhi/text/clean_volume_6.txt to ../corpus/gandhi/gandhi.txt
1. ROYAL A
Wrote ../corpus/gandhi/text/clean_volume_7.txt to ../corpus/gandhi/gandhi.txt
1. LATE MR
Wrote ../corpus/gandhi/text/clean_volume_8.txt to ../corpus/gandhi/gandhi.txt
1. SPEECH 
Wrote ../corpus/gandhi/text/clean_volume_9.txt to ../corpus/gandhi/gandhi.txt
1. LETTER 
Wrote ../corpus/gandhi/text/clean_volume_10.txt to ../corpus/gandhi/gandhi.txt
1. JOHANNE
Wrote ../corpus/gandhi/text/clean_volume_11.txt to ../corpus/gandhi/gandhi.txt
1. TO THE 
Wrote ..