In [1]:
import requests
from PyPDF3 import PdfFileReader
import io, csv, re, os 
from tqdm import tqdm 
from bs4 import BeautifulSoup
import pandas as pd 

### Helper functions

In [12]:
"""
Replace non-standard characters and whitespace in letter text. 
"""
def clean_up_text(text):
    """Normalise the raw text extracted from a letter PDF.

    The copyright banner is removed, a handful of glyphs that show up in the
    extracted text in place of quotes and dashes are swapped for standard
    characters, and line breaks are joined with a space when the character
    right before the break (and its leading whitespace) is not one of
    ``. ? ! , ; :``.

    Args:
        text: Raw text of a letter.

    Returns:
        The cleaned-up text.
    """
    # Literal substitutions, applied in this order.
    substitutions = (
        # Copyright banner, unrelated to the letter contents
        ('© Copyright 2001 R. G. Harrison', ''),
        # Non-standard apostrophe and quote glyphs
        ('™', "'"),
        ('ﬁ', '"'),
        ('ﬂ', '"'),
        # Œ stands in for a dash in the extracted text
        ('Œ', '\u2014'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Join wrapped lines: whitespace ending in a newline becomes a single
    # space, unless the preceding character is sentence/clause punctuation.
    return re.sub(r'(?<=[^\.\?\!,;:])\s*\n', ' ', text)

""" 
Given a URL link to a PDF letter, parse the content of the letter. Paginate as needed.
"""
def fetch_letter_text(url, timeout=30):
    """Download a letter PDF and return its cleaned-up text.

    Every page of the PDF is extracted in order, each page's text is followed
    by a newline, and the combined text is passed through ``clean_up_text``.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled connection would
            block the whole scrape indefinitely.

    Returns:
        The cleaned text of all pages of the letter.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure that the request was successful

    # Wrap the downloaded bytes in a file-like object so the PDF reader can
    # seek in it; extraction must happen while the stream is still open.
    with io.BytesIO(response.content) as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        page_texts = [page.extractText() for page in reader.pages]

    # One newline after every page, including the last one.
    full_text = "".join(page_text + "\n" for page_text in page_texts)

    return clean_up_text(full_text)

""" 
Extract the content and metadata for all Van Gogh letters available online. 
"""
def extract_letters(url: str, output_file: str, timeout: float = 30):
    """Scrape the letter index page at ``url`` and save every letter to a CSV.

    The third ``<table>`` on the page is used as the index. Its first row is
    skipped (presumably the header row), and every remaining row must have
    exactly five cells: number, date, origin, sender and recipient. The first
    cell must contain a link to the letter's PDF, whose text is downloaded
    with ``fetch_letter_text``.

    Rows are written to the CSV as they are fetched, so an error or an
    interruption part-way through leaves a partial file behind.

    Args:
        url: Address of the HTML page listing the letters.
        output_file: Path of the CSV file to write; an existing file is
            overwritten.
        timeout: Seconds to wait for the index page, forwarded to
            ``requests.get`` so a stalled server cannot hang the run.

    Returns:
        None. The result is the CSV file at ``output_file`` with the columns
        Number, Date, Origin, From, To, PDF Link and Text.

    Raises:
        requests.HTTPError: If the index page answers with an error status.
        ValueError: If a row does not have five cells, or its first cell has
            no PDF link.
    """
    # Fetch the content of the webpage
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure that the request was successful

    # Parse the content using Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # The letter index is the third table on the page (index 2)
    file_table = soup.find_all('table')[2]

    # Find all table row elements and skip the first row
    rows = file_table.find_all('tr')[1:]

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the headers
        writer.writerow(['Number', 'Date', 'Origin', 'From', 'To', 'PDF Link', 'Text'])

        for row in tqdm(rows):
            cols = row.find_all('td')
            if len(cols) != 5:
                raise ValueError("Table columns are not in the correct format!")

            number, date, origin, from_person, to_person = (
                col.get_text(strip=True) for col in cols
            )

            # The PDF link lives in the first cell; fail with a clear message
            # instead of an opaque TypeError/KeyError when it is missing.
            pdf_link_tag = cols[0].find('a')
            href = pdf_link_tag.get('href') if pdf_link_tag is not None else None
            if not href:
                raise ValueError(f"No PDF link found for letter {number!r}")
            pdf_link = f"http://www.vggallery.com/letters/{href}"

            # fetch letter text
            text = fetch_letter_text(pdf_link)

            # Write the row to the CSV file
            writer.writerow([number, date, origin, from_person, to_person, pdf_link, text])

### Run extraction

In [13]:
# Try the fetch helper on a single letter and show the cleaned text
sample_url = "http://www.vggallery.com/letters/010_V-H_009a.pdf"
clean_text = fetch_letter_text(sample_url)
print(clean_text)


Original text
© Copyright 2001 R. G. Harrison
Letter 009a
London, 2 July 1873
[Letter to The Van Stockum Œ Haanebeek family; the envelope is addressed to W. J. van Stockum, Esq.,
Varkensmarkt, The Hague.]
Dear friends,
I should have liked to write sooner, and now I will not postpone it any longer. How are you? I heard that

your house has been smartened up, and that all is well with you. I hope very much you will drop me a line
when you have a moment to spare.
All is well with me. I see much that is new and beautiful, and have been fortunate in finding a good

boardinghouse, so that on the whole I feel quite at home already. Yet I do not forget The Hague, and should

very, very much like to spend an evening in the Poten, and look in on you, too.

The business here is only a stockroom, and our work is quite different from what it is in The Hague; but I

shall probably get used to it. At six o™clock my work is already done for the day, so that I have a nice bit of
time for myself, which 



In [3]:
# Make sure the output folder exists as a subdirectory of the current
# directory. exist_ok=True tolerates an already existing folder while still
# surfacing real failures (e.g. permission errors) instead of hiding them.
os.makedirs("data", exist_ok=True)

page_url = "http://www.vggallery.com/letters/combined.htm"
output_file = "data/vg_letters.csv"

# extract_letters writes the CSV as a side effect and has no return
# statement, so `letters` is None until the CSV is read back in.
letters = extract_letters(url=page_url, output_file=output_file)

 21%|██        | 183/864 [00:50<03:07,  3.62it/s]


KeyboardInterrupt: 

In [108]:
# Load the scraped letters back from the CSV at output_file
letters = pd.read_csv(output_file)

In [109]:
# Preview the first rows of the loaded table
letters.head()

Unnamed: 0,Number,Date,Origin,From,To,PDF Link,Text
0,1,"August, 1872",T-H,VvG,TvG,http://www.vggallery.com/letters/001_V-T_001.pdf,© Copyright 2001 R. G. Harrison Letter 001 The...
1,2,13 December 1872,T-H,VvG,TvG,http://www.vggallery.com/letters/002_V-T_002.pdf,© Copyright 2001 R. G. Harrison Letter 002 The...
2,3,"January, 1873",T-H,VvG,TvG,http://www.vggallery.com/letters/003_V-T_003.pdf,© Copyright 2001 R. G. Harrison Letter 003 The...
3,4,28 January 1873,T-H,VvG,TvG,http://www.vggallery.com/letters/004_V-T_004.pdf,© Copyright 2001 R. G. Harrison Letter 004 The...
4,5,17 March 1873,T-H,VvG,TvG,http://www.vggallery.com/letters/005_V-T_005.pdf,© Copyright 2001 R. G. Harrison Letter 005 The...


In [110]:
# Number of rows loaded from the CSV
len(letters)

864

In [112]:
# Show the full text of the row labelled 0
print(letters.loc[0, "Text"])

© Copyright 2001 R. G. Harrison Letter 001 The Hague, c. 18 August 1872 Dear Theo,
Many thanks for your letter, I was glad to hear you arrived home safely. I missed you the first few days and it felt strange not to find you there when I came home in the afternoons.
  We have had some enjoyable days together, and managed to take a few walks and see one or two sights between the spots of rain.
 What dreadful weather! You must have sweltered from the heat on your walks to Oisterwijk. There was harness racing yesterday for the Exhibition; but the illuminations and the fireworks were put off because of the bad weather, so it's just as well you didn't stay on to see them. Regards from the Haanebeek and Roos families.
 Always your loving Vincent.
 
