# Project Gutenberg Web Scraping
In this notebook, we scrape Project Gutenberg and Google to obtain the content of, and other information about, political philosophy texts. These will be the building blocks of an open-source Natural Language Processing (NLP) database for anyone interested in the intersection between data science and political thought.


## Table of Contents
1. Environment set-up
2. Project Gutenberg: Text Data Retrieval 
3. Google Search API: Date Retrieval

### 1. Environment set-up

In [9]:
# importing libraries
from urllib import request
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import jellyfish
import re

In [100]:
# Query the Gutendex API for works matching "hume, david".
url = 'http://gutendex.com/books?search=hume,%20david'
# The response body is JSON, so decode it directly. Passing it through an HTML
# parser first could rewrite entities or tag-like text inside the JSON strings.
# The `with` block closes the HTTP response once the body has been read.
with request.urlopen(url) as response:
    html = response.read()  # raw response bytes (JSON, despite the variable name)
site_json = json.loads(html)
books = site_json['results']

In [101]:
# Build one flat record per book attributed to David Hume.
# Every record is a NEW dict: the entries of `books` are left untouched, so this
# cell can be re-run without failing on keys removed by a previous run.
keys_to_remove = ['id', 'authors','translators', 'subjects',
                  'bookshelves', 'languages','copyright', 
                  'media_type','formats','download_count']

# Newer jellyfish releases expose the Jaro score as `jaro_similarity`;
# fall back to the older `jaro_distance` name when it is not available.
jaro = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance

books_formatted = []
for book in books:
    # An entry with no listed author cannot be attributed to Hume; skip it
    # instead of failing on `authors[0]`.
    if not book.get('authors'):
        continue
    author_details = book['authors'][0]

    # Avoid any books written by someone else
    name = author_details['name'].lower()
    if jaro(name, 'hume, david') < 0.8:
        continue

    # Keep only entries that offer a plain-text download.
    text_urls = book['formats']
    res = [val for key, val in text_urls.items() if 'text/plain' in key]
    if not res:
        continue

    topics = book['subjects'] + book['bookshelves']
    languages = book.get('languages') or [None]
    lang = languages[0]

    # Copy the fields that are kept (same key order as the source entry), then
    # append the author fields and the derived columns.
    record = {key: val for key, val in book.items() if key not in keys_to_remove}
    record.update(author_details)
    record.update({'text_url': res[0], 'topics': topics, 'language': lang})
    books_formatted.append(record)

In [103]:
# Flatten the list of per-book records into a DataFrame.
hume_texts = pd.json_normalize(data=books_formatted)

### 2. Project Gutenberg: Text Data Retrieval

In [107]:
# Downloading and boilerplate-stripping gathered into a single class
class GutenbergTextRetrieveal():
    ''' 
    A class extracting the details and content of texts 
    pulled from Project Gutenberg

    Parameters
    ----------
    links : iterable of str
        URLs of the texts to download.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).
    '''
    # Header line that precedes the book body. Both the "THIS" and "THE"
    # wordings are accepted, with or without the leading asterisks; the rest of
    # the marker line (title, trailing asterisks) is consumed as well.
    _START_MARKER = re.compile(
        r'(?:\*{3}\s*)?START OF (?:THIS|THE) PROJECT GUTENBERG E-?BOOK[^\r\n]*')
    # Footer that follows the book body, e.g. "End of the Project Gutenberg
    # EBook ..." or "*** END OF THE PROJECT GUTENBERG EBOOK ... ***".
    _END_MARKER = re.compile(
        r'(?:\*{3}\s*)?END OF (?:(?:THIS|THE) )?PROJECT GUTENBERG',
        re.IGNORECASE)

    def __init__(self, links, timeout=30):
        self.links = links
        self.timeout = timeout

    @classmethod
    def _extract_body(cls, raw_text):
        '''
        Return the part of ``raw_text`` between the start and end markers.

        A missing start marker means the body starts at the beginning of the
        text; a missing end marker means it runs to the end of the text. This
        avoids slicing with the -1 that ``str.find`` returns on failure.
        '''
        start_match = cls._START_MARKER.search(raw_text)
        start = start_match.end() if start_match else 0
        # Look for the footer only after the header so that an earlier
        # occurrence of the phrase cannot truncate the body.
        end_match = cls._END_MARKER.search(raw_text, start)
        end = end_match.start() if end_match else len(raw_text)
        return raw_text[start:end]

    def text_scraper(self):
        '''
        Download every link and return the list of extracted text bodies.

        The list is also stored on ``self.texts`` and has one entry per link,
        in the same order as ``self.links``.

        Raises
        ------
        requests.HTTPError
            If a download returns an HTTP error status, so that an error page
            is never stored as book content.
        '''
        self.texts = []
        for link in self.links:
            # The timeout keeps a stalled connection from hanging the notebook.
            req = requests.get(link, timeout=self.timeout)
            req.raise_for_status()
            soup = BeautifulSoup(req.content, "html.parser")
            self.texts.append(self._extract_body(soup.text))
        return self.texts

# Download URLs collected in the `text_url` column
links = [link for link in hume_texts['text_url']]


## Text decoding UTF-8 & ASCII characters
# Fetch every text and attach it to the frame as a new column.
ret = GutenbergTextRetrieveal(links)
texts = ret.text_scraper()
hume_texts['text_content'] = texts
hume_texts

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [22]:
# Think about the inclusion of paragraphs 
# req = requests.get('https://www.gutenberg.org/files/4320/4320-h/4320-h.htm')
# soup = BeautifulSoup(req.content, "html.parser")
# paragraph=soup.find_all("p")
# for para in paragraph:
#     print(para.text)

### 3. Google Search API: Date Retrieval