# Project Gutenberg Web Scraping
In this notebook, we scrape Project Gutenberg and Google to obtain the content of, and other information about, political philosophy texts. These will be the building blocks of an open-source database for Natural Language Processing (NLP), available to anyone interested in the intersection of data science and political thought. 


## Table of Contents
1. Environment set-up
2. Project Gutenberg: Text Data Retrieval 
3. Google Search API: Date Retrieval

### 1. Environment set-up

In [1]:
# importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

### 2. Project Gutenberg: Text Data Retrieval 

In [18]:
# Putting everything into a Class with different methods
class GutenbergTextRetrieveal():
    '''
    A class extracting the details and content of texts
    pulled from Project Gutenberg.

    Usage: build it with a base URL and a list of relative links, call
    ``text_scraper()`` to download and parse every page, then call
    ``data_formatting()`` to collect the results in a DataFrame.

    Parameters
    ----------
    base_url : str
        Prefix that is concatenated with every entry of ``links``.
    links : list of str (or a single str)
        Paths appended to ``base_url``, one per text to scrape.
    timeout : float, optional
        Seconds to wait for each HTTP request (default 30), so that an
        unresponsive server cannot hang the notebook forever.
    '''

    # Markers that open each header field in the text of the page.
    _TITLE_DELIM = '\r\nTitle: '
    _AUTHOR_DELIM = '\r\n\r\nAuthor: '
    _RELEASE_DELIM = '\r\nRelease Date: '
    _LANGUAGE_DELIM = '\r\nLanguage: '
    _ENCODING_DELIM = '\r\nCharacter set encoding: '

    # Markers surrounding the body of the book.
    _CONTENT_BEG = 'START OF THIS PROJECT GUTENBERG EBOOK'
    _CONTENT_END = 'End of the Project Gutenberg EBook'

    def __init__(self, base_url, links, timeout=30):
        self.links = links
        self.base_url = base_url
        self.timeout = timeout
        # Create the result lists right away so that data_formatting()
        # returns an empty frame (instead of raising AttributeError)
        # when it is called before text_scraper().
        self._reset_results()

    def _reset_results(self):
        '''Empty the lists holding the scraped details and contents.'''
        # Lists hosting text data info
        self.titles, self.authors, self.languages = [], [], []
        # List hosting text content
        self.texts = []

    @staticmethod
    def _extract_field(raw_text, start_delim, end_delim, label):
        '''
        Return the header value located between two delimiters.

        ``str.find`` returns -1 when a delimiter is absent; using that
        value as a slice bound silently produces garbage, so both
        delimiters are checked explicitly:

        * ``start_delim`` missing -> ``''`` is returned;
        * ``end_delim`` missing after the start -> the value stops at
          the end of the line on which the field begins.

        ``label`` (e.g. ``'Title: '``) and line breaks are stripped
        from the returned value.
        '''
        start = raw_text.find(start_delim)
        if start == -1:
            return ''
        # Only look for the closing delimiter after the opening one.
        end = raw_text.find(end_delim, start)
        if end == -1:
            end = raw_text.find('\r\n', start + len(start_delim))
            if end == -1:
                end = len(raw_text)
        return raw_text[start:end].replace(label, '').replace('\r\n', '')

    @classmethod
    def _extract_body(cls, raw_text):
        '''
        Return the text found between the start and end content markers.

        Returns ``''`` when the start marker is absent; when only the
        end marker is absent, everything after the start marker is kept.
        '''
        beg = raw_text.find(cls._CONTENT_BEG)
        if beg == -1:
            return ''
        beg += len(cls._CONTENT_BEG)
        end = raw_text.find(cls._CONTENT_END, beg)
        if end == -1:
            end = len(raw_text)
        return raw_text[beg:end]

    def text_scraper(self):
        '''
        Download every link and store its title, author, language and
        body in ``self.titles``, ``self.authors``, ``self.languages``
        and ``self.texts`` (one entry per link, in the order of
        ``self.links``).

        The lists are emptied first, so calling the method again does
        not duplicate rows.

        Raises
        ------
        requests.HTTPError
            If a page answers with an error status; parsing an error
            page would only add a meaningless row to the dataset.
        '''
        self._reset_results()

        # Accept a single link given as a plain string, which would
        # otherwise be iterated character by character.
        if isinstance(self.links, str):
            links = [self.links]
        else:
            links = self.links or []

        for link in links:
            req = requests.get(self.base_url + link, timeout=self.timeout)
            req.raise_for_status()
            soup = BeautifulSoup(req.content, "html.parser")
            raw_text = soup.text

            # Details on the text: each field runs from its own
            # delimiter to the delimiter of the following header line.
            self.titles.append(self._extract_field(
                raw_text, self._TITLE_DELIM, self._AUTHOR_DELIM, 'Title: '))
            self.authors.append(self._extract_field(
                raw_text, self._AUTHOR_DELIM, self._RELEASE_DELIM, 'Author: '))
            self.languages.append(self._extract_field(
                raw_text, self._LANGUAGE_DELIM, self._ENCODING_DELIM,
                'Language: '))

            # Body of the text
            self.texts.append(self._extract_body(raw_text))

    def data_formatting(self):
        '''
        Return the scraped data as a DataFrame with the columns
        ``title``, ``author``, ``language`` and ``text``, one row per
        scraped link.
        '''
        # Creating the dictionary for the dataframe structure
        books_dict = {'title': self.titles,
                      'author': self.authors,
                      'language': self.languages,
                      'text': self.texts}

        # Creating the dataframe
        df = pd.DataFrame.from_dict(data=books_dict, orient='columns')

        return df

# Root of the Project Gutenberg file tree; every link below is appended to it
base_url = 'https://www.gutenberg.org/files/'

# Relative paths of the HTML editions to scrape
# (4320 is Hume's "An Enquiry Concerning the Principles of Morals")
links = [
    '4320/4320-h/4320-h.htm',
]

# Download and parse the pages, then show the result as the cell output
ret = GutenbergTextRetrieveal(base_url=base_url, links=links)
ret.text_scraper()
ret.data_formatting()

Unnamed: 0,title,author,language,text
0,An Enquiry Concerning the Principles of Morals,David Hume,English,PRINCIPLES OF MORALS ***\r\n\r\n\r\n\r\n\r\nP...


In [22]:
# TODO: consider extracting the text paragraph by paragraph (via the <p> tags)
# req = requests.get('https://www.gutenberg.org/files/4320/4320-h/4320-h.htm')
# soup = BeautifulSoup(req.content, "html.parser")
# paragraph=soup.find_all("p")
# for para in paragraph:
#     print(para.text)

### 3. Google Search API: Date Retrieval