# Importing the necessary Libraries

In [1]:
import numpy as np
import pandas as pd

import math
import os
import re
import string
import random
import urllib3.request
import zipfile
import requests

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aorsot1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Web Scraping the texts from Project Gutenberg

In [2]:
# Plato - Republic
# Download the plain-text edition and keep only the span between the two markers.
rep = requests.get('https://www.gutenberg.org/cache/epub/150/pg150.txt', timeout=30)
rep.raise_for_status()  # stop on HTTP errors instead of slicing an error page
rep_text = rep.text  # read the decoded body once and reuse it for every lookup
beg = rep_text.find('BOOK I')
# Search for the closing marker only after the start of the body.
end = rep_text.find('End of the Project Gutenberg EBook', max(beg, 0))
if beg == -1 or end == -1:
    # str.find() returns -1 on a miss; slicing with -1 would silently give the wrong text.
    raise ValueError('The Republic: start or end marker not found in the downloaded text')
book1 = rep_text[beg:end]

In [3]:
# Machiavelli - The Prince
# Download the plain-text edition and keep only the span between the two markers.
prc = requests.get('https://www.gutenberg.org/files/1232/1232-0.txt', timeout=30)
prc.raise_for_status()  # stop on HTTP errors instead of slicing an error page
prc_text = prc.text  # read the decoded body once and reuse it for every lookup
# The start marker relies on the CRLF line endings present in the downloaded file.
beg = prc_text.find('DEDICATION\r\n\r\n\r\nTo the Magnificent')
# Search for the closing marker only after the start of the body.
end = prc_text.find('*** END OF THE PROJECT GUTENBERG', max(beg, 0))
if beg == -1 or end == -1:
    # str.find() returns -1 on a miss; slicing with -1 would silently give the wrong text.
    raise ValueError('The Prince: start or end marker not found in the downloaded text')
book2 = prc_text[beg:end]

In [4]:
# Hobbes - The Leviathan
# Download the plain-text edition and keep only the span between the two markers.
lvt = requests.get('https://www.gutenberg.org/files/3207/3207-0.txt', timeout=30)
lvt.raise_for_status()  # stop on HTTP errors instead of slicing an error page
lvt_text = lvt.text  # read the decoded body once and reuse it for every lookup
beg = lvt_text.find('Nature (the art whereby God hath made and governes the world)')
# Search for the closing marker only after the start of the body.
end = lvt_text.find('FINIS', max(beg, 0))
if beg == -1 or end == -1:
    # str.find() returns -1 on a miss; slicing with -1 would silently give the wrong text.
    raise ValueError('The Leviathan: start or end marker not found in the downloaded text')
book3 = lvt_text[beg:end]

In [5]:
# Locke - 2nd Treatise
# Download the plain-text edition and keep only the span between the two markers.
second = requests.get('https://www.gutenberg.org/cache/epub/7370/pg7370.txt', timeout=30)
second.raise_for_status()  # stop on HTTP errors instead of slicing an error page
second_text = second.text  # read the decoded body once and reuse it for every lookup
beg = second_text.find('PREFACE')
# Search for the closing marker only after the start of the body.
end = second_text.find('FINIS.', max(beg, 0))
if beg == -1 or end == -1:
    # str.find() returns -1 on a miss; slicing with -1 would silently give the wrong text.
    raise ValueError('Second Treatise: start or end marker not found in the downloaded text')
book4 = second_text[beg:end]

In [6]:
# Lao - Tao
# Download the plain-text edition and keep only the span between the two markers.
tao = requests.get('https://www.gutenberg.org/cache/epub/216/pg216.txt', timeout=30)
tao.raise_for_status()  # stop on HTTP errors instead of slicing an error page
tao_text = tao.text  # read the decoded body once and reuse it for every lookup
beg = tao_text.find('PART 1.')
# Search for the closing marker only after the start of the body.
end = tao_text.find('End of the Project Gutenberg EBook', max(beg, 0))
if beg == -1 or end == -1:
    # str.find() returns -1 on a miss; slicing with -1 would silently give the wrong text.
    raise ValueError('Tao Te Ching: start or end marker not found in the downloaded text')
book5 = tao_text[beg:end]

In [7]:
# Rousseau - Contract
# Download the plain-text edition and keep only the span between the two markers.
soc = requests.get('https://www.gutenberg.org/files/46333/46333-0.txt', timeout=30)
soc.raise_for_status()  # stop on HTTP errors instead of slicing an error page
soc_text = soc.text  # read the decoded body once and reuse it for every lookup
beg = soc_text.find('This little treatise')
# Search for the closing marker only after the start of the body.
end = soc_text.find('***END OF THE PROJECT GUTENBERG EBOOK', max(beg, 0))
if beg == -1 or end == -1:
    # str.find() returns -1 on a miss; slicing with -1 would silently give the wrong text.
    raise ValueError('The Social Contract: start or end marker not found in the downloaded text')
book6 = soc_text[beg:end]

In [8]:
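# Schmitt - Political
# TODO: not implemented yet; the draft below is disabled and incomplete.
# Before enabling it, store the result in its own variable: the draft assigns to
# `book2`, which would overwrite The Prince.
# scht = pd.read
# beg = scht.text.find('BOOK I')
# end = scht.text.find('End of the Project Gutenberg EBook')
# book2 = scht.text[beg:end]
# book2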

# Creating the lists with the necessary information to tabulate

In [9]:
# One record per work: (title, author, publication year, full text).
# Keeping each work's fields on one line prevents the parallel lists drifting out of step.
# NOTE(review): negative years presumably denote BCE dates; confirm.
_works = [
    ('The Republic', 'Plato', -375, book1),
    ('The Prince', 'Niccolò Machiavelli', 1532, book2),
    ('The Leviathan', 'Thomas Hobbes', 1651, book3),
    ('Second Treatise of Government', 'John Locke', 1689, book4),
    ('Tao Te Ching', 'Lao Tzu', -400, book5),
    ('The Social Contract', 'Jean-Jacques Rousseau', 1762, book6),
]

# Unpack the records into the four parallel lists used to build the table.
titles = [work[0] for work in _works]
authors = [work[1] for work in _works]
publish_dates = [work[2] for work in _works]
texts = [work[3] for work in _works]

# Utility Functions

In [10]:
# Utility functions for removing non-ASCII characters, converting to lower case, and removing stop words, HTML and punctuation from the text
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Drop English stop words from `text`.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is discarded, and the remaining tokens are joined back
    together with single spaces. Matching is exact, so tokens are compared
    as they appear after splitting.
    """
    words = text.split()
    english_stops = set(stopwords.words("english"))
    kept_words = []
    for word in words:
        if word in english_stops:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

def remove_html(text):
    """Return `text` with every `<...>` span removed.

    The match is non-greedy and does not cross line breaks, so a `<` with no
    closing `>` on the same line is left untouched.
    """
    return re.sub('<.*?>', r'', text)

def remove_punctuation(text):
    r"""Keep only the `\w+` tokens of `text`, joined by single spaces.

    Anything the tokenizer pattern does not match (punctuation, whitespace
    runs) acts as a separator and is dropped.
    """
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(words)

# Creating a dataframe

In [11]:
# Assemble one row per work from the parallel metadata lists.
books_dict = {'book_title': titles,
              'publishing_date': publish_dates,
              'authors': authors,
              'text': texts}
df = pd.DataFrame.from_dict(data=books_dict, orient='columns')

# Build the cleaned text column. Order matters:
#   1. drop non-ASCII characters and lower-case the text;
#   2. strip HTML tags while '<' and '>' are still present (running this after
#      punctuation removal would make it a no-op);
#   3. strip punctuation;
#   4. remove stop words last, so words that were attached to punctuation
#      (e.g. "the," or "and.") are matched as well.
df['text_clean'] = (
    df['text']
    .astype(str)
    .apply(_removeNonAscii)
    .apply(make_lower_case)
    .apply(remove_html)
    .apply(remove_punctuation)
    .apply(remove_stop_words)
)

# Saving to csv file

In [12]:
# Write the corpus table (including the header row) to a CSV file in the working directory.
df.to_csv(path_or_buf='political_thought_works_corpus.csv', header=True)