# Scrape Bible Summary Tweets

This notebook scrapes every summary tweet from biblesummary.info, a website that offers a tweet-length summary of each chapter of the Bible, all written by a real person (Chris Juby). The content was later adapted into a book entitled "@BibleIntro - a Bible handbook for the Twitter generation".

# Import requirements

In [1]:
import sqlite3
from pathlib import Path
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import sqlalchemy
from bs4 import BeautifulSoup
%matplotlib inline

# Functions

In [2]:
def get_chapter_tweets(book_url, timeout=30):
    """
    Return the chapter numbers and summary tweets for one book page.

    Downloads ``book_url`` and collects the text of every
    ``<p class="tweet_content">`` element in document order.

    Parameters
    ----------
    book_url : str
        URL of the book page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    chapter_list : list of int
        ``1, 2, 3, ...`` with one entry per tweet found. Chapters are
        numbered by position on the page; the number is not parsed from
        the tweet's label.
    tweet_list : list of str
        Tweet text with everything up to and including the first ``': '``
        removed. A tweet that contains no ``': '`` is kept whole.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with a
        4xx/5xx status.
    """
    page = requests.get(book_url, timeout=timeout)
    # Surface HTTP errors instead of parsing an error page into empty lists.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    tweet_list = []
    for tag in soup.find_all("p", class_="tweet_content"):
        raw_text = tag.text
        _, separator, summary = raw_text.partition(': ')
        # partition() yields (raw_text, '', '') when the separator is absent;
        # fall back to the full text rather than storing an empty summary.
        tweet_list.append(summary if separator else raw_text)

    chapter_list = list(range(1, len(tweet_list) + 1))
    return chapter_list, tweet_list

# Get book URLs

In [4]:
main_url = "http://www.biblesummary.info"
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops the run on a 4xx/5xx response instead of
# silently parsing an error page into an empty book list.
page = requests.get(main_url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# create book dictionary to store urls
book_dict = {'book': [], 'url': []}
for ultag in soup.find_all("ul", class_="bible"):
    for litag in ultag.find_all("li"):
        # href=True skips anchors without a link target (would raise KeyError).
        for atag in litag.find_all("a", href=True):
            book_dict['book'].append(atag.text)
            # urljoin resolves relative, root-relative and absolute hrefs
            # correctly; plain string concatenation only works for "/path".
            book_dict['url'].append(urljoin(main_url, atag['href']))

# Get tweets for each chapter

In [10]:
# Column-oriented accumulator: one entry per chapter summary in each list.
chapter_dict = {'book': [], 'chapter': [], 'tweet_text': []}

for book_title, book_url in zip(book_dict['book'], book_dict['url']):
    book_name = str(book_title)
    chapter_list, tweet_list = get_chapter_tweets(book_url)
    # Repeat the book name so it lines up with every chapter of that book.
    chapter_dict['book'] += [book_name] * len(chapter_list)
    chapter_dict['chapter'] += chapter_list
    chapter_dict['tweet_text'] += tweet_list

chapter_tweet_df = pd.DataFrame(chapter_dict)

In [11]:
# Sanity check: print (rows, columns), then preview the first five rows.
print(chapter_tweet_df.shape)
chapter_tweet_df.head()

(1189, 3)


Unnamed: 0,book,chapter,tweet_text
0,Genesis,1,"God created the heavens, the earth and everyth..."
1,Genesis,2,God formed a man and gave him the garden in Ed...
2,Genesis,3,The serpent deceived the woman. She and Adam a...
3,Genesis,4,Eve's sons made offerings to the LORD. Only Ab...
4,Genesis,5,"Adam's line was: Seth, Enosh, Kenan, Mahalalel..."


# Define helper functions

In [12]:
def character_counter(string):
    """
    Count the characters in ``string``.

    Every character is counted, including spaces and punctuation.
    """
    n_characters = len(string)
    return n_characters

def word_counter(string):
    """
    Count the whitespace-separated words in ``string``.

    Consecutive whitespace acts as a single separator, so an empty or
    all-whitespace string has zero words.
    """
    words = string.split()
    return len(words)

In [17]:
# Add per-tweet word and character counts, then put the numeric columns
# ahead of the (long) tweet text.
column_order = ['book', 'chapter', 'tweet_words', 'tweet_characters', 'tweet_text']
chapter_tweet_df = chapter_tweet_df.assign(
    tweet_words=chapter_tweet_df['tweet_text'].apply(word_counter),
    tweet_characters=chapter_tweet_df['tweet_text'].apply(character_counter),
)[column_order]
display(chapter_tweet_df.head())

Unnamed: 0,book,chapter,tweet_words,tweet_characters,tweet_text
0,Genesis,1,23,130,"God created the heavens, the earth and everyth..."
1,Genesis,2,29,134,God formed a man and gave him the garden in Ed...
2,Genesis,3,26,131,The serpent deceived the woman. She and Adam a...
3,Genesis,4,24,133,Eve's sons made offerings to the LORD. Only Ab...
4,Genesis,5,20,130,"Adam's line was: Seth, Enosh, Kenan, Mahalalel..."


# Save this dataframe

In [18]:
# Create the output folder first: to_csv() raises if the target directory
# is missing, which happens on a fresh checkout where ../data does not exist.
output_path = Path('../data/chapter_tweets.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
chapter_tweet_df.to_csv(output_path, index=False)