In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import unicodedata
import re
import pickle
from tqdm import tqdm

In [2]:
def get_page(url):
    '''
    url: str, webpage address to scrape
    page: str, webpage DOM
    '''
    response = requests.get(url)
    page = response.text
    return page

def get_soup(webpage):
    '''
    webpage: str, page to soup
    soup: bs4.BeautifulSoup object, souped page
    '''
    soup = BeautifulSoup(webpage)
    return soup

In [3]:
books_and_chs = {
'Genesis':50, 
'Exodus':40,
'Leviticus':27,
'Numbers':36,
'Deuteronomy':34,
'Joshua':24,
'Judges':21,
'Ruth':4,
'1-Samuel':31,
'2-Samuel':24,
'1-Kings':22,
'2-Kings':25,
'1-Chronicles':29,
'2-Chronicles':36,
'Ezra':10,
'Nehemiah':13,
'Esther':10,
'Job':42,
'Psalms':150,
'Proverbs':31,
'Ecclesiastes':12,
#'Song-of-Songs':8,
'Song-of-Solomon':8,
'Isaiah':66,
'Jeremiah':52,
'Lamentations':5,
'Ezekiel':48,
'Daniel':12,
'Hosea':14,
'Joel':3,
'Amos':9,
'Obadiah':1,
'Jonah':4,
'Micah':7,
'Nahum':3,
'Habakkuk':3,
'Zephaniah':3,
'Haggai':2,
'Zechariah':14,
'Malachi':4,
'Matthew':28,
'Mark':16,
'Luke':24,
'John':21,
'Acts':28,
'Romans':16,
'1-Corinthians':16,
'2-Corinthians':13,
'Galatians':6,
'Ephesians':6,
'Philippians':4,
'Colossians':4,
'1-Thessalonians':5,
'2-Thessalonians':3,
'1-Timothy':6,
'2-Timothy':4,
'Titus':3,
'Philemon':1,
'Hebrews':13,
'James':5,
'1-Peter':5,
'2-Peter':3,
'1-John':5,
'2-John':1,
'3-John':1,
'Jude':1,
'Revelation':22
}

In [10]:
range(1, books_and_chs['Revelation']+1)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

In [4]:
versions = {  #27 --> 24
'American Standard Version':'ASV',
#'English Standard Version':'ESV',
'GOD\'S WORD Translation':'GW',
'Good News Translation':'GNT',
'Holman Christian Standard Bible':'CSB',
'Jubilee Bible 2000':'JUB',
'King James Version':'KJV',
'Lexham English Bible':'LEB',
'Douay-Rhiems Catholic Bible':'RHE',
#'New American Standard Bible':'NAS',
#'New International Version':'NIV',
'New King James Version':'NKJV',
'New Living Translation':'NLT',
'New Revised Standard':'NRS',
'Revised Standard Version':'RSV',
'The Message Bible':'MSG',
'Hebrew Names Version':'HNV',
'New Century Version':'NCV',
'New International Reader\'s Version':'NIRV',
'The Bible in Basic English':'BBE',
'The Complete Jewish Bible':'CJB',
'Third Millennium Bible':'TMB',
'World English Bible':'WEB',
'Young\'s Literal Translation':'YLT',
'The Darby Translation':'DBY',
'The Webster Bible':'WBT',
'Wycliffe':'WYC'
}

#1. get links

In [5]:
def get_bible_links(versions=versions, books_and_chs=books_and_chs):

    ##may have to do try/except KeyError: continue; 
    #b/c song of songs/sol confusion...
    #or just not include CEB

    links = {}
    for name, abbr in versions.items():
        for book in books_and_chs.keys():
            for ch in range(1, books_and_chs[book]+1):
                link = 'http://www.biblestudytools.com/' + abbr + '/' + \
                        book + '/' + str(ch) + '.html'
                try:
                    links[name].append(link)
                except KeyError:
                    links[name] = [link]
    return links

In [6]:
bible_links = get_bible_links()

In [5]:
def links_by_chapter(versions=versions, books_and_chs=books_and_chs):
    links = {}
    for name, abbr in versions.items():
        for book in books_and_chs.keys():
            for ch in range(1, books_and_chs[book]+1):
                link = 'http://www.biblestudytools.com/' + abbr + '/' + \
                        book + '/' + str(ch) + '.html'
                links[(name, book, ch)] = link
    return links

In [6]:
bible_links_by_chapter = links_by_chapter()

In [None]:
bible_links_by_chapter[('American Standard Version')]

In [10]:
len(bible_links_by_chapter.values())

28536

In [11]:
test_dict = {'American Standard Version': 
['http://www.biblestudytools.com/ASV/1-Corinthians/1.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/2.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/3.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/4.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/5.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/6.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/7.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/8.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/9.html',
 'http://www.biblestudytools.com/ASV/1-Corinthians/10.html']}

In [66]:
def soupify(link_dict=bible_links):
    '''
    Params: links, a list of strings representing urls.
    Returns: soupiness, a list of "BeautifullySouped" webpages
    
    CAUTION: do not call this function more than once for a given group of links.
             Too many calls to a given website may result in blockage from said site.
    '''
    soupiness = {}
    for name, links in link_dict.items():
        for link in tqdm(links):
            soup = get_soup(get_page(link))
            try:
                soupiness[name].append(soup)
            except KeyError:
                soupiness[name] = [soup]
            time.sleep(.001)
    return soupiness

In [7]:
## Well, I'm getting kicked off. Recalibratio!

def soup_one(links):
    '''
    Params: links, a list of strings representing urls.
    Returns: soupiness, a list of "BeautifullySouped" webpages
    
    CAUTION: do not call this function more than once for a given group of links.
             Too many calls to a given website may result in blockage from said site.
    '''
    soupiness = []
    for link in tqdm(links):
        soup = get_soup(get_page(link))
        soupiness.append(soup)
        time.sleep(.002)
    return soupiness

In [7]:
def soup_for_one(name, links=bible_links_by_chapter, books_and_chs=books_and_chs):
    print name
    soupiness = {}
    for book in tqdm(books_and_chs.keys()):
        for ch in range(1, books_and_chs[book]+1):
            soup = get_soup(get_page(links[(name, book, ch)]))
            soupiness[(book, ch)] = soup
            time.sleep(.002)
    return soupiness

In [19]:
#test = soup_for_one('English Standard Version')

In [20]:
#1st by-chapter (metadata-labeled) bundle:
ASV = soup_for_one('American Standard Version')
GW = soup_for_one('GOD\'S WORD Translation')
GNT = soup_for_one('Good News Translation')
CSB = soup_for_one('Holman Christian Standard Bible')

soup_1 = {
'American Standard Version':ASV,
'GOD\'S WORD Translation':GW,
'Good News Translation':GNT,
'Holman Christian Standard Bible':CSB}



In [9]:
#2nd metadata bundle
JUB = soup_for_one('Jubilee Bible 2000')
KJV = soup_for_one('King James Version')
LEB = soup_for_one('Lexham English Bible')
RHE = soup_for_one('Douay-Rhiems Catholic Bible')

biblical_soup_2 = {
'Jubilee Bible 2000':JUB,
'King James Version':KJV,
'Lexham English Bible':LEB,
'Douay-Rhiems Catholic Bible':RHE}

                                               

Jubilee Bible 2000
King James Version

                                               


Lexham English Bible

                                               


Douay-Rhiems Catholic Bible

                                               






In [10]:
#3rd metadata bundle
NKJV = soup_for_one('New King James Version')
NLT = soup_for_one('New Living Translation')  
NRS = soup_for_one('New Revised Standard')
RSV = soup_for_one('Revised Standard Version')

soup_3 = {
'New King James Version':NKJV,
'New Living Translation':NLT,
'New Revised Standard':NRS,
'Revised Standard Version':RSV
}

                                               

New King James Version
New Living Translation

                                               


New Revised Standard

                                               


Revised Standard Version

                                               






In [8]:
#4th metadata bundle
MSG = soup_for_one('The Message Bible')
HNV = soup_for_one('Hebrew Names Version')  
NCV = soup_for_one('New Century Version')
NIRV = soup_for_one('New International Reader\'s Version')

soup_3 = {
'The Message Bible':MSG,
'Hebrew Names Version':HNV,
'New Century Version':NCV,
'New International Reader\'s Version':NIRV
}

                                               

The Message Bible
Hebrew Names Version

                                               


New Century Version

                                               


New International Reader's Version

                                               






In [9]:
#5th metadata bundle
BBE = soup_for_one('The Bible in Basic English')
CJB = soup_for_one('The Complete Jewish Bible')
TMB = soup_for_one('Third Millennium Bible')
WEB = soup_for_one('World English Bible')

soup_5 = {
    'The Bible in Basic English':BBE,
    'The Complete Jewish Bible':CJB,
    'Third Millennium Bible':TMB,
    'World English Bible':WEB
}

                                               

The Bible in Basic English
The Complete Jewish Bible

                                               


Third Millennium Bible

                                               


World English Bible

                                               






In [9]:
#6th metadata bundle
YLT = soup_for_one('Young\'s Literal Translation')
DBY = soup_for_one('The Darby Translation')
WBT = soup_for_one('The Webster Bible')
WYC = soup_for_one('Wycliffe')

soup_6 = {
    'Young\'s Literal Translation':YLT,
    'The Darby Translation':DBY,
    'The Webster Bible':WBT,
    'Wycliffe':WYC
}

                                               

Young's Literal Translation
The Darby Translation

                                               


The Webster Bible

                                               


Wycliffe

                                               






In [68]:
#test = soupify(test_dict)

In [63]:
#test['American Standard Version']

In [19]:
#1st bundle
ASV = soup_one(bible_links['American Standard Version'])



In [21]:
ESV = soup_one(bible_links['English Standard Version'])



In [22]:
GW = soup_one(bible_links['GOD\'S WORD Translation'])



In [23]:
GNT = soup_one(bible_links['Good News Translation'])



In [24]:
CSB = soup_one(bible_links['Holman Christian Standard Bible'])



In [26]:
#1st bundle
biblical_soup_1 = {
'American Standard Version':ASV,
'English Standard Version':ESV,
'GOD\'S WORD Translation':GW,
'Good News Translation':GNT,
'Holman Christian Standard Bible':CSB}

In [8]:
#2nd bundle
JUB = soup_one(bible_links['Jubilee Bible 2000'])



In [9]:
KJV = soup_one(bible_links['King James Version'])



In [10]:
LEB = soup_one(bible_links['Lexham English Bible'])



In [11]:
RHE = soup_one(bible_links['Douay-Rhiems Catholic Bible'])



In [13]:
#2nd bundle
biblical_soup_2 = {
'Jubilee Bible 2000':JUB,
'King James Version':KJV,
'Lexham English Bible':LEB,
'Douay-Rhiems Catholic Bible':RHE}

In [8]:
#3rd bundle
NAS = soup_one(bible_links['New American Standard Bible'])



In [9]:
NIV = soup_one(bible_links['New International Version'])



In [10]:
NKJV = soup_one(bible_links['New King James Version'])



In [11]:
NLT = soup_one(bible_links['New Living Translation'])



In [12]:
#3rd bundle
biblical_soup_3 = {
'New American Standard Bible':NAS,
'New International Version':NIV,
'New King James Version':NKJV,
'New Living Translation':NLT}

In [8]:
#4th bundle
NRS = soup_one(bible_links['New Revised Standard'])



In [9]:
RSV = soup_one(bible_links['Revised Standard Version'])



In [10]:
MSG = soup_one(bible_links['The Message Bible'])



In [11]:
HNV = soup_one(bible_links['Hebrew Names Version'])



In [12]:
#4th bundle
biblical_soup_4 = {
'New Revised Standard':NRS,
'Revised Standard Version':RSV,
'The Message Bible':MSG,
'Hebrew Names Version':HNV}

In [8]:
#5th bundle
NCV = soup_one(bible_links['New Century Version'])



In [9]:
NIRV = soup_one(bible_links['New International Reader\'s Version'])



In [10]:
BBE = soup_one(bible_links['The Bible in Basic English'])



In [11]:
CJB = soup_one(bible_links['The Complete Jewish Bible'])



In [12]:
#5th bundle
biblical_soup_5 = {
'New Century Version':NCV,
'New International Reader\'s Version':NIRV,
'The Bible in Basic English':BBE,
'The Complete Jewish Bible':CJB}

In [8]:
#6th bundle
TMB = soup_one(bible_links['Third Millennium Bible'])



In [9]:
WEB = soup_one(bible_links['World English Bible'])



In [10]:
YLT = soup_one(bible_links['Young\'s Literal Translation'])



In [11]:
#6th bundle
biblical_soup_6 = {
'Third Millennium Bible':TMB,
'World English Bible':WEB,
'Young\'s Literal Translation':YLT}

In [8]:
DBY = soup_one(bible_links['The Darby Translation'])



In [9]:
WBT = soup_one(bible_links['The Webster Bible'])



In [10]:
WYC = soup_one(bible_links['Wycliffe'])



In [11]:
#7th bundle
biblical_soup_7 = {
'The Darby Translation':DBY,
'The Webster Bible':WBT,
'Wycliffe':WYC}

In [None]:
#http://arxiv.org/pdf/1405.4053v2.pdf

In [75]:
#biblical_soup = soupify()

In [15]:
type(test)

dict

In [18]:
#test['American Standard Version'][0]

In [None]:
#for i, page in enumerate(soups) (w/in soupiness)

In [None]:
# def bible_links_to_pkl():
#     biblical_soup = soupify()
#     scriptures = get_scripture(biblical_soup)
#     text = get_text(scriptures)

In [None]:
biblical_soup1 = {  #27
'American Standard Version':ASV,
'English Standard Version':ESV,
'GOD\'S WORD Translation':'GW',
'Good News Translation':'GNT',
'Holman Christian Standard Bible':'CSB',
'Jubilee Bible 2000':'JUB',
'King James Version':'KJV',
'Lexham English Bible':'LEB',
'Douay-Rhiems Catholic Bible':'RHE',
'New American Standard Bible':'NAS',
'New International Version':'NIV',
'New King James Version':'NKJV',
'New Living Translation':'NLT',
'New Revised Standard':'NRS',
'Revised Standard Version':'RSV',
'The Message Bible':'MSG',
'Hebrew Names Version':'HNV',
'New Century Version':'NCV',
'New International Reader\'s Version':'NIRV',
'The Bible in Basic English':'BBE',
'The Complete Jewish Bible':'CJB',
'Third Millennium Bible':'TMB',
'World English Bible':'WEB',
'Young\'s Literal Translation':'YLT',
'The Darby Translation':'DBY',
'The Webster Bible':'WBT',
'Wycliffe':'WYC'
}

#2. get content

In [None]:
# def get_data(souped_pages=biblical_soup):
#     '''
#     Params: souped_pages, soupified Lincoln Center event pages.
#     Returns: event_data, list of lists of condensed lines.
#              Each line contains a list of strings containing data about events.
#     '''
#     events = find_lincoln_center_events(souped_pages)
#     event_data = []
#     for event_page in events:
#         unicode_text = get_text(event_page)
#         ascii_text = unicode_to_ascii(unicode_text)
#         data = condense_data_from_text(ascii_text)
#         event_data.append(data)
#     return event_data

In [None]:
#lincoln_center_event_data = get_lincoln_center_event_data(lincoln_center_soup)

In [12]:
def get_scripture(soup):
    '''
    Params: soup, list of soupified pages of Lincoln Center events.
    Returns: events, list of bs4 ResultSets of Lincoln Center event info by page.
    '''
    scriptures = {}
    for name, soups in soup.items():
        for page in tqdm(soups):
            scripture = page.find_all('div', {'class': 'scripture'})
            try:
                scriptures[name].append(scripture)
            except KeyError:
                scriptures[name] = [scripture]
    return scriptures

In [10]:
def get_scripture_w_metadata(soup):
    '''
    Params: soup, list of soupified pages of Lincoln Center events.
    Returns: events, list of bs4 ResultSets of Lincoln Center event info by page.
    '''
    bundle = {}
    for name, soups in soup.items():
        print name
        scriptures = {}
        for key, page in tqdm(soups.items()):
            scripture = page.find_all('div', {'class': 'scripture'})
            scriptures[key] = scripture
        bundle[name] = scriptures
    return bundle

In [22]:
scripts1 = get_scripture_w_metadata(soup_1)

                                                   

American Standard Version
Good News Translation

                                                   


Holman Christian Standard Bible

                                                   


GOD'S WORD Translation

                                                   






In [11]:
scripts2 = get_scripture_w_metadata(biblical_soup_2)

                                                   

Jubilee Bible 2000
Lexham English Bible

                                                   


King James Version

                                                   


Douay-Rhiems Catholic Bible

                                                   






In [12]:
scripts3 = get_scripture_w_metadata(soup_3)

                                                   

New King James Version
New Revised Standard

                                                   


New Living Translation

                                                   


Revised Standard Version

                                                   






In [10]:
scripts4 = get_scripture_w_metadata(soup_3)

                                                   

New International Reader's Version
New Century Version

                                                   


Hebrew Names Version

                                                   


The Message Bible

                                                   






In [11]:
scripts5 = get_scripture_w_metadata(soup_5)

                                                   

World English Bible
Third Millennium Bible

                                                   


The Bible in Basic English

                                                   


The Complete Jewish Bible

                                                   






In [11]:
scripts6 = get_scripture_w_metadata(soup_6)

                                                   

The Webster Bible
Young's Literal Translation

                                                   


Wycliffe

                                                   


The Darby Translation

                                                   






In [21]:
test_scriptures = get_scripture(test)
#test_scriptures['American Standard Version'][0]

In [30]:
scriptures1 = get_scripture(biblical_soup_1)

In [16]:
scriptures2 = get_scripture(biblical_soup_2)



In [15]:
scriptures3 = get_scripture(biblical_soup_3)



In [14]:
scriptures4 = get_scripture(biblical_soup_4)



In [14]:
scriptures5 = get_scripture(biblical_soup_5)



In [13]:
scriptures6 = get_scripture(biblical_soup_6)



In [13]:
scriptures7 = get_scripture(biblical_soup_7)



In [14]:
def get_text(souped_results):
    '''
    Params: souped_results, bs4.element.ResultSet for a given page. 
    Returns: text, list of lines of text from souped_results.
    '''
    texts = {}
    for name, soups in souped_results.items():
        #each version is a corpus
        corpus = []
        for page in tqdm(soups):
            #each book chapter is a document
            document = ''
            for line in page:
                document += line.text
            corpus.append(document)
        texts[name] = corpus
    return texts

In [12]:
def get_text_w_metadata(souped_results):
    bundle = {}
    for name, soups in souped_results.items():
        print name
        corpus = {}
        for key, page in tqdm(soups.items()):
            document = ''
            for line in page:
                document += line.text
            corpus[key] = document
        bundle[name] = corpus    
    return bundle

In [27]:
texts1 = get_text_w_metadata(scripts1)

                                                    

American Standard Version
Good News Translation

                                                    


Holman Christian Standard Bible

                                                    


GOD'S WORD Translation

                                                    






In [13]:
texts2 = get_text_w_metadata(scripts2)

                                                    

Jubilee Bible 2000
Lexham English Bible

                                                    


King James Version

                                                    


Douay-Rhiems Catholic Bible

                                                    






In [14]:
texts3 = get_text_w_metadata(scripts3)

                                                    

New King James Version
New Revised Standard

                                                    


New Living Translation

                                                    


Revised Standard Version

                                                     






In [12]:
texts4 = get_text_w_metadata(scripts4)

                                                   

The Message Bible
Hebrew Names Version

                                                    


New Century Version

                                                     


New International Reader's Version

                                                     






In [13]:
texts5 = get_text_w_metadata(scripts5)

                                                     

World English Bible
The Complete Jewish Bible

                                                     


The Bible in Basic English

                                                     


Third Millennium Bible

                                                     






In [13]:
texts6 = get_text_w_metadata(scripts6)

                                                    

The Webster Bible
Young's Literal Translation

                                                    


The Darby Translation

                                                    


Wycliffe

                                                    






In [25]:
test_text = get_text(test_scriptures)
#test_text['American Standard Version']

In [33]:
text1 = get_text(scriptures1)



In [18]:
text2 = get_text(scriptures2)



In [17]:
text3 = get_text(scriptures3)



In [16]:
text4 = get_text(scriptures4)



In [16]:
text5 = get_text(scriptures5)



In [15]:
text6 = get_text(scriptures6)



In [15]:
text7 = get_text(scriptures7)



In [34]:
def unicode_to_ascii(unicode_text):
    '''
    Params: unicode_text, list of lines of unicode
    Returns: ascii_text, list of lines of ascii
    '''
    ascii = {}
    for name, corpus in unicode_text.items():
        ascii_corpus = []
        for doc in tqdm(corpus):
            ascii_corpus.append(unicodedata.normalize('NFKD', doc).encode(
                    'ascii','ignore'))
        ascii[name] = ascii_corpus
    return ascii

In [33]:
test_ascii = unicode_to_ascii(test_text)
#test_ascii['American Standard Version']

In [35]:
ascii1 = unicode_to_ascii(text1)

  0%|          | 0/1189 [00:00<?, ?it/s]

TypeError: must be unicode, not str

In [41]:
# for doc in test_ascii['American Standard Version']:
#     regex = re.compile('[^a-zA-Z ]')
#     print regex.sub('', doc)

In [None]:
#ascii = unicode_to_ascii()

In [16]:
def condense_data_from_text(ascii_text):
    '''Splits lines by newline. Filters out NoneTypes (viz., empty strings).
    Params: ascii_text, list of ascii lines
    Returns: data, list of condensed lines; each line is a list of strings.
    '''
    data = {}
    for name, corpus in ascii_text.items():
        new_corpus = []
        for doc in tqdm(corpus):
            regex = re.compile('[^a-zA-Z ]')
            new_corpus.append(regex.sub('', doc))
        data[name] = new_corpus
    return data

In [14]:
def get_data_w_metadata(texts):
    bundle = {}
    for name, old_corpus in texts.items():
        print name
        corpus = {}
        for key, doc in tqdm(old_corpus.items()):
            regex = re.compile('[^a-zA-Z ]')
            corpus[key] = regex.sub('', doc)
        bundle[name] = corpus
    return bundle

In [29]:
datametadata1 = get_data_w_metadata(texts1)

                                                    

American Standard Version
Good News Translation

                                                    


Holman Christian Standard Bible

                                                     


GOD'S WORD Translation

                                                    






In [15]:
datametadata2 = get_data_w_metadata(texts2)

                                                    

Jubilee Bible 2000
Lexham English Bible

                                                     


King James Version

                                                    


Douay-Rhiems Catholic Bible

                                                    






In [16]:
datametadata3 = get_data_w_metadata(texts3)

                                                    

New King James Version
New Revised Standard

                                                    


New Living Translation

                                                    


Revised Standard Version

                                                    






In [14]:
datametadata4 = get_data_w_metadata(texts4)

                                                    

New International Reader's Version
New Century Version

                                                    


Hebrew Names Version

                                                    


The Message Bible

                                                    






In [15]:
datametadata5 = get_data_w_metadata(texts5)

                                                     

World English Bible
Third Millennium Bible

                                                    


The Bible in Basic English

                                                    


The Complete Jewish Bible

                                                    






In [15]:
datametadata6 = get_data_w_metadata(texts6)

                                                    

The Webster Bible
Young's Literal Translation

                                                     


Wycliffe

                                                    


The Darby Translation

                                                    






In [30]:
#pickling!
with open('bibles_w_metadata_1.pkl', 'w') as picklefile:
    pickle.dump(datametadata1, picklefile)

In [16]:
#pickling!
with open('bibles_w_metadata_2.pkl', 'w') as picklefile:
    pickle.dump(datametadata2, picklefile)

In [17]:
#pickling!
with open('bibles_w_metadata_3.pkl', 'w') as picklefile:
    pickle.dump(datametadata3, picklefile)

In [15]:
#pickling!
with open('bibles_w_metadata_4.pkl', 'w') as picklefile:
    pickle.dump(datametadata4, picklefile)

In [16]:
#pickling!
with open('bibles_w_metadata_5.pkl', 'w') as picklefile:
    pickle.dump(datametadata5, picklefile)

In [16]:
#pickling!
with open('bibles_w_metadata_6.pkl', 'w') as picklefile:
    pickle.dump(datametadata6, picklefile)

In [37]:
data1 = condense_data_from_text(text1)



In [20]:
data2 = condense_data_from_text(text2)



In [19]:
data3 = condense_data_from_text(text3)



In [18]:
data4 = condense_data_from_text(text4)



In [18]:
data5 = condense_data_from_text(text5)



In [17]:
data6 = condense_data_from_text(text6)



In [17]:
data7 = condense_data_from_text(text7)



In [42]:
test_data = condense_data_from_text(test_ascii)
#test_data['American Standard Version']

In [45]:
test_data['American Standard Version']

['                                    Paul called to be an apostle of Jesus Christ through the will of God and Sosthenes our brother                                                                    unto the church of God which is at Corinth even them that are sanctified in Christ Jesus called to be saints with all that call upon the name of our Lord Jesus Christ in every place their Lord and ours                                                                    Grace to you and peace from God our Father and the Lord Jesus Christ                                                                    I thank my God always concerning you for the grace of God which was given you in Christ Jesus                                                                    that in everything ye were enriched in him in all utterance and all knowledge                                                                    even as the testimony of Christ was confirmed in you                                     

In [44]:
data = condense_data_from_text()

In [38]:
#pickling!
with open('bibles_data_1.pkl', 'w') as picklefile:
    pickle.dump(data1, picklefile)

In [21]:
with open('bibles_data_2.pkl', 'w') as picklefile:
    pickle.dump(data2, picklefile)

In [20]:
with open('bibles_data_3.pkl', 'w') as picklefile:
    pickle.dump(data3, picklefile)

In [19]:
with open('bibles_data_4.pkl', 'w') as picklefile:
    pickle.dump(data4, picklefile)

In [19]:
with open('bibles_data_5.pkl', 'w') as picklefile:
    pickle.dump(data5, picklefile)

In [18]:
with open('bibles_data_6.pkl', 'w') as picklefile:
    pickle.dump(data6, picklefile)

In [18]:
with open('bibles_data_7.pkl', 'w') as picklefile:
    pickle.dump(data7, picklefile)

#3. to mongo

In [None]:
from pymongo import MongoClient
import pymongo

In [None]:
def collections_init():
    '''Initializes collections X_train, y_train, and X_test.'''
    client = MongoClient()
    db = client.legislation
    # Our collections: 
    X_train = db.X_train
    y_train = db.y_train
    X_test = db.X_test
    return X_train, y_train, X_test

In [None]:
def fill_collection(iterable, collection):
    '''Transfers iterable contents (documents) to a MongoDB collection.'''
    collection.drop() # Clear out any data we had in the collection.
    for document in tqdm(iterable):
        collection.save(document)
    #tqdm(collection.insert_many(iterable))
    return collection   

###andy's tor scraper...

In [7]:
import datetime
from stem import Signal
from stem.control import Controller
from MyBrowser import MyBrowser
import random
from time import sleep
import socket

class UseTor(object):
    """
    Uses tor to change the IP so Zillow cant block the IP
    Also, uses mechanize to minic browser
    """

    
    def __init__(self,password, proxy_port,controller_port,pass_function=None):
        """
        password is for the set tor password
        proxy_port is the port use by the given tor
        pass_function is the function that given a html it will return True if not detected as ROBOT
        """
        self._password = password
        self._proxy_port = proxy_port
        self._controller_port = controller_port
        self._pass_function = pass_function
        self._br = MyBrowser(self._proxy_port)
        self._new_ip()        
        self._randtime()
        
    def _randtime(self):
        """
        Sets the date time used to change IP ever so often
        """
        ##change every 30 to 60 mins
        self._next_time = datetime.datetime.now() + datetime.timedelta(0,random.randint(1800,3600))

        
    def _new_ip(self):
        """
        Changes the IP address that tor uses.
        It will check to make sure that the IP is not on Zillows Robot list or blocked list
        If on list will repeat new IP till not on list
        """
        def get_new_ip():
            with Controller.from_port(port = self._controller_port) as controller:
                controller.authenticate(self._password)
                controller.signal(Signal.NEWNYM)
                sleep(controller.get_newnym_wait())
                controller.close()
            
                
            # reset the browser and its settings by calling a new one
            self._br.close()
            self._br = MyBrowser(self._proxy_port)

            #reset the time before next change is required  
            self._randtime()
            
        get_new_ip()
        if self._pass_function != None:
            while(self._pass_function(self._br)==False):
                ##get_new till zillow doesn't fail robot test
                get_new_ip()
        
    
    def request(self,url):
        """
        Will return html of the requested url
        If needed it will change the Tor IP address to get the data
        """
        #see if it is time to find a new IP address 
        if datetime.datetime.now() > self._next_time:
            self._new_ip()

        ## try to connect to website and get the html
        try:
            r = self._br.open(url)
            html = r.read()
        except:
            self._new_ip()
            return self.request(url) #not successful try again
            
        # if pass function given and html received see if it passes the test function
        if self._pass_function != None:
            if self._pass_function(self._br, html=html)==False:
                self._new_ip()
                return self.request(url) #not successful try again

        # everything was successful return the html
        return html

ImportError: No module named MyBrowser

In [None]:
import mechanize
import cookielib
from fake_useragent import UserAgent
import urllib2
import httplib
import socks

class SocksiPyConnection(httplib.HTTPConnection):
    def __init__(self, proxytype, proxyaddr, proxyport = None, rdns = True, username = None, password = None, *args, **kwargs):
        self.proxyargs = (proxytype, proxyaddr, proxyport, rdns, username, password)
        httplib.HTTPConnection.__init__(self, *args, **kwargs)

    def connect(self):
        self.sock = socks.socksocket()
        self.sock.setproxy(*self.proxyargs)
        if isinstance(self.timeout, float):
            self.sock.settimeout(self.timeout)
        self.sock.connect((self.host, self.port))

class SocksiPyHandler(urllib2.HTTPHandler):
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kw = kwargs
        urllib2.HTTPHandler.__init__(self)

    def http_open(self, req):
        def build(host, port=None, strict=None, timeout=0):
            conn = SocksiPyConnection(*self.args, host=host, port=port, strict=strict, timeout=timeout, **self.kw)
            return conn
        return self.do_open(build, req)


class MyBrowser(object):
    """
    Uses Mechanize to act as browser
    The browser make use of the proxy port and host to get its data through
    """

    def __new__(self,proxy_port, proxy_host = 'localhost', debug=False):
        # Browser
        br = mechanize.Browser()

        # Use opener to connect to Tor
        opener = urllib2.build_opener(SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, 'localhost', proxy_port))
        opener.addheaders = [('User-agent', UserAgent().random)]
        br.handlers = opener.handlers

        # Cookie Jar
        cj = cookielib.LWPCookieJar()
        br.set_cookiejar(cj)

        # Browser options
        br.set_handle_equiv(True)
        #br.set_handle_gzip(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)

        # Follows refresh 0 but not hangs on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        # Want debugging messages?
        if debug:
            br.set_debug_http(True)
            br.set_debug_redirects(True)
            br.set_debug_responses(True)

        return br 