-
Notifications
You must be signed in to change notification settings - Fork 1
/
book_loader.py
278 lines (200 loc) · 9.19 KB
/
book_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
import os
import requests
import urllib
from pyquery import PyQuery as pq
from model import db, Book, Author, ISBN10, ISBN13, GoodreadsInfo
# Base address of the WorldCat site.
WORLDCAT_STANDARD_URL = "http://www.worldcat.org"
# Search endpoint; get_crawl_results appends the URL-quoted keywords right after "q=".
WORLDCAT_SEARCH_URL = "http://www.worldcat.org/search?q="
# Query-string suffix appended after the keywords. Percent-decoded, the facet
# filter reads "(x0:book x4:printbook) > ln:eng".
WORLDCAT_FILTER_LANG_EN_PRINT_ONLY = "&fq=%20(%28x0%3Abook+x4%3Aprintbook%29)%20%3E%20ln%3Aeng&se=&sd=&qt=facet_fm_checkbox&refinesearch=true&refreshFormat=undefined"
# Same suffix with "x4:digital" instead of "x4:printbook".
# NOTE(review): the ">" here is left unescaped, unlike the "%3E" in the print-only filter above.
WORLDCAT_FILTER_LANG_EN_EBOOKS_ONLY = "&fq=%20(%28x0%3Abook+x4%3Adigital%29)%20>%20ln%3Aeng&se=&sd=&qt=facet_fm_checkbox&refinesearch=true&refreshFormat=undefined"
# Scheme prefix that _load_cover_url_from_worldcat_details_page prepends to the cover image src.
HTTP_URL = "http:"
def get_crawl_results_with_cache_check(keywords):
    """Return Book objects for a keyword search, reusing rows already stored.

    The OCLC IDs produced by get_crawl_results(keywords) are first looked up
    in the Book table.  Every ID without a stored row is built through
    create_book_from_worldcat_id, added to the session and flushed.  A single
    commit is issued once all IDs have been processed.

    :param keywords: the user's search keywords, handed to get_crawl_results
    :return: list of Book objects, one per entry of the OCLC ID list and in
        the same order as that list
    """
    list_of_oclc_ids = get_crawl_results(keywords)

    # An IN clause over an empty sequence can never match, so skip the query
    # entirely when the search produced no IDs.
    if list_of_oclc_ids:
        books_in_db = Book.query.filter(Book.book_id.in_(list_of_oclc_ids)).all()
    else:
        books_in_db = []

    # book_id -> Book, seeded with the rows that already exist.
    final_dict = dict((book.book_id, book) for book in books_in_db)

    for current_id in list_of_oclc_ids:
        # Already stored, or already created earlier in this loop.
        if current_id in final_dict:
            continue

        current_book = create_book_from_worldcat_id(current_id)
        db.session.add(current_book)
        db.session.flush()
        final_dict[current_id] = current_book
        # Parenthesised single argument: prints the same text on Python 2 and 3.
        print('%s - %s' % (current_id, current_book.title))

    db.session.commit()

    # Re-read through the ID list so the result keeps the search order.
    return [final_dict[current_id] for current_id in list_of_oclc_ids]
def get_crawl_results(keywords):
    """Run a WorldCat search for the given keywords and return the hits.

    The keywords are URL-quoted, placed between the search URL prefix and the
    English print-book filter suffix, and the resulting page is fetched.

    :param keywords: the user's search keywords
    :return: the list of OCLC IDs (also used as our own book IDs) that
        search_for_print_books extracts from the response body
    """
    # TODO - consider renaming function to worldcat_ids_from_search_results_page
    encoded_keywords = urllib.quote_plus(keywords)
    search_url = "".join([
        WORLDCAT_SEARCH_URL,
        encoded_keywords,
        WORLDCAT_FILTER_LANG_EN_PRINT_ONLY,
    ])
    response = requests.get(search_url)
    return search_for_print_books(response.content)
def search_for_print_books(xml_content):
    """Extract the OCLC numbers listed in a WorldCat search results document.

    :param xml_content: body of the search results response
    :return: list of ints, one per ``div.oclc_number`` element, in the order
        pyquery yields them
    """
    results_doc = pq(xml_content)

    # Only the text of the fourth <content> element (index 3) is used; that
    # text is parsed again as markup of its own.
    embedded_markup = results_doc('content').eq(3).text()
    number_divs = pq(embedded_markup)('div.oclc_number')

    oclc_ids = []
    for number_div in number_divs.items():
        oclc_ids.append(int(number_div.text()))
    return oclc_ids
def create_book_from_worldcat_id(oclc_id):
    """Fetch the WorldCat details page for an OCLC ID and build a Book from it.

    :param oclc_id: integer OCLC ID (formatted into the URL with ``%d``)
    :return: the object produced by _create_book_from_worldcat_details for
        the downloaded page
    """
    details_url = "http://www.worldcat.org/oclc/%d" % oclc_id
    response = requests.get(details_url)
    return _create_book_from_worldcat_details(response.content, details_url, oclc_id)
def _create_book_from_worldcat_details(html_string, worldcat_url_with_oclc_id, oclc_id):
    """Build a Book from details-page HTML and attach Goodreads info to its ISBN13s.

    :param html_string: HTML of the WorldCat details page
    :param worldcat_url_with_oclc_id: URL the HTML was fetched from
    :param oclc_id: OCLC ID of the book
    :return: the Book, with ``goodreadsinfo`` set on every entry of ``isbn13s``
    """
    new_book = _load_book_from_worldcat_details_page(
        html_string, worldcat_url_with_oclc_id, oclc_id)

    # One Goodreads lookup per ISBN13 found on the page.
    for isbn13_entry in new_book.isbn13s:
        ratings = get_goodreads_info_by_isbn13(isbn13_entry.isbn13)
        isbn13_entry.goodreadsinfo = ratings

    return new_book
def _load_book_from_worldcat_details_page(html_content, details_page_url, oclc_id):
    """Create a Book populated from the HTML of a WorldCat details page.

    :param html_content: HTML of the details page
    :param details_page_url: URL stored on the book as ``worldcaturl``
    :param oclc_id: OCLC ID stored on the book as ``book_id``
    :return: a new Book with title, publisher, page count, summary, cover URL,
        authors and ISBN lists filled in
    """
    new_book = Book()
    page = pq(html_content)

    new_book.book_id = oclc_id
    # e.g. 'Lean in : women, work, and the will to lead'
    new_book.title = _load_title_from_worldcat_details_page(page)
    # e.g. 'New York : Alfred A. Knopf, 2013.'
    new_book.publisher = _load_publisher_from_worldcat_details_page(page)
    new_book.worldcaturl = details_page_url
    new_book.page_count = _load_page_count_from_worldcat_details_page(page)
    new_book.summary = _load_summary_from_worldcat_details_page(page)
    new_book.coverurl = _load_cover_url_from_worldcat_details_page(page)

    # TODO - check if author_name is unique before insertion
    # The author and ISBN helpers take the raw HTML rather than the parsed page.
    new_book.authors = _load_authors_from_worldcat_details_page(html_content)
    new_book.isbn10s, new_book.isbn13s = _load_isbns_from_worldcat_details_page(html_content)

    return new_book
def _load_title_from_worldcat_details_page(pq_html_content):
"""
:param pq_html_content:
:return:
"""
return pq_html_content('h1.title').text()
def _load_publisher_from_worldcat_details_page(pq_html_content):
"""
:param pq_html_content:
:return:
"""
return pq_html_content('#bib-publisher-cell').text()
def _load_page_count_from_worldcat_details_page(pq_html_content):
"""
:param pq_html_content:
:return:
"""
description_string = pq_html_content('#details-description td').text()
page_num_list = []
# TODO - find more pages examples from worldcat and make sure we can cover more page formats
digits_found = False
for char in description_string[description_string.find('pages')::-1]:
if char.isdigit():
digits_found = True
page_num_list.insert(0, char)
elif not char.isdigit() and digits_found:
break
else:
continue
num_of_pages = "".join(page_num_list)
return num_of_pages
def _load_summary_from_worldcat_details_page(pq_html_content):
"""
:param pq_html_content:
:return:
"""
return pq_html_content('div.abstracttxt').text()
def _load_cover_url_from_worldcat_details_page(pq_html_content):
"""
:param pq_html_content:
:return:
"""
cover_url = pq_html_content('div#cover img.cover').attr('src')
final_cover_url = HTTP_URL + cover_url
return final_cover_url
def _load_authors_from_worldcat_details_page(html_content):
    """Create one Author per name listed in the ``#bib-author-cell`` element.

    The cell text is split on ``';'`` and each piece is stripped of
    surrounding whitespace.  Pieces that are empty after stripping are
    skipped, so a missing cell or a stray semicolon does not produce an
    Author with an empty name.

    :param html_content: HTML of the details page
    :return: list of new Author objects with ``author_id`` set to None and
        ``author_name`` set to the stripped name
    """
    # `or ''` covers a selection whose text() comes back empty or None.
    authors_string = pq(html_content)('#bib-author-cell').text() or ''

    final_author_list = []
    for raw_name in authors_string.split(';'):
        author_name = raw_name.strip()
        if not author_name:
            continue

        current_author = Author()
        current_author.author_id = None
        current_author.author_name = author_name
        final_author_list.append(current_author)

    return final_author_list
def _load_isbns_from_worldcat_details_page(html_content):
    """Collect ISBN10 and ISBN13 objects from the first ``#details-standardno`` element.

    The element text is split on single spaces; a token of length 10 becomes
    an ISBN10, a token of length 13 becomes an ISBN13, and every other token
    is ignored.

    :param html_content: HTML of the details page
    :return: two-item list ``[isbn10_list, isbn13_list]``
    """
    standard_numbers = pq(html_content)('#details-standardno').eq(0).text()

    isbn10_list, isbn13_list = [], []
    for token in standard_numbers.split(" "):
        token_length = len(token)
        if token_length == 10:
            ten_digit = ISBN10()
            ten_digit.isbn10 = token
            isbn10_list.append(ten_digit)
        elif token_length == 13:
            thirteen_digit = ISBN13()
            thirteen_digit.isbn13 = token
            isbn13_list.append(thirteen_digit)

    return [isbn10_list, isbn13_list]
def get_goodreads_info_by_isbn13(isbn13):
    """Look up an ISBN13 on Goodreads and return its ratings and review counts.

    The API key is read from the ``GOODREADS_API_KEY`` environment variable
    (a KeyError is raised if it is not set).

    :param isbn13: the ISBN13 sent as the ``isbn`` query parameter
    :return: the object built by _load_goodreads_info_from_goodreads_api from
        the response body
    """
    api_key = os.environ['GOODREADS_API_KEY']
    query_params = {'key': api_key, 'isbn': isbn13}
    response = requests.get("https://www.goodreads.com/book/isbn", params=query_params)
    return _load_goodreads_info_from_goodreads_api(response.content)
def _load_goodreads_info_from_goodreads_api(html_content):
    """Build a GoodreadsInfo from the body of a Goodreads book response.

    Values are read from the first ``book`` element and from the first
    ``work`` element inside it.  An empty rating or count falls back to 0.

    :param html_content: body of the Goodreads response
    :return: a new GoodreadsInfo with work id, average rating, ratings count
        and reviews count filled in; ``goodreads_review_text`` is set to None
    """
    document = pq(html_content)
    first_book = document('book').eq(0)
    first_work = first_book('work').eq(0)

    info = GoodreadsInfo()
    info.goodreads_work_id = first_work('id').text()
    # children() restricts the match to average_rating directly under <book>.
    rating_text = first_book.children('average_rating').text()
    info.goodreads_rating = float(rating_text or 0.0)
    info.goodreads_ratings_count = int(first_work('ratings_count').text() or 0)
    info.goodreads_review_count = int(first_work('reviews_count').text() or 0)
    info.goodreads_review_text = None
    return info