EveryBlock code from tarballs.

commit 25397148223dad81e7fbb9c7cf2f169162df4681 (0 parents)
Authored by Brian Rosner
Showing 23,931 additions and 0 deletions.
  1. +52 −0 ebblog/README.TXT
  2. 0  ebblog/ebblog/__init__.py
  3. 0  ebblog/ebblog/blog/__init__.py
  4. +7 −0 ebblog/ebblog/blog/admin.py
  5. +29 −0 ebblog/ebblog/blog/feeds.py
  6. +18 −0 ebblog/ebblog/blog/models.py
  7. +6 −0 ebblog/ebblog/manage.py
  8. +21 −0 ebblog/ebblog/settings.py
  9. +7 −0 ebblog/ebblog/templates/404.html
  10. +7 −0 ebblog/ebblog/templates/500.html
  11. +32 −0 ebblog/ebblog/templates/base.html
  12. +18 −0 ebblog/ebblog/templates/blog/archive.html
  13. +19 −0 ebblog/ebblog/templates/blog/entry_archive_day.html
  14. +19 −0 ebblog/ebblog/templates/blog/entry_archive_month.html
  15. +15 −0 ebblog/ebblog/templates/blog/entry_archive_year.html
  16. +13 −0 ebblog/ebblog/templates/blog/entry_detail.html
  17. +1 −0  ebblog/ebblog/templates/feeds/rss_description.html
  18. +1 −0  ebblog/ebblog/templates/feeds/rss_title.html
  19. +17 −0 ebblog/ebblog/templates/homepage.html
  20. +28 −0 ebblog/ebblog/urls.py
  21. +82 −0 ebdata/README.TXT
  22. 0  ebdata/ebdata/__init__.py
  23. 0  ebdata/ebdata/blobs/__init__.py
  24. +48 −0 ebdata/ebdata/blobs/auto_purge.py
  25. +27 −0 ebdata/ebdata/blobs/create_seeds.py
  26. +192 −0 ebdata/ebdata/blobs/geotagging.py
  27. +60 −0 ebdata/ebdata/blobs/manual.py
  28. +181 −0 ebdata/ebdata/blobs/models.py
  29. +205 −0 ebdata/ebdata/blobs/scrapers.py
  30. +307 −0 ebdata/ebdata/blobs/update_feeds.py
  31. 0  ebdata/ebdata/nlp/__init__.py
  32. +297 −0 ebdata/ebdata/nlp/addresses.py
  33. +30 −0 ebdata/ebdata/nlp/datelines.py
  34. +51 −0 ebdata/ebdata/nlp/places.py
  35. 0  ebdata/ebdata/nlp/tests/__init__.py
  36. +153 −0 ebdata/ebdata/nlp/tests/datelines.py
  37. +760 −0 ebdata/ebdata/nlp/tests/tests.py
  38. 0  ebdata/ebdata/parsing/__init__.py
  39. +157 −0 ebdata/ebdata/parsing/dbf.py
  40. +77 −0 ebdata/ebdata/parsing/excel.py
  41. +31 −0 ebdata/ebdata/parsing/mdb.py
  42. +40 −0 ebdata/ebdata/parsing/pdftotext.py
  43. +46 −0 ebdata/ebdata/parsing/unicodecsv.py
  44. +1 −0  ebdata/ebdata/retrieval/__init__.py
  45. +39 −0 ebdata/ebdata/retrieval/log.py
  46. +16 −0 ebdata/ebdata/retrieval/log_debug.py
  47. +36 −0 ebdata/ebdata/retrieval/models.py
  48. +181 −0 ebdata/ebdata/retrieval/retrievers.py
  49. +1 −0  ebdata/ebdata/retrieval/scrapers/__init__.py
  50. +35 −0 ebdata/ebdata/retrieval/scrapers/base.py
  51. +337 −0 ebdata/ebdata/retrieval/scrapers/list_detail.py
  52. +143 −0 ebdata/ebdata/retrieval/scrapers/new_newsitem_list_detail.py
  53. +270 −0 ebdata/ebdata/retrieval/scrapers/newsitem_list_detail.py
  54. 0  ebdata/ebdata/retrieval/updaterdaemon/__init__.py
  55. +33 −0 ebdata/ebdata/retrieval/updaterdaemon/config.py
  56. +38 −0 ebdata/ebdata/retrieval/updaterdaemon/initscript
  57. +108 −0 ebdata/ebdata/retrieval/updaterdaemon/runner.py
  58. +89 −0 ebdata/ebdata/retrieval/utils.py
  59. +2 −0  ebdata/ebdata/templatemaker/__init__.py
  60. +137 −0 ebdata/ebdata/templatemaker/articletext.py
  61. +76 −0 ebdata/ebdata/templatemaker/brain.py
  62. +116 −0 ebdata/ebdata/templatemaker/clean.py
  63. +57 −0 ebdata/ebdata/templatemaker/hole.py
  64. +146 −0 ebdata/ebdata/templatemaker/htmlutils.py
  65. +85 −0 ebdata/ebdata/templatemaker/listdiff.c
  66. +84 −0 ebdata/ebdata/templatemaker/listdiff.py
  67. +248 −0 ebdata/ebdata/templatemaker/sst.py
  68. +60 −0 ebdata/ebdata/templatemaker/template.py
  69. 0  ebdata/ebdata/templatemaker/tests/__init__.py
  70. +63 −0 ebdata/ebdata/templatemaker/tests/articletext.py
  71. +144 −0 ebdata/ebdata/templatemaker/tests/brain.py
  72. +212 −0 ebdata/ebdata/templatemaker/tests/clean.py
  73. +99 −0 ebdata/ebdata/templatemaker/tests/hole.py
  74. +362 −0 ebdata/ebdata/templatemaker/tests/htmlutils.py
  75. +262 −0 ebdata/ebdata/templatemaker/tests/listdiff.py
  76. +17 −0 ebdata/ebdata/templatemaker/tests/listdiffc.py
  77. +8 −0 ebdata/ebdata/templatemaker/tests/run_all.py
  78. +838 −0 ebdata/ebdata/templatemaker/tests/sst.py
  79. +352 −0 ebdata/ebdata/templatemaker/tests/template.py
  80. +53 −0 ebdata/ebdata/templatemaker/tests/textlist.py
  81. +70 −0 ebdata/ebdata/templatemaker/tests/webmining.py
  82. +46 −0 ebdata/ebdata/templatemaker/textlist.py
  83. +59 −0 ebdata/ebdata/templatemaker/webmining.py
  84. 0  ebdata/ebdata/textmining/__init__.py
  85. 0  ebdata/ebdata/textmining/tests/__init__.py
  86. +100 −0 ebdata/ebdata/textmining/tests/treeutils.py
  87. +68 −0 ebdata/ebdata/textmining/treeutils.py
  88. 0  ebdata/ebdata/utils/__init__.py
  89. +162 −0 ebdata/ebdata/utils/daemon.py
  90. +5 −0 ebdata/requirements.txt
  91. +111 −0 ebgeo/README.TXT
  92. +175 −0 ebgeo/bin/render_tiles
  93. +19 −0 ebgeo/config/tilecache.cfg
  94. 0  ebgeo/ebgeo/__init__.py
  95. 0  ebgeo/ebgeo/maps/__init__.py
  96. +51 −0 ebgeo/ebgeo/maps/bin/locator_calcs.py
  97. +169 −0 ebgeo/ebgeo/maps/bins.py
  98. +29 −0 ebgeo/ebgeo/maps/cached_image.py
  99. +85 −0 ebgeo/ebgeo/maps/colors.py
  100. +1 −0  ebgeo/ebgeo/maps/constants.py
  101. +73 −0 ebgeo/ebgeo/maps/extent.py
  102. +205 −0 ebgeo/ebgeo/maps/mapserver.py
  103. +47 −0 ebgeo/ebgeo/maps/markers.py
  104. +7 −0 ebgeo/ebgeo/maps/notes.txt
  105. +24 −0 ebgeo/ebgeo/maps/projections.py
  106. +82 −0 ebgeo/ebgeo/maps/shortcuts.py
  107. +126 −0 ebgeo/ebgeo/maps/tess.py
  108. +120 −0 ebgeo/ebgeo/maps/tests.py
  109. +93 −0 ebgeo/ebgeo/maps/tile.py
  110. +69 −0 ebgeo/ebgeo/maps/tilecache_service.py
  111. +10 −0 ebgeo/ebgeo/maps/urls.py
  112. +123 −0 ebgeo/ebgeo/maps/utils.py
  113. +58 −0 ebgeo/ebgeo/maps/views.py
  114. 0  ebgeo/ebgeo/utils/__init__.py
  115. +29 −0 ebgeo/ebgeo/utils/clustering/__init__.py
  116. +37 −0 ebgeo/ebgeo/utils/clustering/bunch.py
  117. +59 −0 ebgeo/ebgeo/utils/clustering/cluster.py
  118. +9 −0 ebgeo/ebgeo/utils/clustering/json.py
  119. +62 −0 ebgeo/ebgeo/utils/clustering/sample.py
  120. +10 −0 ebgeo/ebgeo/utils/clustering/shortcuts.py
  121. +67 −0 ebgeo/ebgeo/utils/clustering/tests.py
  122. +26 −0 ebgeo/ebgeo/utils/correcting.py
  123. +103 −0 ebgeo/ebgeo/utils/feature_reducer.py
  124. +89 −0 ebgeo/ebgeo/utils/geodjango.py
  125. +97 −0 ebgeo/ebgeo/utils/progressbar.py
  126. +51 −0 ebgeo/ebgeo/utils/shapeindex.py
  127. +5 −0 ebgeo/mapnik_styles/main.xml
  128. +2 −0  ebgeo/requirements.txt
  129. +47 −0 ebinternal/README.TXT
  130. 0  ebinternal/ebinternal/__init__.py
  131. 0  ebinternal/ebinternal/citypoll/__init__.py
  132. +32 −0 ebinternal/ebinternal/citypoll/models.py
  133. +10 −0 ebinternal/ebinternal/citypoll/normalize.py
  134. +47 −0 ebinternal/ebinternal/citypoll/views.py
  135. 0  ebinternal/ebinternal/feedback/__init__.py
  136. +47 −0 ebinternal/ebinternal/feedback/models.py
  137. +105 −0 ebinternal/ebinternal/feedback/views.py
  138. +315 −0 ebinternal/ebinternal/media/styles/internal.css
  139. +41 −0 ebinternal/ebinternal/settings.py
  140. +1 −0  ebinternal/ebinternal/templates/404.html
  141. +1 −0  ebinternal/ebinternal/templates/500.html
  142. +17 −0 ebinternal/ebinternal/templates/base.html
  143. +32 −0 ebinternal/ebinternal/templates/citypoll/city_detail.html
  144. +24 −0 ebinternal/ebinternal/templates/citypoll/vote_list.html
  145. +88 −0 ebinternal/ebinternal/templates/feedback/feedback_detail.html
  146. +128 −0 ebinternal/ebinternal/templates/feedback/feedback_list.html
  147. +7 −0 ebinternal/ebinternal/templates/homepage.html
  148. +45 −0 ebinternal/ebinternal/templates/includes/feedback_ajax.html
  149. +30 −0 ebinternal/ebinternal/urls.py
  150. +368 −0 ebpub/README.TXT
  151. +64 −0 ebpub/bin/export_newsitems.py
  152. 0  ebpub/ebpub/__init__.py
  153. 0  ebpub/ebpub/accounts/__init__.py
  154. +51 −0 ebpub/ebpub/accounts/callbacks.py
  155. +5 −0 ebpub/ebpub/accounts/constants.py
  156. +9 −0 ebpub/ebpub/accounts/context_processors.py
  157. +74 −0 ebpub/ebpub/accounts/forms.py
  158. +43 −0 ebpub/ebpub/accounts/middleware.py
  159. +86 −0 ebpub/ebpub/accounts/models.py
  160. +20 −0 ebpub/ebpub/accounts/urls.py
  161. +108 −0 ebpub/ebpub/accounts/utils.py
  162. +228 −0 ebpub/ebpub/accounts/views.py
  163. 0  ebpub/ebpub/alerts/__init__.py
  164. +55 −0 ebpub/ebpub/alerts/models.py
  165. +85 −0 ebpub/ebpub/alerts/sending.py
  166. +146 −0 ebpub/ebpub/alerts/views.py
  167. 0  ebpub/ebpub/db/__init__.py
  168. 0  ebpub/ebpub/db/bin/__init__.py
  169. +49 −0 ebpub/ebpub/db/bin/activate_schema.py
  170. +92 −0 ebpub/ebpub/db/bin/add_location.py
  171. +15 −0 ebpub/ebpub/db/bin/alphabetize_locations.py
  172. +43 −0 ebpub/ebpub/db/bin/export_schema.py
  173. +65 −0 ebpub/ebpub/db/bin/geocode_newsitems.py
  174. +125 −0 ebpub/ebpub/db/bin/import_hoods.py
  175. +158 −0 ebpub/ebpub/db/bin/update_aggregates.py
  176. +19 −0 ebpub/ebpub/db/constants.py
  177. +181 −0 ebpub/ebpub/db/feeds.py
  178. +65 −0 ebpub/ebpub/db/fixtures/article_schema.yaml
  179. +369 −0 ebpub/ebpub/db/fixtures/crimes.json
  180. +132 −0 ebpub/ebpub/db/fixtures/test-schema.yaml
  181. +687 −0 ebpub/ebpub/db/models.py
  182. +1 −0  ebpub/ebpub/db/sql/attribute.sql
  183. +34 −0 ebpub/ebpub/db/sql/location_functions.sql
  184. +1 −0  ebpub/ebpub/db/sql/locationtype.sql
  185. +1 −0  ebpub/ebpub/db/sql/lookup.sql
  186. +35 −0 ebpub/ebpub/db/sql/newsitem.sql
  187. +86 −0 ebpub/ebpub/db/sql/newsitemlocation_functions.sql
  188. +1 −0  ebpub/ebpub/db/sql/schemafield.sql
  189. +7 −0 ebpub/ebpub/db/sql/unknown_locations.sql
  190. 0  ebpub/ebpub/db/templatetags/__init__.py
  191. +9 −0 ebpub/ebpub/db/templatetags/dateutils.py
  192. +233 −0 ebpub/ebpub/db/templatetags/eb.py
  193. +75 −0 ebpub/ebpub/db/templatetags/eb_filter.py
  194. +21 −0 ebpub/ebpub/db/templatetags/eb_json.py
  195. +30 −0 ebpub/ebpub/db/templatetags/full_links.py
  196. +42 −0 ebpub/ebpub/db/templatetags/mapping.py
  197. +39 −0 ebpub/ebpub/db/templatetags/raw.py
  198. +259 −0 ebpub/ebpub/db/tests.py
  199. +121 −0 ebpub/ebpub/db/utils.py
  200. +1,250 −0 ebpub/ebpub/db/views.py
  201. +2 −0  ebpub/ebpub/geocoder/__init__.py
  202. +313 −0 ebpub/ebpub/geocoder/base.py
  203. +35 −0 ebpub/ebpub/geocoder/models.py
  204. 0  ebpub/ebpub/geocoder/parser/__init__.py
  205. +70 −0 ebpub/ebpub/geocoder/parser/abbr_state.txt
  206. +6 −0 ebpub/ebpub/geocoder/parser/cities.py
  207. +66 −0 ebpub/ebpub/geocoder/parser/make_cf_tests.py
  208. +301 −0 ebpub/ebpub/geocoder/parser/numbered_streets.py
  209. +254 −0 ebpub/ebpub/geocoder/parser/parsing.py
  210. +61 −0 ebpub/ebpub/geocoder/parser/states.py
  211. +213 −0 ebpub/ebpub/geocoder/parser/suffixes.py
  212. +534 −0 ebpub/ebpub/geocoder/parser/suffixes.txt
  213. +360 −0 ebpub/ebpub/geocoder/parser/tests.py
  214. +43 −0 ebpub/ebpub/geocoder/reverse.py
  215. +534 −0 ebpub/ebpub/geocoder/suffixes.txt
  216. +59 −0 ebpub/ebpub/geocoder/tests.py
  217. 0  ebpub/ebpub/geocoder/tests/__init__.py
  218. +78 −0 ebpub/ebpub/geocoder/tests/parser.py
  219. 0  ebpub/ebpub/metros/__init__.py
  220. +9 −0 ebpub/ebpub/metros/allmetros.py
  221. +50 −0 ebpub/ebpub/metros/fixtures/metros.json
  222. +64 −0 ebpub/ebpub/metros/loader.py
  223. +51 −0 ebpub/ebpub/metros/models.py
  224. +53 −0 ebpub/ebpub/metros/tests.py
  225. +9 −0 ebpub/ebpub/metros/urls.py
  226. +29 −0 ebpub/ebpub/metros/views.py
  227. 0  ebpub/ebpub/petitions/__init__.py
  228. +33 −0 ebpub/ebpub/petitions/models.py
  229. +61 −0 ebpub/ebpub/petitions/views.py
  230. 0  ebpub/ebpub/preferences/__init__.py
  231. +19 −0 ebpub/ebpub/preferences/models.py
  232. +48 −0 ebpub/ebpub/preferences/views.py
  233. 0  ebpub/ebpub/savedplaces/__init__.py
  234. +32 −0 ebpub/ebpub/savedplaces/models.py
  235. +76 −0 ebpub/ebpub/savedplaces/views.py
  236. +165 −0 ebpub/ebpub/settings.py
  237. 0  ebpub/ebpub/streets/__init__.py
  238. 0  ebpub/ebpub/streets/bin/__init__.py
  239. +11 −0 ebpub/ebpub/streets/bin/populate_suburbs.py
  240. +1 −0  ebpub/ebpub/streets/blockimport/__init__.py
  241. +60 −0 ebpub/ebpub/streets/blockimport/base.py
  242. 0  ebpub/ebpub/streets/blockimport/esri/__init__.py
  243. +4 −0 ebpub/ebpub/streets/blockimport/esri/base.py
  244. +1 −0  ebpub/ebpub/streets/blockimport/esri/importers/__init__.py
  245. +107 −0 ebpub/ebpub/streets/blockimport/esri/importers/blocks.py
  246. +75 −0 ebpub/ebpub/streets/blockimport/esri/importers/zipcodes.py
  247. 0  ebpub/ebpub/streets/blockimport/esri/management/__init__.py
  248. 0  ebpub/ebpub/streets/blockimport/esri/management/commands/__init__.py
  249. +24 −0 ebpub/ebpub/streets/blockimport/esri/management/commands/importesri.py
  250. 0  ebpub/ebpub/streets/blockimport/tiger/__init__.py
  251. +183 −0 ebpub/ebpub/streets/blockimport/tiger/import_blocks.py
  252. +55 −0 ebpub/ebpub/streets/fixtures/wabash.yaml
  253. +442 −0 ebpub/ebpub/streets/models.py
  254. +85 −0 ebpub/ebpub/streets/name_utils.py
  255. +218 −0 ebpub/ebpub/streets/populate_streets.py
  256. +13 −0 ebpub/ebpub/streets/update_block_pretty_names.py
  257. +65 −0 ebpub/ebpub/streets/utils.py
  258. +9 −0 ebpub/ebpub/templates/404.html
  259. +9 −0 ebpub/ebpub/templates/500.html
  260. +131 −0 ebpub/ebpub/templates/accounts/dashboard.html
  261. +14 −0 ebpub/ebpub/templates/accounts/email_sent.html
  262. +11 −0 ebpub/ebpub/templates/accounts/hash_error.html
  263. +32 −0 ebpub/ebpub/templates/accounts/login_form.html
  264. +13 −0 ebpub/ebpub/templates/accounts/logout_form.html
  265. +29 −0 ebpub/ebpub/templates/accounts/password_change_form.html
  266. +16 −0 ebpub/ebpub/templates/accounts/password_reset_email.html
  267. +15 −0 ebpub/ebpub/templates/accounts/password_reset_email.txt
  268. +17 −0 ebpub/ebpub/templates/accounts/register_email.html
  269. +12 −0 ebpub/ebpub/templates/accounts/register_email.txt
  270. +27 −0 ebpub/ebpub/templates/accounts/register_form_1.html
  271. +30 −0 ebpub/ebpub/templates/accounts/register_form_2.html
  272. +27 −0 ebpub/ebpub/templates/accounts/request_password_change_form.html
  273. +14 −0 ebpub/ebpub/templates/alerts/confirm_unsubscription.html
  274. +25 −0 ebpub/ebpub/templates/alerts/email.html
  275. +25 −0 ebpub/ebpub/templates/alerts/email.txt
  276. +37 −0 ebpub/ebpub/templates/alerts/signup_form.html
  277. +12 −0 ebpub/ebpub/templates/alerts/signup_thanks.html
  278. +50 −0 ebpub/ebpub/templates/base.html
  279. +38 −0 ebpub/ebpub/templates/base_place.html
  280. +29 −0 ebpub/ebpub/templates/db/block_list.html
  281. +14 −0 ebpub/ebpub/templates/db/city_list.html
  282. +18 −0 ebpub/ebpub/templates/db/did_you_mean.html
  283. +65 −0 ebpub/ebpub/templates/db/feed_signup.html
  284. +51 −0 ebpub/ebpub/templates/db/filter.html
  285. +23 −0 ebpub/ebpub/templates/db/filter_bad_address.html
  286. +42 −0 ebpub/ebpub/templates/db/filter_lookup_list.html
  287. +43 −0 ebpub/ebpub/templates/db/location_type_detail.html
  288. +81 −0 ebpub/ebpub/templates/db/newsitem_detail.html
  289. +178 −0 ebpub/ebpub/templates/db/place_detail.html
  290. +145 −0 ebpub/ebpub/templates/db/place_overview.html
  291. +28 −0 ebpub/ebpub/templates/db/schema_about.html
  292. +125 −0 ebpub/ebpub/templates/db/schema_detail.html
  293. +29 −0 ebpub/ebpub/templates/db/schema_detail_special_report.html
  294. +26 −0 ebpub/ebpub/templates/db/schema_list.html
  295. +21 −0 ebpub/ebpub/templates/db/search_error.html
  296. +22 −0 ebpub/ebpub/templates/db/search_error_zip_list.html
  297. +20 −0 ebpub/ebpub/templates/db/search_invalid_block.html
  298. +12 −0 ebpub/ebpub/templates/db/search_special_case.html
  299. +61 −0 ebpub/ebpub/templates/db/snippets/date_chart.html
  300. +94 −0 ebpub/ebpub/templates/db/snippets/filter_left_nav.html
Note: the full diff is not shown because too many files (576) changed; only the first portion follows.
52 ebblog/README.TXT
@@ -0,0 +1,52 @@
+======
+ebblog
+======
+
+The blog application used by http://blog.everyblock.com
+
+Requirements
+============
+
+A recent Django release. It has been tested with Django revision 11079 from
+Subversion. When Django 1.1 is released, that should work as well.
+
+Quickstart
+==========
+
+0. Install Django.
+
+1. Install the ebblog package by putting it on your Python path.
+
+2. Start a Django project (using Django 1.1, not 1.0). See the Django Book and
+ Django docs for more:
+
+ http://djangobook.com/en/2.0/
+ http://docs.djangoproject.com/en/dev/
+
+3. Update your settings file. It's probably easiest to just start with the
+ file ebblog/settings.py and tweak that (or import from it in your own
+ settings file). The application won't work until you set the following:
+
+ DATABASE_USER
+ DATABASE_NAME
+ DATABASE_HOST
+ DATABASE_PORT
+
+ If you decide not to start with ebblog.settings, you'll also need to add
+ ebblog.urls to your URLconf and add the absolute path to ebblog/templates
+ to your TEMPLATE_DIRS setting.
+
+4. Run "django-admin.py syncdb" to create all of the database tables.
+
+5. Run "django-admin.py runserver" and go to http://127.0.0.1:8000/ in your
+ Web browser to see the site in action. Go to http://127.0.0.1:8000/admin/
+ to add and edit blog entries.
+
+Customization
+=============
+
+The title, description, and link of the RSS feed should be set in
+ebblog/blog/feeds.py. Also, most of the visual customization can be
+accomplished by editing ebblog/templates/base.html. All of the other templates
+inherit from base.html, so any styles added there will apply to the other
+pages.
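
To illustrate step 3 of the Quickstart above, here is a minimal sketch of a project settings file that imports from ebblog.settings and overrides the database values. The module name and the database values are placeholders for illustration, not part of this commit:

    # mysite/settings.py -- example only
    from ebblog.settings import *  # reuse ROOT_URLCONF, INSTALLED_APPS, TEMPLATE_DIRS

    DATABASE_ENGINE = 'postgresql_psycopg2'  # or keep the sqlite3 default from ebblog.settings
    DATABASE_NAME = 'ebblog'
    DATABASE_USER = 'ebblog'
    DATABASE_HOST = 'localhost'
    DATABASE_PORT = '5432'
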
0  ebblog/ebblog/__init__.py
No changes.
0  ebblog/ebblog/blog/__init__.py
No changes.
7 ebblog/ebblog/blog/admin.py
@@ -0,0 +1,7 @@
+from django.contrib.admin import ModelAdmin, site
+from ebblog.blog.models import Entry
+
+class EntryAdmin(ModelAdmin):
+ list_display = ('pub_date', 'headline', 'author')
+
+site.register(Entry, EntryAdmin)
29 ebblog/ebblog/blog/feeds.py
@@ -0,0 +1,29 @@
+from django.contrib.syndication.feeds import Feed
+from django.utils.feedgenerator import Rss201rev2Feed
+from ebblog.blog.models import Entry
+
+# RSS feeds powered by Django's syndication framework use MIME type
+# 'application/rss+xml'. That's unacceptable to us, because that MIME type
+# prompts users to download the feed in some browsers, which is confusing.
+# Here, we set the MIME type so that it doesn't do that prompt.
+class CorrectMimeTypeFeed(Rss201rev2Feed):
+ mime_type = 'application/xml'
+
+# This is a django.contrib.syndication.feeds.Feed subclass whose feed_type
+# is set to our preferred MIME type.
+class BlogFeed(Feed):
+ feed_type = CorrectMimeTypeFeed
+
+class BlogEntryFeed(Feed):
+ title = ""
+ link = ""
+ description = ""
+
+ def items(self):
+ return Entry.objects.order_by('-pub_date')[:10]
+
+ def item_link(self, item):
+ return item.url()
+
+ def item_pubdate(self, item):
+ return item.pub_date
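
The README's Customization section says the feed title, link, and description should be set here in ebblog/blog/feeds.py. A sketch with placeholder values (the strings below are illustrative, not from this commit; items(), item_link() and item_pubdate() stay as defined above):

    class BlogEntryFeed(Feed):
        title = "My ebblog site"                          # example value
        link = "http://blog.example.com/"                 # example value
        description = "The latest entries from my blog."  # example value
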
18 ebblog/ebblog/blog/models.py
@@ -0,0 +1,18 @@
+from django.db import models
+
+class Entry(models.Model):
+ pub_date = models.DateTimeField()
+ author = models.CharField(max_length=32, help_text='Use the full name, e.g., "John Lennon".')
+ slug = models.CharField(max_length=32)
+ headline = models.CharField(max_length=255)
+ summary = models.TextField(help_text='Use plain text (no HTML).')
+ body = models.TextField(help_text='Use raw HTML, including <p> tags.')
+
+ class Meta:
+ verbose_name_plural = 'entries'
+
+ def __unicode__(self):
+ return self.headline
+
+ def url(self):
+ return "/%s/%s/" % (self.pub_date.strftime("%Y/%b/%d").lower(), self.slug)
6 ebblog/ebblog/manage.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+from django.core.management import execute_manager
+import settings_devel
+
+if __name__ == "__main__":
+ execute_manager(settings_devel)
21 ebblog/ebblog/settings.py
@@ -0,0 +1,21 @@
+import os
+
+DATABASE_ENGINE = 'sqlite3'
+DATABASE_NAME = '/tmp/ebblog.db'
+DATABASE_USER = ''
+DATABASE_HOST = ''
+DATABASE_PORT = ''
+
+ROOT_URLCONF = 'ebblog.urls'
+
+INSTALLED_APPS = (
+ 'django.contrib.admin',
+ 'django.contrib.auth',
+ 'django.contrib.contenttypes',
+ 'django.contrib.sessions',
+ 'ebblog.blog',
+)
+
+TEMPLATE_DIRS = (
+ os.path.normpath(os.path.join(os.path.dirname(__file__), 'templates')),
+)
7 ebblog/ebblog/templates/404.html
@@ -0,0 +1,7 @@
+{% extends "base.html" %}
+
+{% block title %}Page not found{% endblock %}
+
+{% block content %}
+<h1>Page not found</h1>
+{% endblock %}
7 ebblog/ebblog/templates/500.html
@@ -0,0 +1,7 @@
+{% extends "base.html" %}
+
+{% block title %}Page unavailable{% endblock %}
+
+{% block content %}
+<h1>Page unavailable</h1>
+{% endblock %}
32 ebblog/ebblog/templates/base.html
@@ -0,0 +1,32 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+
+<html lang="en">
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <title>{% block fulltitle %}{% block title %}{% endblock %} / ebblog{% endblock %}</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <link rel="home" title="Home" href="/">
+ <link rel="alternate" type="application/rss+xml" title="RSS" href="/rss/">
+ {% block extrahead %}{% endblock %}
+</head>
+<body class="{% block pageid %}generic{% endblock %}">
+ <div id="container">
+ <div id="siteheader">
+ <h2 id="sitetitle"><a href="/">ebblog</a></h2>
+ </div><!--/siteheader-->
+ <div id="main">
+ <div id="prenav" class="globalnav">
+ <ul class="navlist">
+ <li class="latest"><a href="/">Latest posts</a></li>
+ <li class="archives"><a href="/archives/">Blog archives</a></li>
+ <li class="feed"><a href="/rss/">RSS feed</a></li>
+ </ul>
+ </div><!--/prenav-->
+ <div id="content">
+ {% block content %}{% endblock %}
+ </div><!--/content-->
+ </div><!--/main-->
+ </div><!--/container-->
+
+</body>
+</html>
18 ebblog/ebblog/templates/blog/archive.html
@@ -0,0 +1,18 @@
+{% extends "base.html" %}
+
+{% block title %}Archives{% endblock %}
+
+{% block content %}
+
+<h1>Archive</h1>
+
+{% regroup object_list by pub_date.year as entries_by_year %}
+{% for group in entries_by_year %}
+<h2 class="yeargrouper">{{ group.grouper }}</h2>
+<ul id="archiveslist" class="linklist">
+{% for entry in group.list %}
+ <li><strong class="date">{{ entry.pub_date|date:"M j" }}</strong> <a class="title" href="{{ entry.url }}">{{ entry.headline }}</a></li>
+{% endfor %}
+</ul>
+{% endfor %}
+{% endblock %}
19 ebblog/ebblog/templates/blog/entry_archive_day.html
@@ -0,0 +1,19 @@
+{% extends "base.html" %}
+
+{% block title %}Archives / {{ day|date:"F j" }}{% endblock %}
+
+{% block content %}
+
+<h1>{{ day|date:"F j" }}</h1>
+
+{% for object in object_list %}
+<div class="post">
+ <h2 class="title"><a class="url" href="{{ object.url }}">{{ object.headline }}</a></h2>
+ <p class="dateline">Posted <span class="date">{{ object.pub_date|date:"F j, Y" }}</span> by <strong class="author">{{ object.author }}</strong></p>
+ <div class="body">
+ {{ object.body|safe }}
+ </div>
+</div><!--/post-->
+{% endfor %}
+
+{% endblock %}
19 ebblog/ebblog/templates/blog/entry_archive_month.html
@@ -0,0 +1,19 @@
+{% extends "base.html" %}
+
+{% block title %}Archives / {{ month|date:"F Y" }}{% endblock %}
+
+{% block content %}
+
+<h1>{{ month|date:"F Y" }}</h1>
+
+{% for object in object_list %}
+<div class="post">
+ <h2 class="title"><a class="url" href="{{ object.url }}">{{ object.headline }}</a></h2>
+ <p class="dateline">Posted <span class="date">{{ object.pub_date|date:"F j, Y" }}</span> by <strong class="author">{{ object.author }}</strong></p>
+ <div class="body">
+ {{ object.body|safe }}
+ </div>
+</div><!--/post-->
+{% endfor %}
+
+{% endblock %}
15 ebblog/ebblog/templates/blog/entry_archive_year.html
@@ -0,0 +1,15 @@
+{% extends "base.html" %}
+
+{% block title %}Archives / {{ year }}{% endblock %}
+
+{% block content %}
+
+<h1>{{ year }}</h1>
+
+<ul class="linklist">
+{% for date in date_list %}
+ <li><a href="{{ date|date:"M"|lower }}/">{{ date|date:"F" }}</a></li>
+{% endfor %}
+</ul>
+
+{% endblock %}
13 ebblog/ebblog/templates/blog/entry_detail.html
@@ -0,0 +1,13 @@
+{% extends "base.html" %}
+
+{% block title %}{{ object.headline }}{% endblock %}
+
+{% block content %}
+<div class="post">
+ <h1 class="title">{{ object.headline }}</h1>
+ <p class="dateline">Posted <span class="date">{{ object.pub_date|date:"F j, Y" }}</span> by <strong class="author">{{ object.author }}</strong></p>
+ <div class="body">
+ {{ object.body|safe }}
+ </div>
+</div>
+{% endblock %}
1  ebblog/ebblog/templates/feeds/rss_description.html
@@ -0,0 +1 @@
+{{ obj.body|safe }}
1  ebblog/ebblog/templates/feeds/rss_title.html
@@ -0,0 +1 @@
+{{ obj.headline }}
17 ebblog/ebblog/templates/homepage.html
@@ -0,0 +1,17 @@
+{% extends "base.html" %}
+
+{% block fulltitle %}ebblog{% endblock %}
+
+{% block pageid %}latest{% endblock %}
+
+{% block content %}
+{% for object in latest %}
+<div class="post">
+ <h2 class="title"><a class="url" href="{{ object.url }}">{{ object.headline }}</a></h2>
+ <p class="dateline">Posted <span class="date">{{ object.pub_date|date:"F j, Y" }}</span> by <strong class="author">{{ object.author }}</strong></p>
+ <div class="body">
+ {{ object.body|safe }}
+ </div>
+</div><!--/post-->
+{% endfor %}
+{% endblock %}
28 ebblog/ebblog/urls.py
@@ -0,0 +1,28 @@
+from django.conf.urls.defaults import *
+from django.contrib.syndication.views import feed as feed_view
+from django.views.generic import date_based, list_detail
+from django.contrib import admin
+from ebblog.blog.models import Entry
+from ebblog.blog import feeds
+
+admin.autodiscover()
+
+info_dict = {
+ 'queryset': Entry.objects.order_by('pub_date'),
+ 'date_field': 'pub_date',
+}
+
+FEEDS = {
+ 'rss': feeds.BlogEntryFeed,
+}
+
+urlpatterns = patterns('',
+ (r'^(?P<year>\d{4})/(?P<month>[a-z]{3})/(?P<day>\w{1,2})/(?P<slug>\w+)/$', date_based.object_detail, dict(info_dict, slug_field='slug')),
+ (r'^(?P<year>\d{4})/(?P<month>[a-z]{3})/(?P<day>\w{1,2})/$', date_based.archive_day, info_dict),
+ (r'^(?P<year>\d{4})/(?P<month>[a-z]{3})/$', date_based.archive_month, info_dict),
+ (r'^(?P<year>\d{4})/$', date_based.archive_year, info_dict),
+ (r'^(rss)/$', feed_view, {'feed_dict': FEEDS}),
+ (r'^archives/', list_detail.object_list, {'queryset': Entry.objects.order_by('-pub_date'), 'template_name': 'blog/archive.html'}),
+ (r'^$', date_based.archive_index, dict(info_dict, template_name='homepage.html')),
+ ('^admin/', include(admin.site.urls)),
+)
82 ebdata/README.TXT
@@ -0,0 +1,82 @@
+======
+ebdata
+======
+
+Code to help write scripts that import/crawl/parse data into ebpub.
+
+ebdata.blobs
+============
+
+The blobs package is a Django app responsible for crawling, scraping,
+extracting, and geocoding news articles from the web.
+
+The blobs app contains two models, Seed and Page. Seed is a news source, like
+the Chicago Tribune, and a Page is a particular html page that was crawled from
+a Seed.
+
+
+ebdata.nlp
+==========
+
+The nlp package contains utilities for detecting locations in text. This
+package is used by blobs, but if you want to use it directly, check out the
+docstrings for the functions in ebdata.nlp.addresses.
+
+
+ebdata.parsing
+==============
+
+The parsing package contains helpers for reading different file types.
+
+The dbf, excel, mdb, and unicodecsv modules are for reading structured data,
+and generally follow the Python csv reader API. See the code for more details
+on how to use them.
+
+The pdftotext module is for converting PDF to text, and requires Xpdf:
+http://www.foolabs.com/xpdf/download.html
+
+
+ebdata.retrieval
+================
+
+The retrieval package contains a framework for writing scrapers for structured
+data. There are many examples of how to use this framework in different
+situations in the everyblock package.
+
+The most commonly used scraper is the NewsItemListDetailScraper. It handles
+scraping list/detail types of sites, and creating or updating NewsItem
+objects.
+
+Generally, to run a scraper, you need to instantiate it, and then call its
+update method. Sometimes the scraper will take arguments, but it varies on a
+case-by-case basis. You can read the scrapers in the everyblock package for
+examples. You can also run a scraper by calling its display_data method. This
+will run the scraper, but won't actually save any of the scraped data. It's
+very useful for debugging, or when writing a scraper for the first time.
+
+All of the methods and parameters you'll need to use are documented in
+docstrings of ebdata.retrieval.scrapers.list_detail.ListDetailScraper and in
+ebdata.retrieval.scrapers.newsitem_list_detail.NewsItemListDetailScraper.
+ListDetailScraper is a base class of NewsItemListDetailScraper that handles
+scraping, but doesn't actually have any methods for saving data.
+
+The retrieval package also contains updaterdaemon, which is a cron-like
+facility for running scrapers. It comes with a unix-style init script, and its
+configuration and examples are in ebdata/retrieval/updaterdaemon/config.py.
+
+
+ebdata.templatemaker
+====================
+
+The templatemaker package contains utilities for detecting the actual content
+given a set of html pages that were generated from a template. For instance,
+templatemaker helps detect and extract the actual article from a page that
+could also contain navigation links, ads, etc.
+
+
+ebdata.textmining
+=================
+
+The textmining package contains utilities for preprocessing html to strip out
+things that templatemaker doesn't care about like comments, scripts, styles,
+meta information, etc.
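
As the retrieval section above describes, a scraper is normally driven by instantiating it and calling its update method, or its display_data method for a dry run that saves nothing. A minimal sketch, assuming a hypothetical NewsItemListDetailScraper subclass called CrimeScraper living somewhere in the everyblock package (both the class and its import path are placeholders):

    from everyblock.somecity.crime.retrieval import CrimeScraper  # hypothetical

    scraper = CrimeScraper()
    scraper.display_data()  # scrape and print, but don't save -- handy while developing
    scraper.update()        # scrape and create/update NewsItem objects
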
0  ebdata/ebdata/__init__.py
No changes.
0  ebdata/ebdata/blobs/__init__.py
No changes.
48 ebdata/ebdata/blobs/auto_purge.py
@@ -0,0 +1,48 @@
+from ebdata.blobs.models import Page, IgnoredDateline
+from ebdata.nlp.datelines import guess_datelines
+from ebpub.streets.models import Suburb
+import re
+
+def dateline_should_be_purged(dateline):
+ dateline = dateline.upper()
+ try:
+ IgnoredDateline.objects.get(dateline=dateline)
+ return True
+ except IgnoredDateline.DoesNotExist:
+ pass
+ try:
+ Suburb.objects.get(normalized_name=dateline)
+ return True
+ except Suburb.DoesNotExist:
+ pass
+ return False
+
+def all_relevant_datelines():
+ """
+ Prints all datelines that are in articles but not in ignored_datelines,
+ for all unharvested Pages in the system.
+ """
+ seen = {}
+ for page in Page.objects.filter(has_addresses__isnull=True, is_pdf=False):
+ for bit in page.mine_page():
+ for dateline in guess_datelines(bit):
+ dateline = dateline.upper()
+ if dateline not in seen and not dateline_should_be_purged(dateline):
+ print dateline
+ seen[dateline] = 1
+
+def page_should_be_purged(paragraph_list):
+ """
+ Returns a tuple of (purge, reason). purge is True if the given list of
+ strings can be safely purged. reason is a string.
+ """
+ datelines = []
+ for para in paragraph_list:
+ datelines.extend(guess_datelines(para))
+ if datelines:
+ dateline_text = ', '.join([str(d) for d in datelines])
+ if not [d for d in datelines if not dateline_should_be_purged(d)]:
+ return (True, 'Dateline(s) %s safe to purge' % dateline_text)
+ else:
+ return (False, 'Dateline(s) %s found but not safe to purge' % dateline_text)
+ return (False, 'No datelines')
27 ebdata/ebdata/blobs/create_seeds.py
@@ -0,0 +1,27 @@
+from ebdata.blobs.models import Seed
+from ebpub.db.models import Schema
+
+def create_rss_seed(url, base_url, rss_full_entry, pretty_name, guess_article_text=True, strip_noise=False):
+ if rss_full_entry:
+ guess_article_text = strip_noise = False
+ if 'www.' in base_url:
+ normalize_www = 2
+ else:
+ normalize_www = 1
+ Seed.objects.create(
+ url=url,
+ base_url=base_url,
+ delay=3,
+ depth=1,
+ is_crawled=False,
+ is_rss_feed=True,
+ is_active=True,
+ rss_full_entry=rss_full_entry,
+ normalize_www=normalize_www,
+ pretty_name=pretty_name,
+ schema=Schema.objects.get(slug='news-articles'),
+ autodetect_locations=True,
+ guess_article_text=guess_article_text,
+ strip_noise=strip_noise,
+ city='',
+ )
192 ebdata/ebdata/blobs/geotagging.py
@@ -0,0 +1,192 @@
+from django.conf import settings
+from ebdata.blobs.auto_purge import page_should_be_purged
+from ebdata.blobs.models import Page
+from ebdata.nlp.addresses import parse_addresses
+from ebpub.db.models import NewsItem, SchemaField, Lookup
+from ebpub.geocoder import SmartGeocoder, AmbiguousResult, DoesNotExist, InvalidBlockButValidStreet
+from ebpub.geocoder.parser.parsing import normalize, ParsingError
+from ebpub.streets.models import Suburb
+from ebpub.utils.text import slugify, smart_excerpt
+import datetime
+import time
+
+
+def save_locations_for_page(p):
+ """
+ Given a Page object, this function parses the text, finds all valid
+ locations and creates a NewsItem for each location.
+ """
+ paragraph_list = p.auto_excerpt()
+ do_purge, no_purge_reason = page_should_be_purged(paragraph_list)
+ robot_report = [no_purge_reason]
+ if do_purge:
+ p.set_no_locations(geocoded_by='confidentrobot')
+ else:
+ if p.seed.autodetect_locations:
+ if not p.article_headline:
+ return
+ if not p.article_date:
+ return
+
+ # Add a paragraph of the article's headline so that we find any/all
+ # addresses in the headline, too.
+ paragraph_list = [p.article_headline] + paragraph_list
+
+ locations, location_report = auto_locations(paragraph_list, p.seed.city)
+ if location_report:
+ robot_report.append(location_report)
+
+ if locations:
+ # Check for existing NewsItems with this exact pub_date,
+ # headline, location_name and source.
+ do_geotag = True
+ try:
+ source_schemafield = SchemaField.objects.get(schema__id=p.seed.schema_id, name='source')
+ except SchemaField.DoesNotExist:
+ pass
+ else:
+ existing_newsitems = NewsItem.objects.filter(schema__id=p.seed.schema_id,
+ pub_date=p.article_date, title=p.article_headline,
+ location_name=locations[0][0]).by_attribute(source_schemafield, p.seed.pretty_name, is_lookup=True).count()
+ if existing_newsitems:
+ robot_report.append('article appears to exist already')
+ do_geotag = False
+ if do_geotag:
+ geotag_page(p.id, p.seed.pretty_name, p.seed.schema, p.url,
+ locations, p.article_headline, p.article_date)
+ p.has_addresses = bool(locations)
+ p.when_geocoded = datetime.datetime.now()
+ p.geocoded_by = 'robot'
+ p.robot_report = '; '.join(robot_report)[:255]
+ p.save()
+
+def geotag_page(page_id, source, schema, url, data_tuples, article_headline, article_date):
+ """
+ Given a Page ID and a list of (location, wkt, excerpt, block) tuples
+ representing the addresses in the page, creates a NewsItem for each
+ address. Returns a list of all created NewsItems.
+ """
+ if not data_tuples:
+ return
+ if not source:
+ raise ValueError('Provide a source')
+ if not url:
+ raise ValueError('Provide a URL')
+ if not article_headline:
+ raise ValueError('Provide an article headline')
+ if not article_date:
+ raise ValueError('Provide an article date')
+ if not isinstance(article_date, datetime.date):
+ article_date = datetime.date(*time.strptime(article_date, '%Y-%m-%d')[:3])
+
+ # If this schema has a "source" SchemaField, then get or create it.
+ try:
+ sf = SchemaField.objects.get(schema__id=schema.id, name='source')
+ except SchemaField.DoesNotExist:
+ source = None
+ else:
+ try:
+ source = Lookup.objects.get(schema_field__id=sf.id, code=source)
+ except Lookup.DoesNotExist:
+ source = Lookup.objects.create(
+ schema_field_id=sf.id,
+ name=source,
+ code=source,
+ slug=slugify(source)[:32],
+ description=''
+ )
+ ni_list = []
+ for location, wkt, excerpt, block in data_tuples:
+ description = excerpt = excerpt.replace('\n', ' ')
+ if source is not None:
+ # u'\u2014' is an em dash.
+ description = u'%s \u2014 %s' % (source.name, description)
+ ni = NewsItem.objects.create(
+ schema=schema,
+ title=article_headline,
+ description=description,
+ url=url,
+ pub_date=article_date,
+ item_date=article_date,
+ location=wkt,
+ location_name=location,
+ block=block,
+ )
+ atts = {'page_id': page_id, 'excerpt': excerpt}
+ if source is not None:
+ atts['source'] = source.id
+ ni.attributes = atts
+ ni_list.append(ni)
+ return ni_list
+
+def auto_locations(paragraph_list, default_city=''):
+ """
+ Given a list of strings, detects all valid, unique addresses and returns a
+ tuple (result, report), where result is a list of tuples in the format
+ (address, wkt, excerpt, block) and report is a string of what happened.
+
+ If default_city is given, it will be used in the geocoding for detected
+ addresses that don't specify a city.
+ """
+ result, report = [], []
+ addresses_seen = set()
+ geocoder = SmartGeocoder()
+ for para in paragraph_list:
+ for addy, city in parse_addresses(para):
+ # Skip addresses if they have a city that's a known suburb.
+ if city and Suburb.objects.filter(normalized_name=normalize(city)).count():
+ report.append('got suburb "%s, %s"' % (addy, city))
+ continue
+
+ # Try geocoding the address. If a city was provided, first try
+ # geocoding with the city, then fall back to just the address
+ # (without the city).
+ point = None
+ attempts = [addy]
+ if default_city:
+ attempts.insert(0, '%s, %s' % (addy, default_city))
+ if city and city.lower() != default_city.lower():
+ attempts.insert(0, '%s, %s' % (addy, city))
+ for attempt in attempts:
+ try:
+ point = geocoder.geocode(attempt)
+ break
+ except AmbiguousResult:
+ report.append('got ambiguous address "%s"' % attempt)
+ # Don't try any other address attempts, because they only
+ # get *more* ambiguous. Plus, the subsequent attempts could
+ # be incorrect. For example, with this:
+ # addy = '100 Broadway'
+ # city = 'Manhattan'
+ # default_city = 'Brooklyn'
+ # There are multiple "100 Broadway" addresses in Manhattan,
+ # so geocoding should fail at this point. It should not
+ # roll back to try the default_city (Brooklyn).
+ break
+ except (DoesNotExist, InvalidBlockButValidStreet):
+ report.append('got nonexistent address "%s"' % attempt)
+ except ParsingError:
+ report.append('got parsing error "%s"' % attempt)
+ if point is None:
+ continue # This address could not be geocoded.
+
+ if point['address'] in addresses_seen:
+ continue
+ if len(para) > 300:
+ try:
+ excerpt = smart_excerpt(para, addy)
+ except ValueError:
+ excerpt = para
+ else:
+ excerpt = para
+ result.append((addy, point['point'], excerpt, point['block']))
+ addresses_seen.add(point['address'])
+ return (result, '; '.join(report))
+
+def save_locations_for_ungeocoded_pages():
+ for p in Page.objects.filter(when_geocoded__isnull=True).iterator():
+ save_locations_for_page(p)
+
+if __name__ == "__main__":
+ from ebdata.retrieval import log_debug
+ save_locations_for_ungeocoded_pages()
60 ebdata/ebdata/blobs/manual.py
@@ -0,0 +1,60 @@
+"""
+Helper functions for manually adding news article NewsItems.
+"""
+
+from ebdata.blobs.models import Seed, Page
+from ebdata.retrieval import UnicodeRetriever
+from ebpub.db.models import Schema
+from ebpub.geocoder import SmartGeocoder
+from geotagging import geotag_page # relative import
+import datetime
+
+def add_newsitem(seed_url, seed_name, url, article_headline, article_date, name_excerpts):
+ schema = Schema.objects.get(slug='news-articles')
+ geocoder = SmartGeocoder()
+ try:
+ s = Seed.objects.get(url=seed_url)
+ except Seed.DoesNotExist:
+ s = Seed.objects.create(
+ url=seed_url,
+ base_url=seed_url,
+ delay=0,
+ depth=0,
+ is_crawled=False,
+ is_rss_feed=False,
+ is_active='t',
+ rss_full_entry=False,
+ normalize_www=3,
+ pretty_name=seed_name,
+ schema=schema,
+ autodetect_locations=True,
+ guess_article_text=False,
+ strip_noise=False,
+ city='',
+ )
+ try:
+ p = Page.objects.get(url=url)
+ except Page.DoesNotExist:
+ html = UnicodeRetriever().get_html(url)
+ p = Page.objects.create(
+ seed=s,
+ url=url,
+ scraped_url=url,
+ html=html,
+ when_crawled=datetime.datetime.now(),
+ is_article=True,
+ is_pdf=False,
+ is_printer_friendly=False,
+ article_headline=article_headline,
+ article_date=article_date,
+ has_addresses=None,
+ when_geocoded=None,
+ geocoded_by='',
+ times_skipped=0,
+ robot_report=''
+ )
+ data_tuples = []
+ for location_name, excerpt in name_excerpts:
+ point = geocoder.geocode(location_name) # Let exceptions bubble up.
+ data_tuples.append((location_name, point['point'], excerpt, point['block']))
+ return geotag_page(p.id, seed_name, schema, url, data_tuples, article_headline, article_date)
181 ebdata/ebdata/blobs/models.py
@@ -0,0 +1,181 @@
+from ebpub.db.models import Schema
+from django.db import models
+import datetime
+
+class Seed(models.Model):
+ url = models.CharField(max_length=512)
+ base_url = models.CharField(max_length=512) # e.g., 'http://www.suntimes.com/'
+ delay = models.SmallIntegerField()
+ depth = models.SmallIntegerField()
+ is_crawled = models.BooleanField()
+ is_rss_feed = models.BooleanField()
+ is_active = models.BooleanField()
+ rss_full_entry = models.BooleanField() # If True, then an RSS <entry> contains the whole article.
+ normalize_www = models.SmallIntegerField() # 1 = Remove www, 2 = Add www, 3 = Ignore subdomain
+ pretty_name = models.CharField(max_length=128) # e.g., 'Chicago Sun-Times'
+ schema = models.ForeignKey(Schema) # news-articles, missed-connections, etc.
+
+ # If True, then Pages from this Seed will be automatically address-detected.
+ autodetect_locations = models.BooleanField()
+
+ # If True, then robot will use templatemaker.articletext.article_text() to
+ # determine Page excerpts.
+ guess_article_text = models.BooleanField()
+
+ # If True, then robot will use templatemaker.clean.strip_template() to
+ # determine Page excerpts.
+ strip_noise = models.BooleanField()
+
+ # An uppercase string of the city that this seed covers -- e.g., 'BROOKLYN'.
+ # If given, this city will be used to disambiguate addresses in automatic
+ # geocoding.
+ city = models.CharField(max_length=64, blank=True)
+
+ def __unicode__(self):
+ return self.url
+
+class PageManager(models.Manager):
+ def increment_skip(self, page_id):
+ # Use this to increment the 'times_skipped' column atomically.
+ # I.e., it's better to use this than to call save() on Page objects,
+ # because that introduces the possibility of clashes.
+ from django.db import connection
+ cursor = connection.cursor()
+ cursor.execute("UPDATE %s SET times_skipped = times_skipped + 1 WHERE id = %%s" % Page._meta.db_table, (page_id,))
+ connection._commit()
+
+ def next_ungeocoded(self, seed_id):
+ "Returns the next ungeocoded Page for the given seed_id."
+ try:
+ return self.select_related().filter(has_addresses__isnull=True, is_article=True, seed__id=seed_id).order_by('times_skipped', 'when_crawled')[0]
+ except IndexError:
+ raise self.model.DoesNotExist
+
+class Page(models.Model):
+ seed = models.ForeignKey(Seed)
+
+ # The publicly displayed URL for this page.
+ url = models.CharField(max_length=512, db_index=True)
+
+ # The URL that we actually scraped for this page (possibly a
+ # printer-friendly version).
+ scraped_url = models.CharField(max_length=512)
+
+ html = models.TextField()
+ when_crawled = models.DateTimeField()
+
+ # Whether this page is an "article," as opposed to some sort of index page.
+ is_article = models.NullBooleanField()
+
+ # Whether this page is the extracted text of a PDF.
+ is_pdf = models.BooleanField()
+
+ # Whether this page is the printer-friendly version.
+ is_printer_friendly = models.BooleanField()
+
+ article_headline = models.CharField(max_length=255, blank=True)
+ article_date = models.DateField(blank=True, null=True)
+
+ # True = addresses were found
+ # False = addresses were not found
+ # None = page has not yet been examined
+ has_addresses = models.NullBooleanField()
+
+ when_geocoded = models.DateTimeField(blank=True, null=True)
+ geocoded_by = models.CharField(max_length=32, blank=True)
+
+ # The number of times this page has been "skipped" in the blob geocoder.
+ times_skipped = models.SmallIntegerField()
+
+ robot_report = models.CharField(max_length=255, blank=True)
+
+ objects = PageManager()
+
+ def __unicode__(self):
+ return u'%s scraped %s' % (self.url, self.when_crawled)
+
+ def set_no_locations(self, geocoded_by='robot'):
+ """
+ Marks this Page as geocoded with no locations. Does NOT save it.
+ """
+ self.has_addresses = False
+ self.when_geocoded = datetime.datetime.now()
+ self.geocoded_by = geocoded_by
+ set_no_locations.alters_data = True
+
+ def mine_page(self):
+ """
+ Runs templatemaker on this Page and returns the raw mined content, as
+ a list of strings.
+ """
+ from ebdata.templatemaker.webmining import mine_page
+ try:
+ other_page = self.companion_page()
+ except IndexError:
+ return [self.html]
+ return mine_page(self.html, [other_page.html])
+
+ def auto_excerpt(self):
+ """
+ Attempts to detect the text of this page (ignoring all navigation and
+ other clutter), returning a list of strings. Each string represents a
+ paragraph.
+ """
+ from ebdata.textmining.treeutils import make_tree
+ tree = make_tree(self.html)
+ if self.seed.rss_full_entry:
+ from ebdata.templatemaker.textlist import html_to_paragraph_list
+ paras = html_to_paragraph_list(tree)
+ else:
+ if self.seed.strip_noise:
+ from ebdata.templatemaker.clean import strip_template
+ try:
+ html2 = self.companion_page().html
+ except IndexError:
+ pass
+ else:
+ tree2 = make_tree(html2)
+ strip_template(tree, tree2)
+ if self.seed.guess_article_text:
+ from ebdata.templatemaker.articletext import article_text
+ paras = article_text(tree)
+ else:
+ from ebdata.templatemaker.textlist import html_to_paragraph_list
+ paras = html_to_paragraph_list(tree)
+ return paras
+
+ def companion_page(self):
+ """
+ Returns another Page for self.seed, for use in a templatemaker
+ duplicate-detection algorithm. Raises IndexError if none exist.
+ """
+ # To avoid the problem of site redesigns affecting the layout, get an
+ # example page that was crawled *just before* the current Page.
+ try:
+ return Page.objects.filter(seed__id=self.seed_id, is_article=True,
+ when_crawled__lt=self.when_crawled, is_pdf=False).order_by('-when_crawled')[0]
+ except IndexError:
+ # If no pages were crawled directly before this one, then get the page
+ # that was crawled directly *after* this one.
+ return Page.objects.filter(seed__id=self.seed_id, is_article=True,
+ when_crawled__gt=self.when_crawled, is_pdf=False).order_by('when_crawled')[0]
+
+ def newsitems(self):
+ """
+ Returns a list of {excerpt, location_name} dictionaries for every
+ location found in this Page, or an empty list if it has no addresses.
+ """
+ from ebpub.db.models import Attribute, SchemaField
+ if not self.has_addresses:
+ return []
+ # First, figure out the SchemaFields.
+ real_names = dict([(sf.name, sf.real_name.encode('utf8')) for sf in SchemaField.objects.filter(schema__id=self.seed.schema_id, name__in=('excerpt', 'page_id'))])
+ return [{'id': att.news_item_id, 'url': att.news_item.item_url_with_domain(), 'excerpt': getattr(att, real_names['excerpt']), 'location_name': att.news_item.location_name} \
+ for att in Attribute.objects.select_related().filter(**{real_names['page_id']: self.id, 'schema__id': self.seed.schema_id})]
+
+# Datelines that should be ignored by the blob updater.
+class IgnoredDateline(models.Model):
+ dateline = models.CharField(max_length=255, unique=True)
+
+ def __unicode__(self):
+ return self.dateline
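
Page.auto_excerpt() above is a concrete example of the pipeline the ebdata README describes: ebdata.textmining preprocesses the HTML into a tree, and ebdata.templatemaker reduces it to text. A condensed sketch of that same path, where html stands for any raw page source you supply:

    from ebdata.textmining.treeutils import make_tree
    from ebdata.templatemaker.textlist import html_to_paragraph_list

    tree = make_tree(html)                     # strip scripts, styles, comments, etc.
    paragraphs = html_to_paragraph_list(tree)  # list of paragraph strings
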
205 ebdata/ebdata/blobs/scrapers.py
@@ -0,0 +1,205 @@
+"""
+Generic scrapers that create Pages based on some common Web site patterns.
+"""
+
+from django.conf import settings
+from django.utils.html import strip_tags
+from ebdata.blobs.geotagging import save_locations_for_page
+from ebdata.blobs.models import Seed, Page
+from ebdata.retrieval import UnicodeRetriever, RetrievalError
+from ebdata.retrieval import log # Register the logging hooks.
+from ebpub.utils.dates import parse_date
+import datetime
+import logging
+
+class NoPagesYet(Exception):
+ pass
+
+class NoSeedYet(Exception):
+ pass
+
+class SpecializedCrawler(object):
+ """
+ Base class for Page crawlers.
+ """
+
+ schema = None
+ seed_url = None
+ date_headline_re = None
+ date_format = None
+ retriever = None
+
+ def __init__(self):
+ try:
+ self.seed = Seed.objects.get(url=self.seed_url)
+ except Seed.DoesNotExist:
+ raise NoSeedYet('You need to add a Seed with the URL %r' % self.seed_url)
+ self.logger = logging.getLogger('eb.retrieval.%s.%s' % (settings.SHORT_NAME, self.schema))
+ if self.retriever is None:
+ self.retriever = UnicodeRetriever(cache=None, sleep=self.seed.delay)
+
+ def save_page(self, unique_id):
+ """
+ Downloads the page with the given unique ID (possibly a numeric ID, or
+ a URL) and saves it as a Page object. Returns the Page object, or None
+ if the page couldn't be found.
+
+ The page won't be retrieved/saved if it's already in the database. In
+ this case, the existing Page object will be returned.
+ """
+ self.logger.debug('save_page(%s)', unique_id)
+ retrieval_url = self.retrieval_url(unique_id)
+ public_url = self.public_url(unique_id)
+
+ try:
+ p = Page.objects.get(seed__id=self.seed.id, url=public_url)
+ except Page.DoesNotExist:
+ pass
+ else:
+ self.logger.debug('Skipping already-saved URL %s', public_url)
+ return p
+
+ try:
+ html = self.retriever.get_html(retrieval_url).strip()
+ except (RetrievalError, UnicodeDecodeError):
+ return None
+ if not html:
+ self.logger.debug('Got empty page for %s', retrieval_url)
+ return None
+ self.logger.debug('Got VALID page for %s', retrieval_url)
+
+ m = self.date_headline_re.search(html)
+ if not m:
+ self.logger.debug('Could not find date/headline on %s', retrieval_url)
+ return None
+ article_date, article_headline = m.groupdict()['article_date'], m.groupdict()['article_headline']
+ try:
+ article_date = parse_date(article_date, self.date_format)
+ except ValueError:
+ self.logger.debug('Got unparseable date %r on %s', article_date, retrieval_url)
+ return None
+ article_headline = strip_tags(article_headline)
+ if len(article_headline) > 255:
+ article_headline = article_headline[:252] + '...'
+
+ p = Page.objects.create(
+ seed=self.seed,
+ url=public_url,
+ scraped_url=retrieval_url,
+ html=html,
+ when_crawled=datetime.datetime.now(),
+ is_article=True,
+ is_pdf=False,
+ is_printer_friendly=False,
+ article_headline=article_headline,
+ article_date=article_date,
+ has_addresses=None,
+ when_geocoded=None,
+ geocoded_by='',
+ times_skipped=0,
+ robot_report='',
+ )
+ self.logger.debug('Created Page ID %s' % p.id)
+ save_locations_for_page(p)
+ return p
+
+ ######################################
+ # METHODS SUBCLASSES SHOULD OVERRIDE #
+ ######################################
+
+ def public_url(self, unique_id):
+ "Given the ID value, returns the URL that we should publish."
+ raise NotImplementedError()
+
+ def retrieval_url(self, unique_id):
+ "Given the ID value, returns the URL that we should scrape."
+ return self.public_url(unique_id)
+
+class IncrementalCrawler(SpecializedCrawler):
+ """
+ Crawler that populates the blobs.Page table by incrementing IDs.
+
+ This is a very "dumb" but effective technique for crawling sites such
+ as cityofchicago.org whose pages have incremental ID numbers.
+
+ LIMITATIONS/ASSUMPTIONS:
+
+ * This assumes that the URL for each retrieved page is in the same format,
+ such that ordering by the URL will result in the highest ID.
+ * This assumes that a Seed exists with url=self.seed_url.
+ * Before running update(), at least one Page with the given seed must
+ exist. Otherwise the retriever won't know what the latest page is!
+ """
+
+ max_blanks = 10
+
+ ##################################################
+ # METHODS SUBCLASSES SHOULD NOT HAVE TO OVERRIDE #
+ ##################################################
+
+ def max_id(self):
+ "Returns the ID of the latest page we've already crawled."
+ try:
+ latest_page = Page.objects.filter(seed__id=self.seed.id).order_by('-url')[0]
+ except IndexError:
+ raise NoPagesYet('Seed ID %s has no pages yet' % self.seed.id)
+ return int(self.id_for_url(latest_page.url))
+
+ def update(self):
+ """
+ Determines the ID of the latest page we've already crawled, and crawls
+ until self.max_blanks blank pages are reached.
+ """
+ current_id = self.max_id()
+ num_blanks = 0
+ while num_blanks < self.max_blanks:
+ current_id += 1
+ page = self.save_page(current_id)
+ if page:
+ num_blanks = 0
+ else:
+ num_blanks += 1
+
+ def save_id_range(self, first_id, last_id):
+ """
+ Downloads and saves Pages for the given ID range, inclusive. Pages
+ won't be saved if they're already in the database.
+ """
+ for id_value in range(int(first_id), int(last_id)+1):
+ self.save_page(id_value)
+
+ ######################################
+ # METHODS SUBCLASSES SHOULD OVERRIDE #
+ ######################################
+
+ def id_for_url(self, url):
+ "Given a URL, returns its ID value. This can be either a string or int."
+ raise NotImplementedError()
+
+class PageAreaCrawler(SpecializedCrawler):
+ """
+ Crawler that finds specific links on a given index page (seed_url)
+ and creates a blobs.Page for each link that hasn't yet been created.
+ """
+
+ ##################################################
+ # METHODS SUBCLASSES SHOULD NOT HAVE TO OVERRIDE #
+ ##################################################
+
+ def update(self):
+ seed_html = self.retriever.get_html(self.seed_url)
+ for url in self.get_links(seed_html):
+ self.save_page(url)
+
+ def public_url(self, unique_id):
+ return unique_id
+
+ ######################################
+ # METHODS SUBCLASSES SHOULD OVERRIDE #
+ ######################################
+
+ def get_links(self, html):
+ """
+ Given the seed HTML, returns the list of links.
+ """
+ raise NotImplementedError()
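
The docstrings above spell out what a subclass must supply. A minimal sketch of an IncrementalCrawler subclass; every concrete value here (schema slug, seed URL, regex, date format, URL pattern) is a made-up placeholder rather than anything from this commit:

    import re
    from ebdata.blobs.scrapers import IncrementalCrawler

    class ExampleCityCrawler(IncrementalCrawler):
        schema = 'news-articles'
        seed_url = 'http://www.example.gov/news/'  # a Seed with this URL must already exist
        date_headline_re = re.compile(
            r'<h1>(?P<article_headline>.*?)</h1>\s*'
            r'<p class="date">(?P<article_date>\d{4}-\d{2}-\d{2})</p>', re.DOTALL)
        date_format = '%Y-%m-%d'

        def public_url(self, unique_id):
            return 'http://www.example.gov/news/%s.html' % unique_id

        def id_for_url(self, url):
            return url.split('/')[-1].split('.')[0]

    # ExampleCityCrawler().update() crawls forward from the highest already-saved
    # ID until max_blanks consecutive misses, creating a Page for each article found.
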
307 ebdata/ebdata/blobs/update_feeds.py
@@ -0,0 +1,307 @@
+"""
+RSS-feed retriever
+"""
+
+from ebdata.blobs.geotagging import save_locations_for_page
+from ebdata.blobs.models import Seed, Page
+from ebdata.retrieval import UnicodeRetriever
+from ebdata.retrieval import log # Register the logging hooks.
+from ebdata.templatemaker.htmlutils import printer_friendly_link
+from ebdata.textmining.treeutils import make_tree
+from ebpub.utils.dates import parse_date
+import feedparser
+import cgi
+import datetime
+import logging
+import re
+import time
+import urllib
+import urlparse
+
+strip_tags = lambda x: re.sub(r'<[^>]*>', ' ', x).replace('&nbsp;', ' ')
+server_authority_re = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$')
+url_collapse_re = re.compile('([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')
+
+def remove_query_string(url):
+ bits = urlparse.urlparse(url)
+ return urlparse.urlunparse(bits[:4] + ('',) + bits[5:])
+
+def add_query_string(url, new_values):
+ bits = urlparse.urlparse(url)
+ qs = cgi.parse_qs(bits[4], keep_blank_values=True)
+ qs.update(new_values)
+ return urlparse.urlunparse(bits[:4] + (urllib.urlencode(qs, doseq=True),) + bits[5:])
+
+def normalize_url(base_href, url, normalize_www_flag):
+ """
+ Normalizes the given URL:
+ * Joins it with base_href if it doesn't already have a domain.
+ * Lowercases the scheme (WWW.GOOGLE.COM -> www.google.com).
+ * Removes the port (80 or 443) if it's default.
+ * Collapses '../' and './'.
+ * Alphabetizes the query string by its keys.
+ * If it ends in '/index.html', removes the 'index.html'.
+ * Normalizes the 'www.' subdomain according to normalize_www_flag.
+ Returns None if the URL is invalid.
+
+ normalize_www_flag should be either 1, 2 or 3:
+ * 1 = Remove the 'www.' subdomain, if it exists.
+ * 2 = Add a 'www.' subdomain, if a subdomain doesn't exist.
+ * 3 = Don't touch the subdomain.
+ """
+ # Inspired by http://www.mnot.net/python/urlnorm.py -- BSD license.
+ url = urlparse.urljoin(base_href, url)
+ scheme, authority, path, parameters, query, fragment = urlparse.urlparse(url)
+ scheme = scheme.lower()
+ if '.' not in authority:
+ return None
+ if authority:
+ userinfo, host, port = server_authority_re.match(authority).groups()
+ if host[-1] == '.':
+ host = host[:-1]
+
+ # Normalize the www subdomain, if necessary.
+ if normalize_www_flag == 1 and host.startswith('www.'):
+ host = host[4:]
+ elif normalize_www_flag == 2 and host.count('.') == 1:
+ host = 'www.' + host
+
+ authority = host.lower()
+ if userinfo:
+ authority = "%s@%s" % (userinfo, authority)
+ if port and port != {'http': '80', 'https': '443'}.get(scheme):
+ authority = "%s:%s" % (authority, port)
+
+ if scheme.startswith('http'):
+ last_path = path
+ while 1:
+ path = url_collapse_re.sub('/', path, 1)
+ if last_path == path:
+ break
+ last_path = path
+ if not path:
+ path = '/'
+ if path.endswith('/index.html'):
+ path = path[:-10] # Trim trailing "index.html".
+ if query:
+ # Reorder the query string to alphabetize the keys.
+ query_bits = sorted(cgi.parse_qsl(query, keep_blank_values=True))
+ query = '&'.join(['%s=%s' % (k, v) for k, v in query_bits])
+ return urlparse.urlunparse((scheme, authority, path, parameters, query, ''))
+
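
For illustration, a doctest-style sketch of the normalization described in the docstring above (made-up URLs, traced against the rules listed there):

    normalize_url('http://www.example.com/', 'a/../b/./index.html?z=2&a=1', 3)
    # -> 'http://www.example.com/b/?a=1&z=2'
    #    (dot segments collapsed, 'index.html' trimmed, query keys alphabetized)

    normalize_url('http://www.example.com/', '/news/story.html', 1)
    # -> 'http://example.com/news/story.html'   (flag 1 strips the 'www.')
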
+try:
+ # any() built-in only in Python >= 2.5
+ any
+except NameError:
+ def any(iterable):
+ for element in iterable:
+ if element:
+ return True
+ return False
+
+class FeedUpdater(object):
+ def __init__(self, seed, retriever, logger):
+ self.seed = seed
+ self.retriever = retriever
+ self.logger = logger
+
+ def update(self):
+ try:
+ feed = feedparser.parse(self.seed.url)
+ except UnicodeDecodeError:
+ self.logger.info('UnicodeDecodeError on %r', self.seed.url)
+ return
+ for entry in feed['entries']:
+ if 'feedburner_origlink' in entry:
+ url = entry['feedburner_origlink']
+ elif 'pheedo_origLink' in entry:
+ url = entry['pheedo_origLink']
+ elif 'link' in entry:
+ url = entry['link']
+ else:
+ continue # Skip entries with no link.
+
+ try:
+ url = normalize_url(self.seed.base_url, url, self.seed.normalize_www)
+ except Exception:
+ self.logger.warn('Problem normalizing URL: %r, %r, %r', self.seed.base_url, url, self.seed.normalize_www)
+ continue
+
+ if not url:
+ self.logger.info('Skipping article with empty URL: %r, %r', self.seed.base_url, url)
+ continue
+
+ if len(url) > 512:
+ self.logger.warning('Skipping long URL %s', url)
+ continue
+
+ article_date = entry.get('updated_parsed') and datetime.date(*entry['updated_parsed'][:3]) or None
+ if article_date and article_date > datetime.date.today():
+ # Skip articles in the future, because sometimes articles show
+ # up in the feed before they show up on the site, and we don't
+ # want to retrieve the article until it actually exists.
+ self.logger.info('Skipping article_date %s, which is in the future', article_date)
+ continue
+
+ url = self.normalize_url(url)
+
+ try:
+ title = entry['title']
+ except KeyError:
+ self.logger.debug('Skipping %s due to missing title', url)
+ continue
+
+ if not self.download_page(url, title):
+ self.logger.debug('Skipping %s due to download_page()', url)
+ continue
+
+ # If we've already retrieved the page, there's no need to retrieve
+ # it again.
+ try:
+ Page.objects.filter(url=url)[0]
+ except IndexError:
+ pass
+ else:
+ self.logger.debug('URL %s has already been retrieved', url)
+ continue
+
+ # If this seed contains the full content in the RSS feed <summary>,
+ # then we just use it instead of downloading the contents.
+ if self.seed.rss_full_entry:
+ is_printer_friendly = False
+ try:
+ html = entry['summary']
+ except KeyError:
+ html = entry['description']
+ else:
+ is_printer_friendly = False
+ html = None
+ time.sleep(self.seed.delay)
+
+            # First, try deducing the printer-friendly page's URL from the article URL.
+ print_url = self.get_printer_friendly_url(url)
+ if print_url is not None:
+ try:
+ html = self.get_article_page(print_url)
+ is_printer_friendly = True
+ except Exception, e:
+ self.logger.info('Error retrieving supposedly accurate printer-friendly page %s: %s', print_url, e)
+
+ # If a printer-friendly page didn't exist, get the real page.
+ if html is None:
+ try:
+ html = self.get_article_page(url)
+ except Exception, e:
+ self.logger.info('Error retrieving %s: %s', url, e)
+ continue
+
+ # If a page was downloaded, try looking for a printer-friendly
+ # link, and download that.
+ print_page = self.get_printer_friendly_page(html, url)
+ if print_page is not None:
+ is_printer_friendly = True
+ html = print_page
+
+ new_html = self.scrape_article_from_page(html)
+ if new_html is not None:
+ html = new_html
+
+ if article_date is None:
+ article_date = self.scrape_article_date_from_page(html)
+
+ if not html.strip():
+ self.logger.debug('Got empty HTML page')
+ continue
+
+ article_headline = strip_tags(title)
+ if len(article_headline) > 252:
+                article_headline = article_headline[:252] + '...'
+ p = Page.objects.create(
+ seed=self.seed,
+ url=url,
+ scraped_url=(is_printer_friendly and print_url or url),
+ html=html,
+ when_crawled=datetime.datetime.now(),
+ is_article=True,
+ is_pdf=False,
+ is_printer_friendly=is_printer_friendly,
+ article_headline=article_headline,
+ article_date=article_date,
+ has_addresses=None,
+ when_geocoded=None,
+ geocoded_by='',
+ times_skipped=0,
+ robot_report='',
+ )
+ self.logger.info('Created %s story %r', self.seed.base_url, article_headline)
+ save_locations_for_page(p)
+
+ def normalize_url(self, url):
+ """
+ Given the article URL, returns a normalized version of the URL.
+ """
+ return url
+
+ def download_page(self, url, article_headline):
+ """
+ Given the URL and headline from RSS, returns True if this page should
+ be downloaded, and False if it can be skipped.
+ """
+ return True
+
+ def get_article_page(self, url):
+ return self.retriever.get_html(url)
+
+ def get_printer_friendly_url(self, url):
+ """
+ Given a story URL, returns the printer-friendly URL, or None if it
+ can't be determined.
+ """
+ return None
+
+ def get_printer_friendly_page(self, html, url):
+ """
+ Parses the given detail page and returns the printer-friendly page, or
+ None if not found.
+ """
+ print_link = printer_friendly_link(make_tree(html))
+ if print_link:
+ print_link = urlparse.urljoin(url, print_link)
+ try:
+ return self.get_article_page(print_link)
+ except Exception, e:
+                self.logger.debug('Error retrieving printer-friendly page %s: %s', print_link, e)
+ return None
+ else:
+ return None
+
+ def scrape_article_from_page(self, html):
+ """
+ Parses the given detail page and returns the article as a string, or
+ None if it can't be found.
+ """
+ return html
+
+ def scrape_article_date_from_page(self, html):
+ """
+ Parses the given detail page and returns the article date as a
+ datetime.date, or None if it can't be found.
+ """
+ return None
+
+def update(seed_id=None):
+ """
+ Retrieves and saves every new item for every Seed that is an RSS feed.
+ """
+ retriever = UnicodeRetriever(cache=None)
+ logger = logging.getLogger('eb.retrieval.blob_rss')
+ qs = Seed.objects.filter(is_rss_feed=True, is_active=True)
+ if seed_id is not None:
+ qs = qs.filter(id=seed_id)
+ for seed in qs:
+ updater = FeedUpdater(seed, retriever, logger)
+ updater.update()
+
+if __name__ == "__main__":
+ from ebdata.retrieval import log_debug
+ update()
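
The hooks under FeedUpdater are meant to be overridden per site. A minimal, hypothetical subclass sketch (ExampleSiteUpdater and the '?print=1' convention are invented for illustration; assumes the project environment so the module imports cleanly):

    from ebdata.blobs.update_feeds import FeedUpdater, add_query_string

    class ExampleSiteUpdater(FeedUpdater):
        def get_printer_friendly_url(self, url):
            # Guess the print view by adding a query parameter; update()
            # falls back to the regular article page if retrieval fails.
            return add_query_string(url, {'print': '1'})

        def download_page(self, url, article_headline):
            # Skip wire-service roundups this hypothetical site republishes.
            return 'roundup' not in url
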
0  ebdata/ebdata/nlp/__init__.py
No changes.
297 ebdata/ebdata/nlp/addresses.py
@@ -0,0 +1,297 @@
+import re
+
+# Regex notes:
+# * This is *not* a case-insensitive regex, because we assume
+# capitalized words are special (street names).
+# * All data matched by capturing parentheses is concatenated together, so
+# if you don't want to include something in the resulting string, don't
+# capture it.
+
+# STREET_NAME is a fragment of a regular expression that is used in several
+# places in our "real" regular expression (ADDRESSES_RE) below. The one tricky
+# thing about it is that it includes a "CAPTURE_START" placeholder instead of
+# a capturing opening parenthesis. This lets us create two versions of the
+# regex -- STREET_NAME_CAPTURE and STREET_NAME_NOCAPTURE.
+STREET_NAME = r"""
+ # Here, we define some common false positives and tell the regex to ignore them.
+ (?!
+ [Aa][Ss][Ss][Oo][Cc][Ii][Aa][Tt][Ee][Dd]\ [Pp][Rr][Ee][Ss][Ss] # associated press
+ |
+ [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\ [Oo][Ff] # university of
+ )
+ # DIRECTION
+ %(CAPTURE_START)s
+ (?:
+ [NSEWnsew]\.?
+ |
+ (?:
+ [Nn][Oo][Rr][Tt][Hh] |
+ [Ss][Oo][Uu][Tt][Hh] |
+ [Ee][Aa][Ss][Tt] |
+ [Ww][Ee][Ss][Tt] |
+ [Nn][Oo][Rr][Tt][Hh][Ee][Aa][Ss][Tt] |
+          [Nn][Oo][Rr][Tt][Hh][Ww][Ee][Ss][Tt] |
+ [Ss][Oo][Uu][Tt][Hh][Ee][Aa][Ss][Tt] |
+ [Ss][Oo][Uu][Tt][Hh][Ww][Ee][Ss][Tt]
+ )
+ |
+ (?:
+ N\.?W | S\.?W | N\.?E | S\.?E
+ )\.?
+ )
+ \ + # space (but not newline)
+ )?
+ (?:
+ # STREET NAME
+ %(CAPTURE_START)s
+ # Numbered street names with a suffix ("3rd", "4th").
+ \d+(?:st|ST|nd|ND|rd|RD|th|TH|d|D)
+
+ |
+
+ # Or, numbered street names without a suffix ("3", "4")
+ # but with a street type.
+ \d+
+ (?=
+ \ +
+ (?:Ave|Avenue|Blvd|Boulevard|Bvd|Cir|Circle|Court|Ct|Dr|Drive|
+ Lane|Ln|Parkway|Pkwy|Place|Plaza|Pl|Plz|Point|Pt|Pts|Rd|Rte|
+ Sq|Sqs|Street|Streets|St|Sts|Terrace|Ter|Terr|Trl|Way|Wy
+ )
+ \b
+ )
+
+ |
+
+ # Or, street names that don't start with numbers.
+ (?:
+ # Optional prefixes --
+ # "St", as in "St Louis"
+ # "Dr. Martin", as in "Dr. Martin Luther King"
+ (?:
+ [Ss][Tt]\.?
+ |
+ [Dd][Rr]\.?\ [Mm][Aa][Rr][Tt][Ii][Nn]
+ )
+ \ +
+ )?
+ (?:
+            Mass\.(?=\ +[Aa]ve)    # Special case: "Mass." abbr. for "Massachusetts Ave."
+ # Needs to be special-cased because of the period.
+ |
+ (?:Avenue|Ave\.?)\ +[A-Z] # Special case: "Avenue X"
+ |
+ [A-Z][a-z][A-Za-z]* # One initial-capped word
+ |
+ [A-Z]\b # Single-letter street name (e.g., K St. in DC)
+ (?!\.\w) # Avoid '20 U.S.A.'
+ )
+ )
+ (?:
+ # Here, we list the options with street suffixes first, so that
+ # the suffix abbreviations are treated as the last part of the
+ # street name, to avoid overeagerly capturing "123 Main St. The".
+ %(CAPTURE_START)s
+ \ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
+ |
+ \ +[A-Z][a-z][A-Za-z]*\ (?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
+ |
+ (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){2}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
+ |
+ (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){3}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
+ |
+ (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){4}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
+ |
+ (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){5}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
+ |
+ (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){1,5}
+ )?
+ # OPTIONAL POST-DIR
+ (?:
+ # Standard post-dir format
+ %(CAPTURE_START)s
+ ,?\s(?:N\.?E|S\.?E|N\.?W|S\.?W|N|S|E|W)\.?
+ )
+ # Avoid greedily capturing more letters, like
+ # '123 Main St, New England' to '123 Main St, N'
+ (?![A-Za-z])
+
+ |
+
+ # Or, a special-case for DC quadrants, to find stuff like:
+ # "600 H Street in NE Washington"
+ # "600 H Street in the NE quadrant"
+ # "600 H Street in northeast DC"
+
+ # Note that this is NOT captured, so that it's excluded from
+ # the final output.
+ ,?
+ \s in
+ %(CAPTURE_START)s
+ \s
+ )
+ (?:
+ (?:the|far) \s
+ )?
+
+ %(CAPTURE_START)s
+ (?:NE|SE|NW|SW|[Nn]ortheast|[Ss]outheast|[Nn]orthwest|[Ss]outhwest)
+ (?=
+ \s (?:quadrant|D\.?C\.?|Washington)
+ )
+ )
+ )?
+ )?
+ )
+"""
+STREET_NAME_CAPTURE = STREET_NAME % {'CAPTURE_START': '('}
+STREET_NAME_NOCAPTURE = STREET_NAME % {'CAPTURE_START': '(?:'}
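
A quick sanity check of the CAPTURE_START trick (not part of the module): the same fragment compiles with one capturing group per placeholder slot in one variant and none in the other.

    import re
    re.compile('(?x)' + STREET_NAME_CAPTURE).groups     # -> 6, one per CAPTURE_START slot above
    re.compile('(?x)' + STREET_NAME_NOCAPTURE).groups   # -> 0
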
+
+ADDRESSES_RE = re.compile(r"""(?x)
+ (?<!-|/|:|,|\.|\$) # These various characters are not allowed before an address/intersection.
+ \b
+
+ # Ignore things that look like dates -- e.g., "21 May 2009".
+ # This is a problem e.g. in cases where there's a May Street.
+ (?!
+ \d+\s+
+ (?:January|February|March|April|May|June|July|August|September|October|November|December)
+ ,?\s+
+ \d\d\d\d
+ )
+
+ # Ignore intersections that are prefixed by "University of", like
+ # "University of Texas at Austin". This is a common false positive.
+ (?<!
+ [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\s[Oo][Ff]\s
+ )
+
+ (?:
+ # SEGMENT ("FOO BETWEEN BAR AND BAZ")
+ (?:
+ %(STREET_NAME_CAPTURE)s (,?\ + between \ +) %(STREET_NAME_CAPTURE)s (,?\ + and \ +) %(STREET_NAME_CAPTURE)s
+ |
+ %(STREET_NAME_CAPTURE)s (,?\ + from \ +) %(STREET_NAME_CAPTURE)s (,?\ + to \ +) %(STREET_NAME_CAPTURE)s
+ )
+
+ |
+
+ # BLOCK/ADDRESS
+ (?:
+ (
+ (?:
+ (?:\d+|[Ff][Ii][Rr][Ss][Tt])[-\ ]
+ (?:(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ )?
+ [Bb][Ll][Oo][Cc][Kk]\ [Oo][Ff]
+ |
+ \d+\ *-\ *\d+
+ |
+ \d+
+ )
+ \ +
+ )
+ %(STREET_NAME_CAPTURE)s
+
+ # ignore the intersection in parenthesis so that it's not picked
+ # up as a separate location. We do this by consuming the string
+ # but *not* capturing it.
+ (?:
+ \ +
+ \(?
+ between
+ \ +
+ %(STREET_NAME_NOCAPTURE)s
+ \ +
+ and
+ \ +
+ %(STREET_NAME_NOCAPTURE)s
+ \)?
+ )?
+ )
+
+ |
+
+ # INTERSECTION
+ (?:
+ # Common intersection prefixes. They're included here so that the
+ # regex doesn't include them as part of the street name.
+ (?:
+ (?:
+ [Nn]ear |
+ [Aa]t |
+ [Oo]n |
+ [Tt]o |
+ [Aa]round |
+ [Ii]ntersection\ of |
+ [Cc]orner\ of |
+ [Aa]rea\ of |
+ [Aa]reas?\ surrounding |
+ vicinity\ of |
+ ran\ down |
+ running\ down |
+ crossed
+ )
+ \ +
+ )?
+ \b
+ (?:%(STREET_NAME_CAPTURE)s)
+ (\ +)
+ (
+ (?:
+ [Aa][Nn][Dd] |
+ [Aa][Tt] |
+ [Nn][Ee][Aa][Rr] |
+ & |
+ [Aa][Rr][Oo][Uu][Nn][Dd] |
+ [Tt][Oo][Ww][Aa][Rr][Dd][Ss]? |
+ [Oo][Ff][Ff] |
+ (?:[Jj][Uu][Ss][Tt]\ )?(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ [Oo][Ff] |
+ (?:[Jj][Uu][Ss][Tt]\ )?[Pp][Aa][Ss][Tt]
+ )
+ \ +
+ )
+ (?:%(STREET_NAME_CAPTURE)s)
+ )
+ )
+
+ # OPTIONAL CITY SUFFIX
+ (?:
+ (?:
+ ,?\s+in |
+ ,
+ )
+ \s+
+
+ # CITY NAME
+ (
+ [A-Z][a-z][A-Za-z]* # One initial-capped word
+ (?:
+ ,?\ Jr\.?,?
+ |
+ \ [A-Z][a-z][A-Za-z]*
+ |
+ -[A-Za-z]+ # Hyphenated words (e.g. "Croton-on-Hudson" in NY)
+ ){0,4} # Initial-capped words
+ )
+ )?
+ """ % {'STREET_NAME_CAPTURE': STREET_NAME_CAPTURE, 'STREET_NAME_NOCAPTURE': STREET_NAME_NOCAPTURE})
+
+def parse_addresses(text):
+ """
+ Returns a list of all addresses found in the given string, as tuples in the
+ format (address, city).
+ """
+ # This assumes the last parenthetical grouping in ADDRESSES_RE is the city.
+ return [(''.join(bits[:-1]), bits[-1]) for bits in ADDRESSES_RE.findall(text)]
+
+def tag_addresses(text, pre='<addr>', post='</addr>'):
+ """
+ "Tags" any addresses in the given string by surrounding them with pre and post.
+ Returns the resulting string.
+
+ Note that only the addresses are tagged, not the cities (if cities exist).
+ """
+ def _re_handle_address(m):
+ bits = m.groups()
+        return pre + ''.join(filter(None, bits[:-1])) + post + (bits[-1] and (', %s' % bits[-1]) or '')
+ return ADDRESSES_RE.sub(_re_handle_address, text)
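
A doctest-style sketch of the public entry point (the sentence is made up; the tuple format follows the grouping described above):

    from ebdata.nlp.addresses import parse_addresses

    parse_addresses('Crews responded to a fire at 123 Main St. in Springfield.')
    # -> [('123 Main St.', 'Springfield')]
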
30 ebdata/ebdata/nlp/datelines.py
@@ -0,0 +1,30 @@
+import re
+
+dateline_re = re.compile(ur"""
+ (?:
+ (?: # Either a newline, or a
+ ^ # <p> / <div>, followed by tags/space
+ |
+ </?\s*(?:[Pp]|[Dd][Ii][Vv])[^>]*>
+ )
+ (?:<[^>]*>|\s)* # The start of a line
+ )
+ (?:\(\d\d?-\d\d?\)\s+\d\d?:\d\d\s+[PMCE][SD]T\s+)? # Optional timestamp -- e.g., "(07-17) 13:09 PDT"
+ ([A-Z][A-Z.]*[A-Z.,](?:\s+[A-Z][A-Za-z.]*[A-Za-z.,]){0,4}) # The dateline itself
+ (?: # Optional parenthetical news outlet
+ \s+
+ \(
+ [-A-Za-z0-9]{1,15}
+ (?:\s+[-A-Za-z0-9]{1,15}){0,4}
+ \)
+ )?
+ \s* # Optional space before dash
+ (?:\xa0--\xa0|--|\x97|\u2015|&\#8213;|&\#151;|&\#x97;|) # Dash (or emdash)
+ """, re.MULTILINE | re.VERBOSE)
+
+def guess_datelines(text):
+ """
+ Given some text (with or without HTML), returns a list of the dateline(s)
+ in it. Returns an empty list if none are found.
+ """
+ return dateline_re.findall(text)
51 ebdata/ebdata/nlp/places.py
@@ -0,0 +1,51 @@
+import re
+from ebpub.db.models import Location
+from ebpub.streets.models import Place, Misspelling
+
+def phrase_tagger(phrases, pre='<span>', post='</span>'):
+ # Sort the phrases and then reverse them so, for example, Lake View East
+ # will come before Lake View in the regex, and will match more greedily.
+ # Use the decorate-sort-undecorate pattern and then list.reverse() to
+ # avoid the overhead of calling a custom comparison function.
+ decorated = [(len(p), p) for p in phrases]
+ decorated.sort()
+ decorated.reverse()
+ phrases = [i[1] for i in decorated]
+ # use a closure here to cache the value for phrases
+ def tag_phrases(text):
+ """
+ Returns text with any matches from phrases wrapped with pre and post.
+ """
+ # If no phrases were provided, just return the text we received.
+ if len(phrases) == 0:
+ return text
+
+ def _re_handle_match(m):
+ output = (m.group(1) or '') + m.group(2) + (m.group(3) or '')
+ if m.group(1) and m.group(3):
+ return output
+ return pre + output + post
+ phrases_re = '|'.join([r'\b%s\b' % re.escape(p) for p in phrases])
+
+ # In addition to identifying every phrase, this regex grabs the "pre"
+ # and "post" before the phrase, optionally. Then the _re_handle_match()
+ # function checks whether the "pre" and "post" were provided. If both
+ # were found, that means this phrase was already tagged (perhaps by
+ # tag_addresses(), and thus the new tags aren't inserted. Note that
+ # this assumes that each tagging of the text (whether it's
+ # tag_addresses(), place_tagger() or location_tagger()) uses a
+ # consistent "pre" and "post".
+ return re.sub('(?i)(%s[^<]*)?(%s)([^<]*%s)?' % \
+ (re.escape(pre), phrases_re, re.escape(post)), _re_handle_match, text)
+
+ return tag_phrases
+
+def place_tagger(pre='<addr>', post='</addr>'):
+ phrases = [p['pretty_name'] for p in Place.objects.values('pretty_name').order_by('-pretty_name')]
+ return phrase_tagger(phrases, pre, post)
+
+def location_tagger(pre='<addr>', post='</addr>'):
+ location_qs = Location.objects.values('name').order_by('-name').exclude(location_type__slug__in=('boroughs', 'cities'))
+ locations = [p['name'] for p in location_qs]
+ misspellings = [m['incorrect'] for m in Misspelling.objects.values('incorrect').order_by('-incorrect')]
+ return phrase_tagger(locations + misspellings, pre, post)
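
A rough sketch of phrase_tagger() on its own, with hypothetical phrases; place_tagger() and location_tagger() just feed it database values, so they need a configured Django environment, as does importing this module at all:

    from ebdata.nlp.places import phrase_tagger

    tag = phrase_tagger(['Lake View', 'Lake View East'], '<span>', '</span>')
    tag('A stabbing in Lake View East last night')
    # -> 'A stabbing in <span>Lake View East</span> last night'
    # The longer phrase wins because phrases are sorted longest-first, and
    # text already wrapped in the pre/post markers is left untouched.
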
0  ebdata/ebdata/nlp/tests/__init__.py
No changes.
153 ebdata/ebdata/nlp/tests/datelines.py
@@ -0,0 +1,153 @@
+from ebdata.nlp.datelines import guess_datelines
+import unittest
+
+class DatelineTestCase(unittest.TestCase):
+ def assertDatelines(self, text, expected):
+ self.assertEqual(guess_datelines(text), expected)
+
+ def test_basic1(self):
+ self.assertDatelines('CHICAGO -- Something happened', ['CHICAGO'])
+
+ def test_basic2(self):
+ self.assertDatelines('CHICAGO-- Something happened', ['CHICAGO'])
+
+ def test_basic3(self):
+ self.assertDatelines('CHICAGO --Something happened', ['CHICAGO'])
+
+ def test_basic4(self):
+ self.assertDatelines('CHICAGO--Something happened', ['CHICAGO'])
+
+ def test_lowercase1(self):
+ self.assertDatelines('chicago -- Something happened', [])
+
+ def test_lowercase2(self):
+ self.assertDatelines('That was in Chicago -- where something happened', [])
+
+ def test_emdash1(self):
+ self.assertDatelines('CHICAGO\x97Something happened', ['CHICAGO'])
+
+ def test_emdash2(self):
+ self.assertDatelines('CHICAGO \x97Something happened', ['CHICAGO'])
+
+ def test_emdash3(self):
+ self.assertDatelines('CHICAGO \x97Something happened', ['CHICAGO'])
+
+ def test_emdash4(self):
+ self.assertDatelines(u'CHICAGO \u2015 Something happened', ['CHICAGO'])
+
+ def test_emdash5(self):
+ self.assertDatelines(u'CHICAGO\xa0--\xa0Something happened', ['CHICAGO'])
+
+ def test_emdash6(self):
+ self.assertDatelines(u'CHICAGO \xa0--\xa0 Something happened', ['CHICAGO'])
+
+ def test_html_entity_dash1(self):