some stuff to knit all of the sitemap stuff together
JoeGermuska committed Jul 2, 2016
1 parent 8ea081f commit c13df90
Showing 3 changed files with 83 additions and 94 deletions.
20 changes: 20 additions & 0 deletions sitemap/build_all.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+"""This would typically be used once after each database update to refresh the sitemap files.
+It has hard-coded the path where the files should be written, which is expected to be
+a checkout of the Census Reporter public webapp in a directory adjacent to this repository.
+It also has the database connect string hardcoded, because it will only get run once or twice a year.
+"""
+from table import write_table_sitemap
+from profile import write_profile_sitemaps
+
+DEFAULT_OUTPUT_DIR = '../../censusreporter/censusreporter/apps/census/static/sitemap/'
+# this connect string uses a non-standard port, as in the case when something is being
+# SSH tunneled from production. Fiddle with this as appropriate.
+DEFAULT_CONNECT_STRING = 'postgresql://census:censuspassword@localhost:5433/census'
+
+def main():
+    write_table_sitemap(DEFAULT_OUTPUT_DIR, DEFAULT_CONNECT_STRING)
+    write_profile_sitemaps(DEFAULT_OUTPUT_DIR, DEFAULT_CONNECT_STRING)
+
+if __name__ == '__main__':
+    main()
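If the webapp checkout or the tunnel port differs from the defaults above, the two writers can simply be called with other values. A minimal sketch, assuming the same module layout (the directory and port here are illustrative, not project defaults):

    # Hypothetical one-off driver; adjust the path and connect string to your setup.
    from table import write_table_sitemap
    from profile import write_profile_sitemaps

    OUTPUT_DIR = '/tmp/sitemap/'  # any writable directory
    CONNECT = 'postgresql://census:censuspassword@localhost:5432/census'  # standard port, no tunnel

    write_table_sitemap(OUTPUT_DIR, CONNECT)
    write_profile_sitemaps(OUTPUT_DIR, CONNECT)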
129 changes: 54 additions & 75 deletions sitemap/profile.py
@@ -1,55 +1,68 @@
 from jinja2 import Environment, FileSystemLoader
 import psycopg2
 import re
 import os.path

+EXCLUDED_SUMMARY_LEVELS = ['250'] # we know that these don't work correctly right now.
+
-def build_all_sitemaps():
+def write_profile_sitemaps(output_dir,db_connect_string='postgresql://census:censuspassword@localhost:5432/census'):
     ''' Builds sitemap XML files for all summary levels. Each XML file contains pages for one
     summary level, with a maximum of 50,000 URLs.
     params: none
     return: none
     '''
-    levels_urls = build_all_page_lists()
-
-    for level in levels_urls:
-        num_urls = len(levels_urls[level])
-
-        # If there are <= 50k URLs, write them immediately
-        if num_urls <= 50000:
-            fname = 'profiles/sitemap_' + level + '.xml'
-            f = open(fname, 'w')
-
-            f.write(build_sitemap(levels_urls[level]))
-
-            print 'Wrote sitemap to file %s' % (fname)
-
-            f.close()
-
-        # Otherwise, split up the URLs into groups of 50,000
-        else:
-            num_files = num_urls / 50000 + 1
-
-            for i in range(num_files):
-                fname = 'profiles/sitemap_' + level + '_' + str(i + 1) + '.xml'
-                f = open(fname, 'w')
-
-                for url in levels_urls[level][i * 50000 : (i + 1) * 50000]:
-                    # Python allows list indexing out of bounds without complaint,
-                    # i.e., if L = [1, 2, 3], then L[2:4] just gives [3]
-                    f.write("%s\n" % url)
-
-                print 'Wrote sitemap to file %s' % (fname)
-
-                f.close()
+    sitemaps_created = []
+    for summary_level in query_all_levels(db_connect_string):
+        if summary_level not in EXCLUDED_SUMMARY_LEVELS:
+            print "querying level {}".format(summary_level)
+            results = query_one_level(summary_level, db_connect_string)
+            urls = []
+            for result in results:
+                (display_name, full_geoid) = result
+                urls.append(build_url(display_name, full_geoid))
+
+            num_urls = len(urls)
+
+            # If there are <= 50k URLs, write them immediately
+            if num_urls <= 50000:
+                filename = 'sitemap_' + summary_level + '.xml'
+                f = open(os.path.join(output_dir,filename), 'w')
+                f.write(build_sitemap(urls))
+                print 'Wrote sitemap to file %s' % (filename)
+                sitemaps_created.append(filename)
+                f.close()
+
+            # Otherwise, split up the URLs into groups of 50,000
+            else:
+                num_files = num_urls / 50000 + 1
+
+                for i in range(num_files):
+                    filename = 'sitemap_' + summary_level + '_' + str(i + 1) + '.xml'
+                    f = open(os.path.join(output_dir,filename), 'w')
+                    f.write(build_sitemap(urls[i * 50000 : (i + 1) * 50000]))
+                    print 'Wrote sitemap to file %s' % (filename)
+                    sitemaps_created.append(filename)
+                    f.close()
+
+    write_master_sitemap(output_dir, sitemaps_created)
+
+
+def write_master_sitemap(output_dir,files):
+    files = files[:]
+    files.extend(['sitemap_tables.xml','topics/sitemap.xml'])
+    urls = ['https://censusreporter.org/{}'.format(f) for f in files]
+
+    with open(os.path.join(output_dir,'sitemap.xml'),'w') as f:
+        f.write(build_sitemap(urls))
+    print 'wrote index sitemap.xml file'


 def build_sitemap(page_data):
     ''' Builds sitemap from template in sitemap.xml using data provided
     in page_data.
     params: page_data = list of page URLs
     returns: XML template with the page URLs
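The split logic above relies on two Python 2 behaviors: num_urls / 50000 is floor division on integers, and slicing past the end of a list is harmless. A standalone sketch of the same grouping pattern, with a chunk size of 3 standing in for 50,000:

    # Demonstrates the grouping arithmetic used in write_profile_sitemaps.
    CHUNK = 3
    urls = ['u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7']

    num_files = len(urls) / CHUNK + 1  # Python 2 floor division: 7 / 3 + 1 = 3
    for i in range(num_files):
        print i + 1, urls[i * CHUNK : (i + 1) * CHUNK]
    # 1 ['u1', 'u2', 'u3']
    # 2 ['u4', 'u5', 'u6']
    # 3 ['u7']

One edge case to note: when the URL count is an exact multiple of 50,000, num_files comes out one too high and the last pass writes a sitemap containing no URLs.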
@@ -58,56 +71,22 @@ def build_sitemap(page_data):

     env = Environment(loader = FileSystemLoader('.'))
     template = env.get_template('sitemap.xml')

     return template.render(pages = page_data)
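build_sitemap renders a Jinja2 template named sitemap.xml from the working directory. The template itself is not part of this diff; a plausible minimal stand-in (an assumption, not the project's actual file) looks like this:

    # Sketch only: approximates what sitemap.xml likely contains.
    from jinja2 import Template

    SITEMAP = Template(u'''<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    {%- for page in pages %}
        <url><loc>{{ page }}</loc></url>
    {%- endfor %}
    </urlset>''')

    print SITEMAP.render(pages=['https://censusreporter.org/profiles/04000US17-illinois/'])

Note that write_master_sitemap reuses this same urlset template for the top-level index file; the sitemap protocol defines a distinct <sitemapindex>/<sitemap> structure for index files, which is worth keeping in mind if crawlers balk at the generated sitemap.xml.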


-def build_all_page_lists():
-    ''' Builds a URL/page list for all sumlevels.
-    params: none
-    return: dict of {level:list of URLs for that level}
-    '''
-
-    levels = query_all_levels()
-    urls = {}
-
-    for level in levels:
-        urls[level] = build_one_page_list(level)
-
-    return urls
-
-
-def build_one_page_list(level):
-    ''' Builds a URL/page list for one sumlevel ("level")
-    params: level = string of the summary level code (e.g., '040')
-    return: list of slugified URLs
-    '''
-
-    results = query_one_level(level)
-    urls = []
-
-    for result in results:
-        urls.append(build_url(result[1], result[2]))
-
-    return urls


-def query_all_levels():
+def query_all_levels(db_connect_string):
     ''' Queries database to get list of all sumlevels
     params: none
     returns: list of all sumlevels (strings)
     '''

-    conn = psycopg2.connect("dbname=census user=census")
+    conn = psycopg2.connect(db_connect_string)
     cur = conn.cursor()

-    q = "SELECT DISTINCT sumlevel FROM tiger2014.census_name_lookup;"
+    q = "SELECT DISTINCT sumlevel FROM tiger2014.census_name_lookup order by sumlevel;"
     cur.execute(q)
     results = cur.fetchall()
     # Format of results is [('000',), ('001',), ...]
@@ -118,19 +97,19 @@ def query_all_levels():
     return results_list


-def query_one_level(level):
+def query_one_level(level,db_connect_string):
     ''' Queries database for one sumlevel ("level")
     params: level = string of the summary level code (e.g., "040")
     return: all results found as a list of tuples
         (sumlevel, display_name, full_geoid)
     '''

-    conn = psycopg2.connect("dbname=census user=census")
+    conn = psycopg2.connect(db_connect_string)
     cur = conn.cursor()

-    q = "SELECT sumlevel, display_name, full_geoid from tiger2014.census_name_lookup where sumlevel = '%s'" % (level)
+    q = "SELECT display_name, full_geoid from tiger2014.census_name_lookup where sumlevel = '%s'" % (level)
     cur.execute(q)
     results = cur.fetchall()
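query_one_level interpolates level directly into the SQL string. The values come from query_all_levels rather than from user input, so this is benign here, but psycopg2 can bind the parameter instead; a drop-in equivalent using the same cursor:

    # Parameterized form: psycopg2 passes the value separately from the SQL text.
    q = "SELECT display_name, full_geoid FROM tiger2014.census_name_lookup WHERE sumlevel = %s"
    cur.execute(q, (level,))
    results = cur.fetchall()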

@@ -162,7 +141,7 @@ def slugify(name):
     (2) converting to lowercase, (3) turning spaces to dashes
     params: name = string to change
     return: slugified string
     '''
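The body of slugify is collapsed in this view, and step (1) of its docstring is cut off above. Going only by the visible description (strip punctuation, lowercase, spaces to dashes), a reconstruction might look like the following sketch; the committed function may differ in detail:

    import re

    def slugify_sketch(name):
        # Hypothetical: drop anything that is not a word character, space, or dash,
        # then lowercase and turn runs of whitespace into single dashes.
        name = re.sub(r'[^\w\s-]', '', name)
        return re.sub(r'\s+', '-', name.strip().lower())

    print slugify_sketch('Cook County, IL')  # cook-county-il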

@@ -177,7 +156,7 @@ def slugify(name):

 def main():
-    build_all_sitemaps()
+    write_profile_sitemaps('.')


 # Some tests
@@ -188,4 +167,4 @@ def main():


 if __name__ == "__main__":
     main()
28 changes: 9 additions & 19 deletions sitemap/table.py
@@ -3,18 +3,17 @@
 import psycopg2
 import os.path

-DEFAULT_OUTPUT_DIR = '../../censusreporter/censusreporter/apps/census/static/sitemap/'
-def write_table_sitemap():
+def write_table_sitemap(output_dir,db_connect_string='postgresql://census:censuspassword@localhost:5432/census'):
     ''' Builds table.xml sitemap file. There are not more than
     50,000 URLs, so we can use one file without issue.
     params: none
     return: none
     '''

-    table_urls = build_table_page_list()
-    fname = os.path.join(DEFAULT_OUTPUT_DIR,'sitemap_tables.xml')
+    table_names = query_table_list(db_connect_string)
+    table_urls = build_table_page_list(table_names)
+    fname = os.path.join(output_dir,'sitemap_tables.xml')
     with open(fname, 'w') as f:
         f.write(build_sitemap(table_urls))
     print 'Wrote table sitemap to file %s' % (fname)
@@ -35,7 +34,7 @@ def build_sitemap(page_data):
     return template.render(pages = page_data)


-def query_table_list():
+def query_table_list(db_connect_string):
     ''' Queries the database for a list of all one-year
     and five-year tables. Removes duplicates from them,
     and returns a set of all table IDs.
@@ -47,7 +46,7 @@ def query_table_list():
     # we oughta parameterize this but feeling lazy...
     # this is assuming that you're running the DB on a non-standard port, say
     # as if you were SSH tunneling from production to your machine
-    conn = psycopg2.connect('postgresql://census:censuspassword@localhost:5433/census')
+    conn = psycopg2.connect(db_connect_string)
     cur = conn.cursor()

     q1 = "SELECT DISTINCT tables_in_one_yr from census_tabulation_metadata;"
@@ -70,15 +69,14 @@ def query_table_list():
     return tables


-def build_table_page_list():
+def build_table_page_list(table_names):
     ''' Builds the URL/pages list for all tables.
     params: none
     return: list of CensusReporter URLs
     '''

-    table_names = query_table_list()
     table_urls = []

     for table in sorted(table_names):
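build_url for tables is collapsed in this view, but Census Reporter's public table pages live at /tables/<table_id>/, so the loop above presumably yields URLs of that shape. A sketch under that assumption (the real build_url may differ):

    # Hypothetical URL builder matching the site's visible pattern.
    def build_url_sketch(table_id):
        return 'https://censusreporter.org/tables/%s/' % table_id

    print build_url_sketch('B01001')  # https://censusreporter.org/tables/B01001/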
@@ -106,16 +104,8 @@ def build_url(table_name):


 def main():
-
-    write_table_sitemap()
+    write_table_sitemap('.')


 if __name__ == "__main__":
-    if os.path.isdir(DEFAULT_OUTPUT_DIR):
-        main()
-    else:
-        print """
-Expecting to run alongside a Census Reporter git checkout.
-Can't find output directory
-{}
-so didn't do anything.""".format(DEFAULT_OUTPUT_DIR)
+    main()
