In [1]:
# Import packages
import math
import json
import re
import bs4
import csv
import requests

This notebook scrapes the full public records request (PRR) archive for every city in our sample that has a [Next Request](https://www.nextrequest.com/) online public records request portal. These portals publish the full archive of public records requests but do not offer an option to export the data from the website. Accordingly, we scrape the archive from each portal page by page and write the results to one CSV file per city. Note that the code scrapes the full archive as it exists on the date it is run, starting with the most recent requests.

In [None]:
# Scrape data from Albuquerque Next Request portal which has different url/data format than others
#
# For each city in `cities` this writes '<city>.csv'. Summary columns come from
# the paginated request listing; the request text and the create/close dates
# come from each request's detail page (one extra HTTP request per row).

cities = ['Albuquerque']

for city in cities:
    fn = '{}.csv'.format(city)
    # newline='' is what the csv module requires for files handed to a writer
    # (otherwise blank lines appear on Windows); the explicit encoding keeps
    # non-ASCII request text from failing under a non-UTF-8 platform default.
    with open(fn, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Reference No', 'Request Status', 'requester', 'request', 'departments', 'cost', 'PoC', 'Create Date', 'Close Date']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=',')
        writer.writeheader()

        base_url = 'https://nextrequest.cabq.gov/requests'
        # Assumes the portal lists 25 requests per listing page -- TODO confirm.
        records_per_page = 25

        # The first listing page also carries the total number of requests.
        listing_resp = requests.get(base_url)
        listing_soup = bs4.BeautifulSoup(listing_resp.text.encode('utf8'), "html5lib")

        num_rec = int(listing_soup.find_all('h2')[1].span.get_text().strip())
        print(num_rec)
        num_page = math.ceil(num_rec / records_per_page)
        print(num_page)

        for page in range(1, num_page + 1):
            if page > 1:
                # Page 1 is already loaded above; later pages are fetched on
                # demand so nothing is requested past the final page.
                page_url = 'https://nextrequest.cabq.gov/requests?requests_smart_listing[page]={}'.format(page)
                listing_resp = requests.get(page_url)
                listing_soup = bs4.BeautifulSoup(listing_resp.text.encode('utf8'), "html5lib")

            # NOTE(review): the leading space in the class string is kept from
            # the original; confirm it still matches with the installed bs4.
            rows = listing_soup.tbody.find_all('tr', class_=" demo-data-false")
            for prr in rows:
                # Cells are read at odd child indices; the even ones are
                # presumably whitespace text nodes between <td>s -- TODO confirm.
                cl = list(prr.children)

                # get information from main page
                req_id = cl[1].a.strong.contents[0]
                status = cl[3]['class'][1]  # second CSS class of the status cell
                requester = cl[5].get_text().strip()
                departments = cl[9].get_text().strip()
                cost = cl[11].get_text().strip().strip('$')
                poc = cl[13].get_text().strip()

                # get information from request detail page (parsed into its own
                # soup so the listing page is never clobbered mid-iteration)
                url_det = base_url + "/" + req_id
                detail_resp = requests.get(url_det)
                detail_soup = bs4.BeautifulSoup(detail_resp.text.encode('utf8'), "html5lib")

                text_nodes = detail_soup.find_all(id='request-text')
                if text_nodes:
                    request = text_nodes[0].get_text().strip()
                else:
                    request = 'error'
                    print(req_id)

                # First non-empty line of the request_date paragraph; a missing
                # element or empty text is recorded as 'error' instead of
                # aborting the whole scrape.
                date_nodes = detail_soup.find_all('p', class_="request_date")
                try:
                    create_date = re.findall(r'([^\n]+)', date_nodes[0].get_text().strip())[0]
                except IndexError:
                    create_date = 'error'
                    print(req_id)

                time_nodes = detail_soup.find_all('span', class_='time-quotes')
                if status == 'closed' and time_nodes:
                    close_date = time_nodes[0].get_text().strip()
                else:
                    close_date = 'NaN'

                # write to csv
                writer.writerow({'Reference No': req_id,
                                 'Request Status': status,
                                 'requester': requester,
                                 'request': request,
                                 'departments': departments,
                                 'cost': cost,
                                 'PoC': poc,
                                 'Create Date': create_date,
                                 'Close Date': close_date})

In [None]:
# Scrape data from Next Request portals from list of cities given below
#
# For each city subdomain this writes '<city>.csv'. Summary columns come from
# the paginated request listing; the request text and the create/close dates
# come from each request's detail page (one extra HTTP request per row).

cities = ['bainbridgewa', 'cityoflascruces', 'mercerisland', 'miami', 'middleboroughma', 'nola', 'oaklandca',
          'providenceri', 'sanfrancisco', 'vallejo', 'westsacramento']

for city in cities:
    print(city)
    fn = '{}.csv'.format(city)
    # newline='' is what the csv module requires for files handed to a writer
    # (otherwise blank lines appear on Windows); the explicit encoding keeps
    # non-ASCII request text from failing under a non-UTF-8 platform default.
    with open(fn, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Reference No', 'Request Status', 'requester', 'request', 'departments', 'PoC', 'Create Date', 'Close Date']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=',')
        writer.writeheader()

        url = 'https://{}.nextrequest.com/requests'.format(city)
        print(url)
        # Assumes the portal lists 25 requests per listing page -- TODO confirm.
        records_per_page = 25

        # The first listing page also carries the total number of requests.
        listing_resp = requests.get(url)
        listing_soup = bs4.BeautifulSoup(listing_resp.text.encode('utf8'), "html5lib")

        num_rec = int(listing_soup.find_all('h2')[1].span.get_text().strip())
        print(num_rec)
        num_page = math.ceil(num_rec / records_per_page)
        print(num_page)

        for page in range(1, num_page + 1):
            if page > 1:
                # Page 1 is already loaded above; later pages are fetched on
                # demand so nothing is requested past the final page.
                url_2 = url + '?requests_smart_listing[page]={}'.format(page)
                listing_resp = requests.get(url_2)
                listing_soup = bs4.BeautifulSoup(listing_resp.text.encode('utf8'), "html5lib")

            # NOTE(review): the leading space in the class string is kept from
            # the original; confirm it still matches with the installed bs4.
            rows = listing_soup.tbody.find_all('tr', class_=" demo-data-false")
            for prr in rows:
                # Cells are read at odd child indices; the even ones are
                # presumably whitespace text nodes between <td>s -- TODO confirm.
                cl = list(prr.children)

                # get information from main page
                req_id = cl[1].a.strong.contents[0]
                status = cl[3]['class'][1]  # second CSS class of the status cell
                requester = cl[5].get_text().strip()
                departments = cl[9].get_text().strip()
                poc = cl[11].get_text().strip()

                # get information from request detail page (parsed into its own
                # soup so the listing page is never clobbered mid-iteration)
                url_det = url + "/" + req_id
                detail_resp = requests.get(url_det)
                detail_soup = bs4.BeautifulSoup(detail_resp.text.encode('utf8'), "html5lib")

                # Every per-row field is assigned on every path below, so a
                # value from a previous row can never leak into this one.
                text_nodes = detail_soup.find_all(id='request-text')
                if text_nodes:
                    request = text_nodes[0].get_text().strip()
                else:
                    request = 'error'
                    print(url_det)

                # First non-empty line of the request_date paragraph; a missing
                # element or empty text is recorded as 'error' instead of
                # aborting the whole scrape.
                date_nodes = detail_soup.find_all('p', class_="request_date")
                try:
                    create_date = re.findall(r'([^\n]+)', date_nodes[0].get_text().strip())[0]
                except IndexError:
                    create_date = 'error'
                    print(url_det)

                # Open requests (or closed ones without a timestamp element)
                # get the same 'NaN' placeholder the Albuquerque scrape uses.
                time_nodes = detail_soup.find_all('span', class_='time-quotes')
                if status == 'closed' and time_nodes:
                    close_date = time_nodes[0].get_text().strip()
                else:
                    close_date = 'NaN'

                # write to csv
                writer.writerow({'Reference No': req_id,
                                 'Request Status': status,
                                 'requester': requester,
                                 'request': request,
                                 'departments': departments,
                                 'PoC': poc,
                                 'Create Date': create_date,
                                 'Close Date': close_date})