# Search keywords on municipal websites

In [16]:
import pandas as pd
from googleapiclient.discovery import build
import csv
import math
import time

The Custom Search JSON API provides 100 search queries per day for free. If you need more, you may sign up for billing in the API Console. Additional requests cost $5 per 1000 queries, up to 10k queries per day. Daily quotas reset at midnight Pacific Time (PT). Note that every page of 10 results counts as one query, so a single call to `search` below can consume up to 10 queries.

In [20]:
# Load the table of US cities and their official website URLs.
UScityURL = pd.read_csv("UScityURL.csv")
# Preview the first rows to confirm the columns loaded as expected.
UScityURL.head()

Unnamed: 0,GISJOIN,city,state,website_available,website_URL
0,G36051000,New York,New York,1,https://www.nyc.gov/
1,G06044000,Los Angeles,California,1,https://www.lacity.org/
2,G17014000,Chicago,Illinois,1,https://www.chicago.gov/
3,G48035000,Houston,Texas,1,http://www.houstontx.gov/
4,G04055000,Phoenix,Arizona,1,https://www.phoenix.gov/


In [21]:
# Keep only the cities flagged as having a website, renumbering rows from 0.
has_website = UScityURL["website_available"] == 1
source = UScityURL.loc[has_website].reset_index(drop=True)

In [25]:
# Read the API credentials from local text files. Context managers make sure
# each file handle is closed as soon as its content has been read.
with open('api_key.txt', 'r') as key_file:
    api_key = key_file.read().strip()
with open('case_id.txt', 'r') as id_file:
    case_id = id_file.read().strip()

In [10]:
def search(search_term, api_key, case_id, service=None):
    """
    Use Google Custom Search API to collect search results.

    The first request returns Google's estimate of the total number of
    results. Further pages of 10 results are then requested until the
    estimate is covered, a page comes back empty, or 10 pages (100 results)
    have been fetched. Every page requested costs one API query.

    Args:
        search_term: search string. The maximum length is 2048 characters.
        api_key: api key. Only used when `service` is not supplied.
        case_id: case_id, passed to the API as the `cx` parameter.
        service: optional, already-built Custom Search client. When omitted
            a new client is built from `api_key`; pass one in to reuse the
            same client across many calls.
    Returns:
        A 4-tuple `(num_results, titles, links, snippets)`:
        num_results: the number of search results actually collected.
        titles: the title of each returned search result.
        links: the link of each returned search result.
        snippets: the snippet of each returned search result; an empty
            string when a result carries no snippet, so the three lists
            always stay aligned.
    """
    results_per_page = 10
    max_pages = 10

    if service is None:
        service = build("customsearch", "v1", developerKey=api_key)

    title_list = []
    link_list = []
    snippet_list = []

    def _collect(response):
        """Append every item of one response page; return how many were added."""
        # A response may omit "items" even when the estimated total is
        # nonzero, so never index it directly.
        items = response.get("items", [])
        for item in items:
            title_list.append(item["title"])
            link_list.append(item["link"])
            # Some results have no snippet; keep the lists aligned.
            snippet_list.append(item.get("snippet", ""))
        return len(items)

    result = service.cse().list(q=search_term, cx=case_id).execute()
    est_total_num = int(result["searchInformation"]["totalResults"])
    if est_total_num == 0 or _collect(result) == 0:
        return len(link_list), title_list, link_list, snippet_list

    total_page = min(math.ceil(est_total_num / results_per_page), max_pages)
    for page in range(1, total_page):
        # `start` is the 1-based index of the first result of the page.
        start = page * results_per_page + 1
        more_result = service.cse().list(
            q=search_term, cx=case_id, start=start
        ).execute()
        new_total_num = int(more_result["searchInformation"]["totalResults"])
        # Stop paging once Google reports nothing more or the page is empty.
        if new_total_num == 0 or _collect(more_result) == 0:
            break

    return len(link_list), title_list, link_list, snippet_list

In [9]:
# CSV file that the search loop appends its result rows to.
output_file = "test.csv"

In [26]:
# Search each selected city's website for the keyword and append one row per
# city to the output CSV. range(3, 4) selects only row 3 of `source`.
# The file is opened once through a context manager so the handle is flushed
# and closed even if a request fails part-way. newline="" is what the csv
# module requires for correct row terminators, and UTF-8 keeps non-ASCII
# titles and snippets writable regardless of the platform default encoding.
with open(output_file, "a", newline="", encoding="utf-8") as out_fh:
    writer = csv.writer(out_fh)
    for i in range(3, 4):
        search_term = '"inequity" site:' + source.website_URL.iloc[i]
        total_num, title, link, snippet = search(search_term, api_key, case_id)
        writer.writerow([source.GISJOIN.iloc[i], total_num, title, link, snippet])
        time.sleep(1)  # brief pause between successive cities

In [28]:
#check output file
df = pd.read_csv(output_file, names=["GISJOIN","total","title","link","snippet"])
df

Unnamed: 0,GISJOIN,total,title,link,snippet
0,G48035000,27,"['Health Disparities Summary 2019', 'Community...",['https://www.houstontx.gov/health/chs/documen...,['To the Residents of Houston/Harris County: T...
