In [None]:
# import libraries
import pandas as pd
from googleapiclient.discovery import build
import csv
import math
import time

In [None]:
# read in API key and case ID
# Each credential lives in its own text file; `with` closes the handle as soon
# as the value is read, and strip() drops the trailing newline/whitespace.
with open('api_key.txt', 'r') as key_file:
    api_key = key_file.read().strip()
with open('case_id.txt', 'r') as case_file:
    case_id = case_file.read().strip()

In [None]:
# a function to search keywords on city websites
def google(search_term, api_key, case_id):
    """
    Use Google Custom Search API to collect search results.

    The first page is always requested. If Google's estimate indicates more
    than one page of results, further pages are requested until the estimate
    is exhausted, the page cap is reached, or a page comes back empty.

    Args:
        search_term: search string. The maximum length is 2048 characters.
        api_key: api key.
        case_id: case_id (sent to the API as the ``cx`` parameter).
    Returns:
        est_total_num: the number of search results estimated by Google
            (read from the first page's response).
        len(linkl): the number of actual search results.
        titlel: the title of each returned search result ("" if absent).
        linkl: the link of each returned search result.
        snippetl: the snippet of each returned search result ("" if absent).
    """
    # `num` is never passed to the API, so paging assumes the API's default
    # page size of 10 results.
    page_size = 10
    max_pages = 10  # max return from Google

    service = build("customsearch", "v1", developerKey=api_key)
    titlel = []
    linkl = []
    snippetl = []

    def collect(response):
        """Append one page of results to the output lists; return how many were added."""
        # A response can lack "items" even when totalResults is non-zero
        # (the total is only an estimate), so never index it directly.
        items = response.get("items", [])
        for item in items:
            # title/snippet are treated as optional so one sparse result
            # cannot abort the whole search; the three lists stay aligned.
            titlel.append(item.get("title", ""))
            linkl.append(item["link"])
            snippetl.append(item.get("snippet", ""))
        return len(items)

    result = service.cse().list(q=search_term, cx=case_id).execute()
    est_total_num = int(result["searchInformation"]["totalResults"])
    if est_total_num == 0:
        return 0, len(linkl), titlel, linkl, snippetl

    collect(result)

    total_page = min(math.ceil(est_total_num / page_size), max_pages)
    for page in range(1, total_page):
        start = page * page_size + 1  # 1-based index of the page's first result
        more_result = service.cse().list(q=search_term, cx=case_id, start=start).execute()
        new_total_num = int(more_result["searchInformation"]["totalResults"])
        # Stop as soon as Google reports nothing more or the page is empty;
        # later pages would be empty as well.
        if new_total_num == 0 or collect(more_result) == 0:
            break

    return est_total_num, len(linkl), titlel, linkl, snippetl

In [None]:
# load the table holding the web addresses of all cities
url = pd.read_csv(filepath_or_buffer="data/allurl20220929.csv")

In [None]:
# keep only the cities that have a website, renumbering the rows from 0
source = url.dropna(subset=["url"]).reset_index(drop=True)

In [None]:
# place name with its spaces removed, used for the "smart<name>" search keyword
source["BASENAME_nows"] = source["BASENAME"].str.replace(' ', '')

In [None]:
# preview the first rows of the prepared table
source.head(n=5)

In [None]:
# define the output file name
# NOTE: the search loop opens this file in append mode ("a"), so rows from an
# earlier run remain in it; remove the file first if a fresh output is wanted.
filename = "data/smart_url20221005.csv"

In [None]:
# conduct the search and append one row per city to the csv file

# Part of the query that is identical for every city (hoisted out of the loop).
generic_terms = (
    '"smart city" OR "smartcity" OR "smart town" OR "smarttown" OR '
    '"smart village" OR "smartvillage" OR "smart borough" OR "smartborough" OR '
    '"smart county" OR "smartcounty" OR "smart municipality" OR "smartmunicipality"'
)

# Open the output once instead of once per row, so the handle is closed
# deterministically. newline="" is what the csv module asks for to avoid blank
# lines between rows on Windows, and utf-8 keeps non-ASCII titles/snippets
# writable whatever the platform's default encoding is.
with open(filename, "a", newline="", encoding="utf-8") as out_file:
    writer = csv.writer(out_file)
    cities = zip(source.GISJOIN, source.BASENAME, source.BASENAME_nows, source.url)
    for i, (gisjoin, basename, basename_nows, site_url) in enumerate(cities):
        # generic keywords + the city-specific "smart <name>" variants,
        # restricted to the city's own website
        search_term = f'{generic_terms} OR "smart {basename}" OR "smart{basename_nows}" site:' + site_url
        est_total_num, total_num, titlel, linkl, snippetl = google(search_term, api_key, case_id)
        writer.writerow([gisjoin, est_total_num, total_num, titlel, linkl, snippetl])
        out_file.flush()  # persist the row immediately so an interrupted run keeps its progress
        time.sleep(1)  # pause between cities (presumably to stay under the API rate limit)
        print(i, gisjoin, "done")