In [None]:
import csv
import datetime
import math
import os
import time

import pandas as pd
from googleapiclient.discovery import build
from IPython.display import HTML
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation

In [None]:
# Read the API key and the search-engine ("case") ID from local text files.
# `with` closes each file handle right after reading instead of leaving it open
# for the lifetime of the kernel; surrounding whitespace/newlines are stripped.
with open('api_key.txt', 'r') as key_file:
    api_key = key_file.read().strip()
with open('case_id.txt', 'r') as case_file:
    case_id = case_file.read().strip()

In [None]:
def _append_items(items, titlel, linkl, snippetl):
    """Append the title, link and snippet of every result item to the three parallel lists."""
    for item in items:
        # A result may lack a field (e.g. no snippet); fall back to "" so the
        # three lists always stay the same length and can be zipped safely.
        titlel.append(item.get("title", ""))
        linkl.append(item.get("link", ""))
        snippetl.append(item.get("snippet", ""))


def google(search_term, api_key, case_id):
    """
    Use Google Custom Search API to collect search results.

    The first request fetches page one; further pages are requested 10 results
    at a time (start=11, 21, ...) up to a maximum of 10 pages in total.
    Paging stops early as soon as a page reports zero total results or carries
    no "items".

    Args:
        search_term: search string. The maximum length is 2048 characters.
        api_key: api key.
        case_id: case_id (passed to the API as the `cx` search-engine id).
    Returns:
        titlel: the title of each returned search result.
        linkl: the link of each returned search result.
        snippetl: the snippet of each returned search result.
        The three lists are index-aligned; a field missing from a result is
        stored as an empty string.
    """

    service = build("customsearch", "v1", developerKey=api_key)
    result = service.cse().list(q=search_term, cx=case_id).execute()
    est_total_num = int(result["searchInformation"]["totalResults"])

    titlel = []
    linkl = []
    snippetl = []

    # The response may omit "items" altogether; treat that like "no results"
    # instead of raising KeyError.
    first_items = result.get("items", [])
    if est_total_num == 0 or not first_items:
        return titlel, linkl, snippetl
    _append_items(first_items, titlel, linkl, snippetl)

    # 10 results per page, never more than 10 pages
    # (presumably because the API serves at most 100 results per query).
    total_page = min(math.ceil(est_total_num / 10), 10)
    for page in range(1, total_page):
        start = page * 10 + 1
        more_result = service.cse().list(q=search_term, cx=case_id, start=start).execute()
        new_total_num = int(more_result["searchInformation"]["totalResults"])
        more_items = more_result.get("items", [])
        # The reported total can differ between pages; stop once a page is empty.
        if new_total_num == 0 or not more_items:
            break
        _append_items(more_items, titlel, linkl, snippetl)

    return titlel, linkl, snippetl

In [None]:
# Website to search. The query built further down appends this URL after
# Google's `site:` operator to restrict results to this site.
source = "https://urban-mobility-observatory.transport.ec.europa.eu/"

In [None]:
# Output CSV path, stamped with today's local date in YYYYMMDD form.
today_str = datetime.date.today().strftime("%Y%m%d")
filename = f"data/euumo{today_str}.csv"

In [None]:
# Query Google for AI-related pages on the target site and append one CSV row per result.
search_term = '"artificial intelligence" OR "AI" site:' + source

# Only the API call is retried. Keeping the file write outside the try block means a
# write error no longer triggers a 60s sleep, a second API query and a second append
# (which could duplicate rows that were already written).
try:
    titlel, linkl, snippetl = google(search_term, api_key, case_id)
except Exception as e:
    print("Error:", e, "— sleeping for 60s then retrying")
    time.sleep(60)
    # retry once; a second failure is allowed to propagate
    titlel, linkl, snippetl = google(search_term, api_key, case_id)

date_str = datetime.datetime.now().strftime("%Y-%m-%d")

# Make sure the output folder exists; open() does not create missing directories.
os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)

# NOTE(review): the file is opened in append mode while ids restart at 1, so running
# this cell twice on the same day yields duplicate rows/ids — confirm whether "w" is intended.
with open(filename, "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    for idx, (title, link, snippet) in enumerate(zip(titlel, linkl, snippetl), start=1):
        id_str = f'euumo{idx:06d}'
        # Stored as a spreadsheet formula so the link is clickable when opened in Excel.
        hyperlink = f'=HYPERLINK("{link}", "{link}")'
        writer.writerow([id_str, date_str, title, snippet, hyperlink])

print("Done")
# Short pause kept from the original cell (presumably API rate limiting).
time.sleep(1)


In [None]:
# The CSV is written without a header row, so the column names are supplied here.
csv_columns = ["id", "search_date", "link_title", "link_des", "link"]
df = pd.read_csv(filename, header=None, names=csv_columns)

In [None]:
# Recover the bare URL from the =HYPERLINK("url", "url") formula stored in the CSV.
# Values that do not match the pattern are kept as they are, so re-running this cell
# on already extracted URLs no longer replaces every link with NaN.
extracted_link = df["link"].str.extract(r'HYPERLINK\("([^"]+)"', expand=False)
df["link"] = extracted_link.fillna(df["link"])
df

In [None]:
# Columns the human coders fill in by hand; each one starts out as an empty string.
new_cols = [
    "inaccessible", "irrelevant",
    "city", "state", "country", "year",
    "use_case", "mode", "motivation", "stakeholder",
    "detail", "note", "other_ref", "coder",
]

# Add every coding column in one step, in the order listed above.
df = df.assign(**dict.fromkeys(new_cols, ""))

In [None]:
# Name of the Excel workbook handed to coders: the CSV path with ".csv" replaced by "_check.xlsx".
filename_check = filename.replace(".csv", "_check.xlsx")
# Write the frame without the pandas index so the first spreadsheet column is "id".
df.to_excel(filename_check, index=False)

In [None]:
# set up the mode coding options
# Re-open the workbook that was just written so a validation rule can be attached to it.
wb = load_workbook(filename_check)
# "Sheet1" is the sheet name pandas' to_excel uses when none is given.
ws = wb["Sheet1"]

In [None]:
# List-type validation rule: a cell it is applied to accepts one of
# road / rail / air / waterborne / cross_modal, or may be left blank.
dv = DataValidation(
    type="list",
    # Inline list formula: the options are comma-separated inside one quoted string.
    formula1='"road,rail,air,waterborne,cross_modal"',
    allow_blank=True
)

In [None]:
# Apply the rule to M2:M10000. With the column order built above (5 CSV columns followed
# by new_cols), column M is "mode"; row 1 holds the header written by to_excel.
# NOTE(review): the range is hard-coded and must be updated if columns are added or reordered.
dv.add("M2:M10000")
# Register the rule on the worksheet so it is stored when the workbook is saved.
ws.add_data_validation(dv)

In [None]:
# Save the workbook, now carrying the validation rule, back over the same file.
wb.save(filename_check)