This notebook uses the Google Custom Search API to find the BoardDocs website of each school district. It works iteratively on the CSV marked as "working", so that each run picks up the school districts that earlier runs did not get to because of errors.

Input:
- `kaggle_school_districts.csv`
- `working_school_districts_with_boarddocs_scraped.csv`

Output:
- `working_school_districts_with_boarddocs_scraped.csv`
- `school_districts_with_boarddocs_scraped.csv`

In [332]:
import os
import random
import time

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables from .env file
load_dotenv()

# Read the Google Custom Search credentials from the environment
google_api_key = os.getenv("GOOGLE_API_KEY")
google_api_key2 = os.getenv("GOOGLE_API_KEY2")  # not required by the check below
google_cse_id = os.getenv("GOOGLE_CSE_ID")

# Fail fast when the primary key or the search engine id is missing
if not google_api_key or not google_cse_id:
    raise ValueError("API Key or CSE ID not found. Ensure they are set in the environment.")

# Pool of usable keys, primary key first. Unset keys are dropped so that picking
# any element of the pool (e.g. with random.choice) can never yield None.
google_api_keys = [key for key in (google_api_key, google_api_key2) if key]


# Pool of browser User-Agent strings (desktop and mobile) for simulating different users.
# Each entry is spelled as adjacent string literals, which Python joins into one string.
user_agents = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/90.0.4430.93 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; Android 10; SM-G975F) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.72 Mobile Safari/537.36"
    ),
    (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.0 Mobile/15E148 Safari/604.1"
    ),
    # Add more user agents as needed
]

# Build a random dotted-quad string
def generate_random_ip():
    """Return four random integers, each in 1..254 inclusive, joined by dots."""
    octets = []
    for _ in range(4):
        octets.append(str(random.randint(1, 254)))
    return ".".join(octets)

# Function to perform Google API search
def google_search(query, user_id=0, timeout=30):
    """Return the link of the top Google Custom Search result for ``query``.

    Args:
        query: Search string sent as the ``q`` parameter.
        user_id: Not used by this function; kept so existing callers keep working.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot block a long batch of searches.

    Returns:
        The ``link`` of the first result item, or ``None`` when the response
        contains no items or the request fails. Failures are printed, not raised.
    """
    # Local import: only needed to redact URL-encoded secrets from error text
    from urllib.parse import quote_plus

    url = "https://customsearch.googleapis.com/customsearch/v1"
    # TODO: switch to random.choice(google_api_keys) once the second API key
    # has been registered; until then always use the first key.
    api_key = google_api_keys[0]

    params = {
        "key": api_key,
        "cx": google_cse_id,
        "q": query,
        "num": 1,  # Fetch only the top result
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        results = response.json()
        # Guard against both a missing and an empty "items" list
        if "items" in results and results["items"]:
            return results["items"][0]["link"]
    except Exception as e:
        # Deliberately best-effort: one failed query must not abort the batch.
        # The error text can embed the request URL, which carries the API key
        # and the search engine id, so strip them (raw and URL-encoded forms)
        # before printing.
        error_string = str(e)
        for secret, placeholder in (
            (api_key, "REDACTED_GOOGLE_API_KEY"),
            (google_cse_id, "REDACTED_GOOGLE_CSE_KEY"),
        ):
            if not secret:
                continue
            for form in (quote_plus(secret), secret):
                error_string = error_string.replace(form, placeholder)
        print(f"Error for query '{query}': {error_string}")
    return None

In [333]:
# Name of the working copy that accumulates results across runs
working_filename = "working_school_districts_with_boarddocs_scraped.csv"

# Create the working copy only on the very first run
if not os.path.exists(working_filename):
    # Built as one method chain so every step returns a fresh frame; assigning a
    # column to a column-subset of the loaded frame triggers SettingWithCopyWarning.
    df = (
        pd.read_csv("kaggle_school_districts.csv")
        # keep only the two columns we need
        [["state", "school_district"]]
        # unique search key: "<district>, <state>"
        .assign(query=lambda frame: frame["school_district"] + ", " + frame["state"])
        # drop the rows with duplicate keys
        .drop_duplicates(subset="query")
        # empty 'url' column, to be filled in by the search
        .assign(url="")
        .astype({"url": "str"})
    )
    # Write the dataframe to a CSV file
    df.to_csv(working_filename, index=False)

In [334]:
# Read the working copy back from disk
df = pd.read_csv(filepath_or_buffer=working_filename)

In [335]:
# Cast every column to the generic object dtype, then display the resulting dtypes
df = df.astype(object)
df.dtypes

state              object
school_district    object
query              object
url                object
dtype: object

In [336]:
# Report the number of rows in the working table
print(f"Total number of all school boards: {len(df)}")

Total number of all school boards: 13079


In [337]:
# Rows whose url is still missing are the ones left to search
remaining_df = df.loc[df["url"].isna()]
print(f"Number of remaining school boards to scrape: {len(remaining_df)}")

Number of remaining school boards to scrape: 11533


In [338]:
# Limit this run to at most 3000 randomly chosen districts. When fewer remain,
# take all of them: DataFrame.sample raises ValueError if n exceeds the row count.
remaining_df = remaining_df.sample(n=min(3000, len(remaining_df)))

In [339]:
# Prepare the queries
queries = [f'{query} "boarddocs"' for query in remaining_df["query"]]

# Perform Google search for each school district with tqdm progress bar
results = []
sleep_flag = True
try:
    for query in tqdm(queries, desc="Searching Google", unit="query"):
        # sleep introduces the lag so that we hit right at the rate limit by Google
        if sleep_flag:
            # 60 seconds per 100 operations
            time.sleep(60/100)
        results.append(google_search(query))
finally:
    # Runs on normal completion and also when the loop is interrupted or fails:
    # pad the searches that never ran with None so the results gathered so far
    # are kept on remaining_df instead of being lost. The interrupt/error itself
    # still propagates after this block.
    results.extend([None] * (len(queries) - len(results)))
    # Add the results to the DataFrame; assign() returns a new frame, which
    # avoids writing into a slice of df.
    remaining_df = remaining_df.assign(url=results)

Searching Google:   3%|▎         | 81/3000 [01:11<42:38,  1.14query/s]


KeyboardInterrupt: 

In [326]:
# show a sample of results (up to 5 rows; sample() raises if asked for more rows than exist)
remaining_df.sample(n=min(5, len(remaining_df)))

Unnamed: 0,state,school_district,query,url
7326,NY,Central Square Central School District,"Central Square Central School District, NY",https://go.boarddocs.com/ny/wilcsd/Board.nsf/P...
2197,IL,Bushnell-Prairie City Community Unit School Di...,Bushnell-Prairie City Community Unit School Di...,https://go.boarddocs.com/id/csd132/Board.nsf/f...
7209,NM,Mora Independent Schools,"Mora Independent Schools, NM",https://go.boarddocs.com/nm/taosgov/Board.nsf/...
7046,NJ,Roselle Park Borough School District,"Roselle Park Borough School District, NJ",https://go.boarddocs.com/nj/rtboe/Board.nsf/Pu...
6738,NJ,Clifton City School District,"Clifton City School District, NJ",https://go.boarddocs.com/nj/clifton/Board.nsf/...


In [327]:
# Discard the rows whose url is still missing after the search
remaining_df = remaining_df.loc[remaining_df["url"].notna()]
print(f"Number of new non-NA results: {len(remaining_df)}")

Number of new non-NA results: 479


In [328]:
# Replace the rows of df whose 'query' also appears in remaining_df:
# drop the old versions, append the rows from remaining_df, and renumber the index
df = pd.concat(
    [df.loc[~df["query"].isin(remaining_df["query"])], remaining_df],
    ignore_index=True,
)

In [329]:
# Fill urls that are still missing in df with the url found for the same 'query'.
# Rows are matched explicitly on the 'query' column: DataFrame.update aligns on
# index labels, and df carries an integer index while a frame indexed by query
# strings shares no labels with it, so update() would never match a row.
url_by_query = remaining_df.drop_duplicates(subset="query").set_index("query")["url"]
# fillna only touches missing urls, so values already present in df are kept
df = df.assign(url=df["url"].fillna(df["query"].map(url_by_query)))

In [330]:
# Persist the merged table to the working CSV, without the index column
df.to_csv(path_or_buf=working_filename, index=False)

In [331]:
# percentage done
total_boards = len(df)
scraped_boards = int(df["url"].notna().sum())
# Guard against an empty table, which would otherwise raise ZeroDivisionError
fraction_done = scraped_boards / total_boards if total_boards else 0.0
percentage_done = str(round(fraction_done * 100, 2)) + '%'
print(f"Percentage of total school boards scraped: {percentage_done}")

Percentage of total school boards scrapped: 11.82%
