This notebook searches Google for the BoardDocs website of each school district. It works iteratively on the CSV marked "working", so that each run picks up the districts that earlier runs did not get to because of errors.

Input:
- `school_district_list.html`
- `school_districts.csv` (read only to create the working CSV when it does not exist yet)
- `working_school_districts_with_boarddocs_scraped.csv`

Output:
- `working_school_districts_with_boarddocs_scraped.csv`
- `school_districts_with_boarddocs_scraped.csv`

In [213]:
import pandas as pd
import requests
from tqdm import tqdm
from dotenv import load_dotenv
import os

# Pull the variables defined in the .env file into the process environment
load_dotenv()

# Credentials for the Google search API, read from the environment
google_api_key = os.environ.get("GOOGLE_API_KEY")
google_cse_id = os.environ.get("GOOGLE_CSE_ID")

# Fail fast when either credential is unset or empty
if not (google_api_key and google_cse_id):
    raise ValueError("API Key or CSE ID not found. Ensure they are set in the environment.")

# Function to perform Google API search
def google_search(query, timeout=10):
    """Return the URL of the top Google Custom Search result for ``query``.

    Args:
        query: Search string, sent as the ``q`` request parameter.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely, so a single stalled
            request would hang the whole search loop.

    Returns:
        The ``link`` field of the first result item, or ``None`` when the
        response contains no items or the request fails for any reason.
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": google_api_key,
        "cx": google_cse_id,
        "q": query,
        "num": 1,  # Fetch only the top result
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        results = response.json()
        # A missing or empty "items" list means nothing matched: that is a
        # normal "no result", not an error worth reporting.
        items = results["items"] if "items" in results else []
        if items:
            return items[0]["link"]
    except Exception as e:
        # Deliberately best-effort: report the failure and fall through to
        # None so the caller can retry this query on a later run.
        print(f"Error for query '{query}': {e}")
    return None

In [214]:
# CSV that accumulates the scraped URLs across runs
working_filename = "working_school_districts_with_boarddocs_scraped.csv"

# Only when no working CSV exists yet: create it from the plain district list
if not os.path.exists(working_filename):
    df = (
        pd.read_csv("school_districts.csv")
        .assign(url="")            # blank 'url' column, to be filled in by the searches
        .astype({"url": "str"})
    )
    df.to_csv(working_filename, index=False)

In [215]:
# Read the working CSV; this is the frame the following cells inspect and update
df = pd.read_csv(filepath_or_buffer=working_filename)

In [216]:
# Cast every column to the generic object dtype, then display the resulting dtypes
# (presumably so a 'url' column read back as all-missing can hold strings -- TODO confirm)
df = df.astype(object)
df.dtypes

school_district    object
state              object
url                object
dtype: object

In [217]:
# One row per school board, so the row count is the total
print(f"Total number of all school boards: {len(df)}")

Total number of all school boards: 11757


In [218]:
# Rows whose 'url' is still missing are the ones left to search
is_unscraped = df["url"].isna()
remaining_df = df[is_unscraped]
print(f"Number of remaining school boards to scrape: {len(remaining_df)}")

Number of remaining school boards to scrape: 11718


In [219]:
# Trial run: search only a random handful of the remaining districts.
# The sample size is capped at the number of rows available, because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement) -- which would break this cell as soon as
# fewer than 10 districts remain.
# No random_state on purpose: a fixed seed would pick the same districts on
# every run while the set of remaining districts is unchanged.
remaining_df = remaining_df.sample(n=min(10, len(remaining_df)))

In [None]:
# Build one query per remaining district: the district name followed by the
# quoted term "boarddocs"
queries = [f'{district} "boarddocs"' for district in remaining_df["school_district"]]

# Run the searches one by one behind a tqdm progress bar. google_search
# returns None when a query fails or has no result, so `results` always lines
# up one-to-one with `queries`.
results = []
for query in tqdm(queries, desc="Searching Google", unit="query"):
    results.append(google_search(query))

# Attach the results as the 'url' column. assign() returns a new frame instead
# of writing into remaining_df, which was sliced out of df; writing into such a
# slice can trigger pandas' SettingWithCopyWarning.
remaining_df = remaining_df.assign(url=results)

In [221]:
# Discard the districts whose search came back without a URL
has_url = remaining_df["url"].notna()
remaining_df = remaining_df[has_url]
print(f"Number of new non-NA results: {len(remaining_df)}")

Number of new non-NA results: 0


In [222]:
# Replace the rows of df that now have a fresh result in remaining_df.
# Rows are matched on district name AND state. Matching on the name alone
# would also drop any same-named district in another state (names are
# presumably not unique across states -- the frame carries a separate 'state'
# column), and such rows would never be added back.
key_cols = ["school_district", "state"]
found_keys = remaining_df[key_cols].drop_duplicates()

# A left merge against de-duplicated keys keeps df's rows in order and
# one-to-one, so the indicator column lines up with df by position.
flagged = df.merge(found_keys, on=key_cols, how="left", indicator=True)
is_superseded = (flagged["_merge"] == "both").to_numpy()

# Keep the untouched rows and append the freshly scraped ones
df = pd.concat([df[~is_superseded], remaining_df], ignore_index=True)

In [223]:
# Fill still-missing URLs in df from remaining_df, matching rows on the
# ('school_district', 'state') pair.
# DataFrame.update aligns on the index, so updating from
# remaining_df.set_index('school_district') compared df's positional index
# (df was rebuilt with ignore_index=True) against district names: no row could
# match, and duplicate district names could make the alignment raise.
key_cols = ["school_district", "state"]
url_lookup = (
    remaining_df[key_cols + ["url"]]
    .drop_duplicates(subset=key_cols)
    .rename(columns={"url": "new_url"})
)

# A left merge against unique keys keeps df's row order and length, so the
# looked-up values can be written back by position.
new_urls = df[key_cols].merge(url_lookup, on=key_cols, how="left")["new_url"].to_numpy()

# Same intent as update(..., overwrite=False): only fill cells that are missing
needs_url = df["url"].isna().to_numpy()
df.loc[needs_url, "url"] = new_urls[needs_url]

In [224]:
# Persist progress: overwrite the working CSV with the current frame, without the row index
df.to_csv(path_or_buf=working_filename, index=False)

In [225]:
# Share of districts that already have a URL, as a percentage rounded to two decimals
scraped_count = int(df["url"].notna().sum())
total_count = len(df)
percentage_done = f"{round(scraped_count / total_count * 100, 2)}%"
print(f"Percentage of total school boards scrapped: {percentage_done}")

Percentage of total school boards scrapped: 0.33%
