### Scraper file
I'm creating this file to scrape all the policies from the PolicyStat URL.

In [105]:
#!/usr/bin/env python3
import csv
import os
import requests
import logging
import sys
import urllib.parse
import pandas as pd

In [106]:
def extract_policy_ids(csv_file):
    """Return every non-empty value of the ``PolicyStat Id`` column as a string.

    The report is read as a UTF-16, tab-separated file; malformed lines are
    skipped rather than aborting the whole read.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A list of policy-id strings, stripped of surrounding whitespace, in
        file order. Rows whose id cell is empty or whitespace-only are omitted.

    Raises:
        ValueError: If the file has no ``PolicyStat Id`` column.
    """
    id_column = "PolicyStat Id"

    # dtype=str keeps the ids exactly as written. Without it, a column that
    # contains any empty cell is parsed as float and ids come back as
    # "123.0" (and the empty cells as the literal string "nan").
    df = pd.read_csv(
        csv_file,
        encoding="utf-16",
        sep="\t",
        on_bad_lines="skip",
        dtype=str,
    )

    required_columns = {id_column}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"CSV file must contain the columns: {required_columns}")

    # dropna() removes the empty cells; the final filter removes cells that
    # contained only whitespace.
    stripped = (str(value).strip() for value in df[id_column].dropna())
    return [pid for pid in stripped if pid]

In [97]:
# Load the policy ids from the exported report; any failure is logged with
# its traceback and the run is stopped with exit status 1.
csv_file = "report.csv"
try:
    policy_ids = extract_policy_ids(csv_file)
    if not policy_ids:
        logging.warning("No Policy IDs were found in %s", csv_file)
    else:
        logging.info("Fetched %d Policy IDs successfully!", len(policy_ids))
        print(policy_ids)
except Exception:
    # logging.exception records the active traceback, so the exception
    # object itself does not need to be bound.
    logging.exception("Error during processing the file: %s", csv_file)
    sys.exit(1)

['10719972', '11114895', '16071276', '6661456', '17347260', '15433540', '15968969', '16173563', '11017158', '14477886', '16708773', '15008510', '11206798', '6590975', '9438271', '9438566', '9438638', '9542392', '13175243', '16896347', '6656914', '14569458', '16119937', '17551306', '9438621', '6682308', '9945966', '17531380', '14463852', '10867130', '13813878', '9540581', '6849253', '6686793', '16054531', '16104447', '16051928', '15423112', '15644796', '16886692', '15555805', '16910284', '17478926', '16708473', '17691026', '16707369', '13647263', '16909281', '15093258', '16412707', '15555732', '16405015', '16132210', '15555649', '16404512', '16447325', '14568297', '15451137', '12796348', '6661915', '6943430', '6943726', '6905976', '6661864', '6590368', '13576706', '6661908', '6905846', '6943480', '15263806', '6589267', '6589500', '6590657', '6590661', '6590655', '6590654', '6664551', '6664488', '6664625', '6592182', '6588751', '6695976', '6589702', '6695947', '6661957', '6589312', '6587

In [110]:
# Read the session secrets and the endpoint from environment variables so
# they do not have to be written into the notebook. Each value is None when
# its variable is not set.
COOKIE_VALUE = os.getenv("POLICYSTAT_COOKIE")
CSRF_TOKEN = os.getenv("POLICYSTAT_CSRF_TOKEN")
GRAPHQL_URL = os.getenv("GRAPHQL_URL")

print(GRAPHQL_URL)

None


In [1]:
# Hard-coded session values sent with every request ("<>" presumably marks
# redacted secrets -- fill in before running).
# Adjacent string literals are joined with nothing in between, so the "; "
# that separates the two cookie pairs has to be written out explicitly;
# without it the header would carry one malformed pair.
COOKIE_VALUE = (
    "policystat_sessionid=<>; "
    "policystat_csrftoken=<>"
)
CSRF_TOKEN = "<>"

GRAPHQL_URL = "https://calstate.policystat.com/graphql/"

In [2]:
def build_payload(policy_id):
    """Build the JSON body of the GraphQL request for one policy.

    Args:
        policy_id: The policy identifier; it is converted with ``int()``, so
            a numeric string or an integer is accepted.

    Returns:
        A dict with the fixed ``query_hash`` and a ``variables.input`` mapping
        holding the document key and the fixed print options.
    """
    print_options = dict(
        documentPk=int(policy_id),
        watermark=True,
        showHistory=False,
        showApplicability=True,
        draft=False,
        showChanges=False,
        includePastApprovals=False,
    )
    request_body = {"variables": {"input": print_options}}
    request_body["query_hash"] = "f735ccac9857809e0190b1de2b3495da"
    return request_body

In [100]:
def get_pdf_url(policy_id, timeout=30):
    """Ask the GraphQL endpoint for the PDF URL of one policy.

    Posts the payload from ``build_payload`` to ``GRAPHQL_URL`` with the
    session cookie and CSRF token, then reads ``data.printDocument.url`` from
    the JSON response.

    Args:
        policy_id: Policy identifier (numeric string or int).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the run forever.

    Returns:
        The URL string from the response, or ``None`` when the id is not
        numeric, the request fails, or the response does not have the
        expected shape. Failures are reported on stdout, never raised.
    """
    # A non-numeric id (e.g. "nan" from an empty CSV cell) must not abort the
    # caller's loop, so it is reported like any other per-policy failure.
    try:
        payload = build_payload(policy_id)
    except (TypeError, ValueError) as e:
        print(f"[ERROR] Invalid policy id {policy_id!r}: {e}")
        return None

    headers = {
        "Accept": "*/*",
        "Content-Type": "application/json",
        "Cookie": COOKIE_VALUE,
        "X-CSRFToken": CSRF_TOKEN,
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        ),
        "Referer": "https://calstate.policystat.com/",
    }

    try:
        response = requests.post(
            GRAPHQL_URL, json=payload, headers=headers, timeout=timeout
        )
        if not response.ok:
            # Show the server's error body before raising; it usually says
            # why the request was rejected.
            print(f"[DEBUG] {response.status_code} response: {response.text}")
            response.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] GraphQL request failed for policy {policy_id}: {e}")
        return None

    try:
        data = response.json()
        return data["data"]["printDocument"]["url"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: a level of the
        # expected structure is missing or null.
        print(f"[ERROR] JSON parsing failed for policy {policy_id}: {e}")
        return None

In [104]:
def download_pdf(pdf_url, output_folder="Policies", timeout=60):
    """Download the PDF at ``pdf_url`` into ``output_folder``.

    The file name is taken from the last path component of the URL
    (percent-decoded), with ``.pdf`` appended when it is missing.

    Args:
        pdf_url: URL returned in the GraphQL response.
        output_folder: Directory to save into; created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The name of the saved file (not the full path), or ``None`` when the
        download or the write fails. Failures are reported on stdout.
    """
    os.makedirs(output_folder, exist_ok=True)

    headers = {
        "Cookie": COOKIE_VALUE,
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        ),
    }

    # Decode first, then take the basename: decoding afterwards would let an
    # encoded "/" (%2F) put directory components into the file name.
    url_path = urllib.parse.unquote(urllib.parse.urlparse(pdf_url).path)
    filename = os.path.basename(url_path) or "document"
    if not filename.lower().endswith(".pdf"):
        filename += ".pdf"
    filepath = os.path.join(output_folder, filename)

    try:
        # The context manager closes the streamed response on every path,
        # including when raise_for_status() fails before the body is read.
        with requests.get(
            pdf_url, headers=headers, stream=True, timeout=timeout
        ) as r:
            r.raise_for_status()
            content = r.content
    except requests.RequestException as e:
        print(f"[ERROR] Could not download PDF from {pdf_url}: {e}")
        return None

    try:
        with open(filepath, "wb") as f:
            f.write(content)
    except OSError as e:
        # e.g. a name too long for the file system; skip this file instead of
        # aborting the caller's loop.
        print(f"[ERROR] Could not save PDF to {filepath}: {e}")
        return None
    return filename

In [103]:
# Fetch and save the PDF of every policy id; a failure for one policy is
# reported and the loop moves on to the next.
for pid in policy_ids:
    print(f"\n[INFO] Fetching PDF for policy {pid} ...")

    pdf_url = get_pdf_url(pid)
    if not pdf_url:
        print(f"[WARNING] No PDF URL returned. Skipping {pid}.")
        continue

    local_file = download_pdf(pdf_url)
    if not local_file:
        print(f"[WARNING] Download failed for {pid}")
        continue

    print(f"[INFO] Downloaded {local_file}")

print("\n[DONE] All policies processed.")


[INFO] Fetching PDF for policy 10719972 ...
[INFO] Downloaded 2021 - 2022 Emergency Grant Allocation.pdf

[INFO] Fetching PDF for policy 11114895 ...
[INFO] Downloaded 2023-24 Impacted Campuses and Programs Policy Procedures.pdf

[INFO] Fetching PDF for policy 16071276 ...
[INFO] Downloaded 2024-25 Academic and Course Reporting Schedule.pdf

[INFO] Fetching PDF for policy 6661456 ...
[INFO] Downloaded Academic Access- Enhancement and Excellence -A2E2- Fee- California State University- East Bay.pdf

[INFO] Fetching PDF for policy 17347260 ...
[INFO] Downloaded Academic Freedom Policy.pdf

[INFO] Fetching PDF for policy 15433540 ...
[INFO] Downloaded Academic Preparation and Placement in First-Year General Education Written Communication and Mathematics-Quantitative Reasoning Courses.pdf

[INFO] Fetching PDF for policy 15968969 ...
[INFO] Downloaded Academic Program Discontinuation.pdf

[INFO] Fetching PDF for policy 16173563 ...
[INFO] Downloaded Accessible Technology Initiative Policy

In [1]:
%%bash
# Recursively archive everything matched by ./* in the working directory into
# notebook_content.zip. Needs the `zip` command-line tool on PATH; the cell
# exits with status 127 ("command not found") when it is not installed.
zip -r notebook_content.zip ./*

bash: line 1: zip: command not found


CalledProcessError: Command 'b'zip -r notebook_content.zip ./*\n'' returned non-zero exit status 127.