In [53]:
import re
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from xhtml2pdf import pisa
from io import BytesIO
import os
from PyPDF2 import PdfReader

def getStateAndSchoolCode(boardDocUrl):
    match = re.search(r"boarddocs\.com/([^/]+)/([^/]+)/", boardDocUrl)

    if match:
        state, district = match.groups()
        return state, district

    return "", ""

def getAllMeetings(stateCode, schoolCode):
    url = "https://go.boarddocs.com/" + stateCode.lower() + "/" + schoolCode.lower() + "/Board.nsf/BD-GETMeetingsListForSEO?open"

    response = requests.get(url)

    if response.status_code != 200:
        print(f"Get All Meetings Error: {response.status_code}" + " " + schoolCode)
        return ""
    
    meetingIds = ""
    for meeting in response.json():
        meetingIds += meeting['Unique'] + "~"

    if meetingIds.endswith("~"):
        meetingIds = meetingIds.rstrip("~")

    return meetingIds

def getFilteredMeetings(keyword, meetingIds, stateCode, schoolCode):
    url = "https://go.boarddocs.com/" + stateCode.lower() + "/" + schoolCode.lower() + "/Board.nsf/BD-SearchInContext?open"

    data = {
        "ids": meetingIds,  
        "searchstring": keyword,
    }

    response = requests.post(url, data=data)

    if response.status_code != 200:
        print(f"Get Filtered Meetings Error: {response.status_code}" + " " + schoolCode)
        return []
    
    print(response.text)
    
    root = ET.fromstring(response.text)
    matched_ids = []

    for id in root.findall("unid"):
        if id.text in meetingIds:
            matched_ids.append(id.text)

    return matched_ids

def getMeetingMinutes(meetingId, stateCode, schoolCode):
    url = "https://go.boarddocs.com/" + stateCode.lower() + "/" + schoolCode.lower() + "/Board.nsf/BD-GetMinutes?open"

    data = {
        'id': meetingId,
    }

    response = requests.post(url, data=data)

    if response.status_code != 200:
        print(f"Get Meeting Minutes Error: {response.status_code}" + " " + schoolCode)
        return False

    return response.text

def getMeetingAgenda(meetingId, stateCode, schoolCode):
    url = "https://go.boarddocs.com/" + stateCode.lower() + "/" + schoolCode.lower() + "/Board.nsf/PRINT-AgendaDetailed?open"

    data = {
        'id': meetingId,
    }

    response = requests.post(url, data=data)

    if response.status_code != 200:
        print(f"Get Meeting Minutes Error: {response.status_code}" + " " + schoolCode)
        return False

    return response.text

def checkKeyword(keyword, htmlText):
    soup = BeautifulSoup(htmlText, 'html.parser')
    text_content = soup.get_text()  # Extract only text, excluding HTML tags
    
    return keyword.lower() in text_content.lower()

def checkKeywordPdf(pdfUrl, keyword):
    response = requests.get(pdfUrl)
    if response.status_code == 200:
        with open("temp.pdf", "wb") as temp_file:
            temp_file.write(response.content)
        # Open and read the PDF
        reader = PdfReader("temp.pdf")
        for page in reader.pages:
            if keyword.lower() in page.extract_text().lower():
                return True
    return False

def getAllFilteredDocuments(folderName, keyword, leaid, meetingId, htmlText):
    count = 1

    soup = BeautifulSoup(htmlText, "html.parser")

    folderPath = os.path.join(folderName, leaid)

    os.makedirs(folderPath, exist_ok=True)

    # Find all anchor <a> tags with href (links)
    for link in soup.find_all("a", href=True):
        fileUrl = link["href"]
        
        # Check if it's a downloadable file by looking at the extension
        if fileUrl.lower().endswith((".pdf")):
            fileUrl = "https://go.boarddocs.com/" + fileUrl

            try:
                if checkKeywordPdf(fileUrl, keyword):
                    fileName = os.path.join(folderPath, "Doc_" + meetingId + "_" + str(count) + ".pdf")

                    # Download the file
                    response = requests.get(fileUrl, stream=True)
                    if response.status_code == 200:
                        with open(fileName, "wb") as file:
                            for chunk in response.iter_content(1024):
                                file.write(chunk)
                        print(f"Downloaded: {os.path.basename(fileUrl)} as {fileName}")
                        count += 1
                    else:
                        print(f"Failed to download: {fileUrl}")
            except Exception as e:
                # If you get an AES error or any other exception, print the fileUrl
                print(f"Error with {fileUrl}: {e}")

def htmlToPdf(folderName, leaid, pdfName, htmlText):
    htmlText = htmlText.replace('undefined', 'black')
    htmlText = htmlText.replace('initial', 'black')

    pdfOutput = BytesIO()

    pisa.CreatePDF(htmlText, dest=pdfOutput, encoding='utf-8')

    folderPath = os.path.join(folderName, leaid)

    # Ensure the folder exists
    os.makedirs(folderPath, exist_ok=True)

    # Construct the full file path
    pdfPath = os.path.join(folderPath, pdfName + ".pdf")

    with open(pdfPath, "wb") as f:
        f.write(pdfOutput.getvalue())



In [None]:
import requests
import os
import fitz  # PyMuPDF
from bs4 import BeautifulSoup

def getAllFilteredDocuments(folderName, keyword, leaid, meetingId, htmlText):
    count = 1

    print("in")

    soup = BeautifulSoup(htmlText, "html.parser")

    print(soup)
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")

    folderPath = os.path.join(folderName, leaid)

    os.makedirs(folderPath, exist_ok=True)

    # Find all anchor <a> tags with href (links)
    for link in soup.find_all("a", href=True):
        fileUrl = link["href"]

        print(fileUrl)
        
        # Check if it's a downloadable file by looking at the extension
        if fileUrl.lower().endswith((".pdf", ".docx", ".doc", ".pptx")):
            fileUrl = "https://go.boarddocs.com/" + fileUrl

            print(f"Creating document: {fileUrl}")

            try:
                if fileUrl.lower().endswith((".pdf")):
                    if checkKeywordPdf(fileUrl, keyword):
                        fileName = os.path.join(folderPath, "Doc_" + meetingId + "_" + str(count) + ".pdf")

                        # Download the file
                        response = requests.get(fileUrl, stream=True)
                        if response.status_code == 200:
                            with open(fileName, "wb") as file:
                                for chunk in response.iter_content(1024):
                                    file.write(chunk)
                            print(f"Downloaded: {os.path.basename(fileUrl)} as {fileName}")
                            count += 1
                        else:
                            print(f"Failed to download: {fileUrl}")
                elif fileUrl.lower().endswith((".docx")):
                    if checkKeywordDocx(fileUrl, keyword):
                        fileName = os.path.join(folderPath, "Doc_" + meetingId + "_" + str(count) + ".docx")

                        # Download the file
                        response = requests.get(fileUrl, stream=True)
                        if response.status_code == 200:
                            with open(fileName, "wb") as file:
                                for chunk in response.iter_content(1024):
                                    file.write(chunk)
                            print(f"Downloaded: {os.path.basename(fileUrl)} as {fileName}")
                            count += 1
                        else:
                            print(f"Failed to download: {fileUrl}")
                elif fileUrl.lower().endswith((".doc")):
                    print("exist")
                    if checkKeywordDoc(fileUrl, keyword):
                        fileName = os.path.join(folderPath, "Doc_" + meetingId + "_" + str(count) + ".doc")

                        # Download the file
                        response = requests.get(fileUrl, stream=True)
                        if response.status_code == 200:
                            with open(fileName, "wb") as file:
                                for chunk in response.iter_content(1024):
                                    file.write(chunk)
                            print(f"Downloaded: {os.path.basename(fileUrl)} as {fileName}")
                            count += 1
                        else:
                            print(f"Failed to download: {fileUrl}")
                elif fileUrl.lower().endswith((".pptx")):
                    print("exist")
                    if checkKeywordPptx(fileUrl, keyword):
                        fileName = os.path.join(folderPath, "Doc_" + meetingId + "_" + str(count) + ".pptx")

                        # Download the file
                        response = requests.get(fileUrl, stream=True)
                        if response.status_code == 200:
                            with open(fileName, "wb") as file:
                                for chunk in response.iter_content(1024):
                                    file.write(chunk)
                            print(f"Downloaded: {os.path.basename(fileUrl)} as {fileName}")
                            count += 1
                        else:
                            print(f"Failed to download: {fileUrl}")
            except Exception as e:
                # If you get an AES error or any other exception, print the fileUrl
                print(f"Error with {fileUrl}: {e}")

def checkKeywordPdf(pdfUrl, keyword):
    try:
        # Download the PDF file
        response = requests.get(pdfUrl, stream=True)
        if response.status_code == 200:
            # Save the PDF temporarily
            with open("temp.pdf", "wb") as temp_pdf:
                for chunk in response.iter_content(1024):
                    temp_pdf.write(chunk)

            # Open the PDF with PyMuPDF
            doc = fitz.open("temp.pdf")
            
            # Check if the PDF is encrypted
            if doc.is_encrypted:
                print(f"PDF is encrypted: {pdfUrl}")
                return False

            # Search for the keyword in all pages
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)  # Get a page
                text = page.get_text("text")  # Extract text
                if keyword.lower() in text.lower():  # Case-insensitive search
                    return True
        return False
    except Exception as e:
        print(f"Error with PDF {pdfUrl}: {e}")
        return False


In [64]:
def getFilteredMeetingPdf(leaid, boardDocUrl, keyword):
    stateCode, schoolCode = getStateAndSchoolCode(boardDocUrl)
    if stateCode == "" or schoolCode == "":
        return
    
    meetingIds = getAllMeetings(stateCode, schoolCode)

    if meetingIds == "":
        return
    
    filtredMeetingIds = getFilteredMeetings(keyword, meetingIds, stateCode, schoolCode)
    print("a")
    print(filtredMeetingIds)

    for meetingId in filtredMeetingIds:
        # print("hello")
        htmlMinutes = getMeetingMinutes(meetingId, stateCode, schoolCode)
        if checkKeyword(keyword, htmlMinutes):
            htmlToPdf("meetings", leaid, "Minutes_" + meetingId, htmlMinutes) # ADD PATH TO FOLDER

        htmlMinutes = getMeetingAgenda(meetingId, stateCode, schoolCode)
        if checkKeyword(keyword, htmlMinutes):
            htmlToPdf("meetings", leaid, "Agenda_" + meetingId, htmlMinutes) # ADD PATH TO FOLDER

        getAllFilteredDocuments("meetings", keyword, leaid, meetingId, htmlMinutes)

In [67]:
from docx import Document
import subprocess
from pptx import Presentation

def checkKeywordDocx(docxUrl, keyword):
    try:
        # Download the DOCX file
        response = requests.get(docxUrl)
        if response.status_code == 200:
            # Save the DOCX file temporarily
            with open("temp.docx", "wb") as temp_doc:
                temp_doc.write(response.content)

            # Open the DOCX file with python-docx
            doc = Document("temp.docx")
            
            # Check for the keyword in all paragraphs
            for para in doc.paragraphs:
                if keyword.lower() in para.text.lower():  # Case-insensitive search
                    return True
        return False
    except Exception as e:
        print(f"Error with DOCX {docxUrl}: {e}")
        return False
    

def checkKeywordDoc(docUrl, keyword):
    print("opened")
    try:
        # Download the .doc file
        response = requests.get(docUrl)
        if response.status_code == 200:
            # Save the .doc file temporarily
            doc_path = "temp.doc"
            with open(doc_path, "wb") as temp_doc:
                temp_doc.write(response.content)

            # Use antiword to extract text from the .doc file
            result = subprocess.run(["antiword", doc_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            
            # Check for the keyword in the extracted text
            if result.returncode == 0:
                doc_text = result.stdout
                if keyword.lower() in doc_text.lower():  # Case-insensitive search
                    return True

        return False
    except Exception as e:
        print(f"Error with DOC {docUrl}: {e}")
        return False
    
def checkKeywordPptx(pptxUrl, keyword):
    print("opened")
    try:
        # Download the .pptx file
        response = requests.get(pptxUrl)
        if response.status_code == 200:
            # Save the .pptx file temporarily
            pptx_path = "temp.pptx"
            with open(pptx_path, "wb") as temp_pptx:
                temp_pptx.write(response.content)

            # Open the .pptx file using python-pptx
            presentation = Presentation(pptx_path)

            # Check for the keyword in all slides
            for slide in presentation.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):  # Check if the shape contains text
                        if keyword.lower() in shape.text.lower():  # Case-insensitive search
                            return True

        return False
    except Exception as e:
        print(f"Error with PPTX {pptxUrl}: {e}")
        return False


In [65]:
getFilteredMeetingPdf("3904430", "https://go.boarddocs.com/oh/rlsd/Board.nsf/Public", "This Board hereby")

<?xml version="1.0"?>
<result>
<unid>AQRF763DB117</unid>
<unid>AQRGJJ437EA7</unid>
<unid>AQRF6D3D935C</unid>
<unid>AQRGHX436255</unid>
<unid>AQRF6P3D9F37</unid>
<unid>AQRGJ5436B4B</unid>
<unid>B45G6U4254D3</unid>
<unid>B4D5XY127328</unid>
<unid>AQRF5E3D6F9F</unid>
<unid>AQRGGY433136</unid>
<unid>AQRGRL4505A3</unid>
<unid>AQRM6V5630E7</unid>
<unid>B8GK5C4FAE9F</unid>
<unid>B8GK7S4FAEEE</unid>
<unid>AZ8GE842F5D2</unid>
<unid>AZ8GEP42F5E1</unid>
<unid>AQRGVQ457937</unid>
<unid>AQRM9756856A</unid>
<unid>ARLL8T5417DB</unid>
<unid>ARLL9A5417FB</unid>
<unid>AQPJ464B2BB2</unid>
<unid>AQPJ4D4B350B</unid>
<unid>AQQK8X4E8D6C</unid>
<unid>AQQKMA50E6C9</unid>
<unid>AQRF8J3DE4AB</unid>
<unid>AQRGKN43B8F1</unid>
<unid>AQZK7X4F39EF</unid>
<unid>AQZK874F4482</unid>
<unid>AQRF823DD175</unid>
<unid>AQRGK4439C56</unid>
<unid>AQQK9U4EDA51</unid>
<unid>AQQKRX517614</unid>
<unid>AQRF7F3DBB90</unid>
<unid>AQRGJQ438A16</unid>
<unid>AQRF8C3DDCBD</unid>
<unid>AQRGKG43AC3D</unid>
<unid>AUWL2T547D8B</unid>
<unid>A

Invalid FloatObject b'0.00-80'
Invalid FloatObject b'0.00-80'
Invalid FloatObject b'0.00-80'
Invalid FloatObject b'0.00-80'
Invalid FloatObject b'0.00-80'


Downloaded: RLSD%20-%20Board%20Resolution%20-%20Private-Purpose%20Special%20Trust%20Fund%20(007).pdf as meetings/3904430/Doc_CZCMBV5A7F0D_1.pdf
Downloaded: RLSD%20-%20Board%20Resolution%20-%20Private-Purpose%20Special%20Trust%20Fund%20(007).pdf as meetings/3904430/Doc_CZCMBV5A7F0D_1.pdf
Downloaded: RLSD%20-%20Board%20Resolution%20-%20Girls'%20Golf%2C%20Girls'%20Gymnastics%2C%20and%20Girls'%20Wrestling.pdf as meetings/3904430/Doc_CZCMNL5AF948_1.pdf
Downloaded: RLSD%20-%20Board%20Resolution%20-%20Girls'%20Golf%2C%20Girls'%20Gymnastics%2C%20and%20Girls'%20Wrestling.pdf as meetings/3904430/Doc_CZCMNL5AF948_1.pdf
Downloaded: RLSD%20-%20Board%20Resolution%20-%20Girls'%20Golf%2C%20Girls'%20Gymnastics%2C%20and%20Girls'%20Wrestling.pdf as meetings/3904430/Doc_CZCMNL5AF948_1.pdf
Downloaded: RLSD%20-%20Board%20Resolution%20-%20Girls'%20Golf%2C%20Girls'%20Gymnastics%2C%20and%20Girls'%20Wrestling.pdf as meetings/3904430/Doc_CZCMNL5AF948_1.pdf
Downloaded: Casement%20TIF%20Board%20Resolution%2001-08-

In [62]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('deliverable_2.csv')

# Loop through each row of the DataFrame
for index, row in df.iterrows():
    lea_id = row['LEAID']
    
    # Check if lea_id is not NaN
    if pd.notna(lea_id):
        lea_id = str(int(lea_id))  # Convert LEAID to an integer if it is not NaN
        boarddocs_url = row['boarddocs_url']
        # Assuming the third argument is fixed, e.g., 'lunch', based on your example
        third_argument = 'lunch'
        
        # Print the function call
        print(f'getFilteredMeetingPdf("{lea_id}", "{boarddocs_url}", "{third_argument}")')
        
        # Call the function
        getFilteredMeetingPdf(lea_id, boarddocs_url, third_argument)
    else:
        print(f"Skipping row {index} due to NaN LEAID")


getFilteredMeetingPdf("2632850", "https://go.boarddocs.com/mi/sjs/Board.nsf/Public", "lunch")
getFilteredMeetingPdf("4204710", "https://go.boarddocs.com/pa/cali/Board.nsf/Public", "lunch")
getFilteredMeetingPdf("3904430", "https://go.boarddocs.com/oh/mapleheights/Board.nsf/Public", "lunch")
Downloaded: July%202023%20BOE%20Agenda.pdf as meetings/3904430/Doc_CTVLFU565653_1.pdf
Downloaded: July%202023%20BOE%20Agenda.pdf as meetings/3904430/Doc_CTVLFU565653_1.pdf
Downloaded: COC%202023-24.pdf as meetings/3904430/Doc_CUAM8V5A0F06_1.pdf


KeyboardInterrupt: 