# POLI 179 Final Project
### By: Alyson Otañez 

## Scraping city council agenda text for: Ontario, Fontana, Rialto, and March Joint Powers Authority 

### Website Links:
* Ontario - https://www.ontarioca.gov/Agendas/CityCouncil
* Fontana - https://fontana.legistar.com/Calendar.aspx
* Rialto - https://rialto.legistar.com/Calendar.aspx
* March Joint Powers Authority - https://marchjpa.com/meetings-agendas/
* Chino - http://chinocityca.iqm2.com/Citizens/Calendar.aspx

#### Note: This code only reads text from PDFs that contain extractable text. Some PDFs were scanned, meaning their pages are images; the text of those PDFs was read using R instead. This extra step was needed for Ontario and the March Joint Powers Authority. The entire text for the city of Chino was extracted using R; only its date and year were extracted using Python.

#### Note: This code should be run separately for each website. The code below is just a template.

In [None]:
# Import packages, may need to use 'pip install' before importing

import codecs
from PyPDF2 import PdfReader
import webbrowser
import requests
from bs4 import BeautifulSoup
from tika import parser
import pandas as pd
import requests
from PyPDF2 import PdfReader
import pandas as pd
from requests.exceptions import ChunkedEncodingError
import time
from datetime import datetime
import re

In [None]:
# Read the saved HTML files and combine their contents
html = []  # Paths (as strings) of the HTML files saved in your working directory.
           # Download every page if the contents span more than one page.

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between operating systems. Undecodable bytes are
# replaced rather than raising, so one stray byte does not abort the whole run.
pages = []
for filename in html:  # Open and read the contents of each HTML file
    with open(filename, 'r', encoding='utf-8', errors='replace') as f:
        pages.append(f.read())

full_html = ''.join(pages)  # Contents of every file, concatenated in list order

In [None]:
# Find the keyword that corresponds to all agenda links

keyword = ''  # Agendas have a common name.
              # Example: all links contain the phrase 'agenda-file' that would be the keyword

# Record the starting offset of every occurrence of the keyword in full_html.
found = []
if keyword:  # An empty keyword would "match" at every character offset, so skip the search
    position = full_html.find(keyword)
    while position != -1:
        found.append(position)
        # Resume one character later so overlapping occurrences are still recorded
        position = full_html.find(keyword, position + 1)

count = len(found)  # Number of occurrences found

In [None]:
# Extract the agenda links

file_links = []  # List to store the links

# Each link runs from an offset recorded in `found` up to (not including) the
# next double quote -- presumably the quote that closes the link attribute.
for start in found:
    end = full_html.find('"', start)
    if end == -1:
        # No closing quote after this match: the text is cut off, so there is
        # no complete link to extract. Skip it instead of running past the end.
        continue
    file_links.append(full_html[start:end])

In [None]:
# Prefix every extracted link with the site's base url to get a complete link

base_url = "https://www.ontarioca.gov/" # Example
complete_links = [base_url + link for link in file_links]  # Complete links, in the same order as file_links

In [None]:
# Functions to download the pdfs listed in complete_links and extract their text

def download_pdf(url, max_retries=3, timeout=30, retry_delay=5):
    """Download the file at *url* and return its raw bytes.

    Args:
        url: Address of the PDF to download.
        max_retries: Maximum number of download attempts.
        timeout: Seconds to wait for the server before an attempt is abandoned.
        retry_delay: Seconds to pause between failed attempts.

    Returns:
        The response body as bytes, or None if no attempt succeeded.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Without a timeout, a stalled server would block the run forever.
            response = requests.get(url, timeout=timeout)
            # Treat HTTP error statuses as failures so an error page is never
            # handed back as if it were PDF data.
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as e:
            # RequestException covers ChunkedEncodingError as well as
            # timeouts, connection failures and HTTP error statuses.
            if attempt == max_retries:
                break  # Nothing left to retry, so do not wait again
            print(f"Download error occurred ({e}). Retrying after {retry_delay} seconds...")
            time.sleep(retry_delay)

    return None

def extract_text_from_pdf(url):
    """Download the PDF at *url* and return the text of its pages.

    Args:
        url: Address of the PDF to download.

    Returns:
        The text of every page concatenated in page order, or '' when the
        download fails. A page whose text extraction raises is reported and
        skipped.
    """
    import io  # Local import: only this function needs an in-memory byte stream

    pdf_data = download_pdf(url)
    if pdf_data is None:
        print(f"Failed to download PDF from URL: {url}")
        return ''

    # Parse straight from memory instead of writing and re-reading a fixed
    # 'temp.pdf', which left a file behind in the working directory and could
    # collide with another run in the same folder.
    reader = PdfReader(io.BytesIO(pdf_data))

    page_texts = []
    for page_num, page in enumerate(reader.pages):
        try:
            # `or ''` keeps going if a page yields None rather than a string
            page_texts.append(page.extract_text() or '')
        except Exception as e:
            print(f"Error extracting text from page {page_num + 1}: {e}") # Code proceeds if there is an error

    return ''.join(page_texts)

In [None]:
# Apply the function to complete_links

rows = []  # One record per link that was processed without an error

for link in complete_links:
    try:
        text = extract_text_from_pdf(link)
    except Exception as e:
        print(f"Error processing PDF link: {link}. Exception: {e}") # Code proceeds if there is an error
    else:
        rows.append({'PDF Link': link, 'Text': text})

# Build the dataframe once from the collected rows. DataFrame.append was
# removed in pandas 2.0; calling it inside the try block made every link fail
# and left the dataframe empty.
df = pd.DataFrame(rows, columns=['PDF Link', 'Text'])

print(df)

# The next step is initially for Fontana and Rialto.
### March Joint Powers Authority and Ontario had missing text because some of their PDFs were scanned. That text was extracted in R, and the data was then read back into Python, where the functions below were applied to get the date and year. For the city of Chino, the text was gathered in R and the date and year were extracted using Python.

In [None]:
# Pattern and function used to pull the meeting date out of the agenda text

date_regex = r"(Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday), (January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}"

def extract_date(text):
    """Return the first 'Weekday, Month D, YYYY' date found in *text*.

    Returns None when *text* is not a string or contains no such date.
    """
    if not isinstance(text, str):
        return None
    hit = re.search(date_regex, text)
    return hit.group(0) if hit else None  # The whole matched date string

# Store the date found in each row's text in a new 'Date' column
df['Date'] = df['Text'].map(extract_date)
df

In [None]:
# Pattern and function used to pull the meeting year out of the agenda text

date_regex = r"(Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday), (January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4})"

def extract_year(text):
    """Return the year of the first 'Weekday, Month D, YYYY' date in *text*.

    The year is returned as a four-digit string. Returns None when *text* is
    not a string or contains no such date.
    """
    if not isinstance(text, str):
        return None
    hit = re.search(date_regex, text)
    return hit.group(4) if hit else None  # Group 4 captures the year digits

# Store the year found in each row's text in a new 'Year' column
df['Year'] = df['Text'].map(extract_year)
df

In [None]:
# Save df as a csv file named 'df.csv'
# index=False leaves the dataframe's row index out of the file.

df.to_csv('df.csv', index=False)