<a href="https://colab.research.google.com/github/amoheric/Data-Science-Projects/blob/main/Council_of_Accountability_Court_Judges_Report_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Python Libraries


In [67]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
from matplotlib import pyplot as plt

 ## Getting the HTML Content

In [68]:
# Function to get the HTML content of a given URL
def get_html_content(url):
    """
    Fetch the content of the provided URL.

    Parameters:
    url (str): The URL of the webpage to scrape.

    Returns:
    soup (BeautifulSoup): Parsed HTML content of the webpage.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

### Extracting Report Links From The Webpage

In [69]:
# Function to extract report links from the webpage
def extract_report_links(soup):
    """
    Extracts links to annual reports or other important documents from the webpage.

    Parameters:
    soup (BeautifulSoup): Parsed HTML content of the webpage.

    Returns:
    reports (list): A list of dictionaries containing report names and their URLs.
    """
    reports = []
    for link in soup.find_all('a', href=True):

        # Example condition to identify report links (modify as needed)
        if 'Annual Report' in link.text or 'Performance Metrics' in link.text:
            reports.append({
                'Report Name': link.text.strip(),
                'URL': link['href']
            })

    return reports

#### Getting The HTML Structure For Review

In [70]:
# Function to fetch HTML content
def get_html_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        return BeautifulSoup(response.content, 'lxml')  # Or 'html.parser', 'html5lib'
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

### Extracting Quantitative Data

In [71]:
# Function to extract quantitative data from the reports
def extract_quantitative_data(reports):
    """
    Extracts quantitative data such as recidivism rates, completion rates, etc., from the reports.

    Parameters:
    reports (list): A list of dictionaries containing report names and their URLs.

    Returns:
    data (pd.DataFrame): DataFrame containing the extracted quantitative data.
    """
    data = []
    for report in reports:
        # Fetch the report content (this is a placeholder for actual logic)
        report_content = get_html_content(report['URL'])


        if report_content:
            # Print content to verify it's being captured
            print(f"Scraping report: {report['Report Name']}")
            print(report_content.text[:500])  # Print first 500 characters to inspect


        # Example extraction logic (modify based on actual data structure)
        # Use regex or other parsing methods to extract numerical data
        # For example, extracting recidivism rates
        recidivism_rate = re.findall(r'Recidivism Rate: (\d+)%', report_content.text)


        if recidivism_rate:
            data.append({
                'Report Name': report['Report Name'],
                'Recidivism Rate (%)': recidivism_rate[0]
            })


        else:
            print(f"No data found for {report['Report Name']} at {report['URL']}")


    else:
            print(f"Skipping {report['Report Name']} due to error fetching content.")

    # Convert list of dictionaries to DataFrame for further analysis
    df = pd.DataFrame(data)
    return df



### Extracting Qualitative Data

In [72]:
# Function to extract qualitative data (e.g., descriptions, feedback)
def extract_qualitative_data(reports):
    """
    Extracts qualitative data such as descriptions, feedback, and case studies from the reports.

    Parameters:
    reports (list): A list of dictionaries containing report names and their URLs.

    Returns:
    qualitative_data (list): A list of extracted qualitative data.
    """
    qualitative_data = []
    for report in reports:
        # Fetch the report content
        report_content = get_html_content(report['URL'])


        # Example extraction logic (modify based on actual content)
        # Extract paragraphs containing case studies or feedback
        paragraphs = report_content.find_all('p')
        for para in paragraphs:

            if 'Case Study' in para.text or 'Feedback' in para.text:
                qualitative_data.append(para.text.strip())

    return qualitative_data

### Visualizing The Extracted Quantitative Data

In [73]:
# Function to visualize the quantitative data
def visualize_quantitative_data(df):
    """
    Visualizes the extracted quantitative data using bar charts.

    Parameters:
    df (pd.DataFrame): DataFrame containing the extracted quantitative data.
    """
   # Check if the DataFrame is empty
    if df.empty:
        print("No data available to visualize.")
        return

    # Print the columns to verify the correct names
    print("DataFrame columns:", df.columns)

    # Visualize the data if columns exist
    try:
        plt.figure(figsize=(10, 6))
        plt.bar(df['Report Name'], df['Recidivism Rate (%)'].astype(int), color='blue')
        plt.xlabel('Report')
        plt.ylabel('Recidivism Rate (%)')
        plt.title('Recidivism Rates by Report')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
    except KeyError as e:
        print(f"KeyError: {e} - Check if the column names match the expected format.")

## Functions To Run The Data

In [74]:
# Main function to run the data scraping and analysis
def main():

    # URL of the CACJ resources or reports page
    url = 'https://cacj.georgia.gov/resources','https://cacj.georgia.gov/reports','https://cacj.georgia.gov/data-research','https://cacj.georgia.gov/fy2024-quarterly-data-collection-materials','https://cacj.georgia.gov/fy2023-quarterly-data-collection-materials','https://cacj.georgia.gov/fy2022-quarterly-data-collection-materials','https://cacj.georgia.gov/fy2021-quarterly-data-collection-materials','https://cacj.georgia.gov/fy2020-quarterly-data-collection-materials','https://cacj.georgia.gov/fy2019-quarterly-data-collection-materials','https://cacj.georgia.gov/treatment'

    # Get the HTML content of the webpage
    soup = get_html_content(url)


    # Extract report links
    reports = extract_report_links(soup)


    # Extract quantitative data
    quantitative_df = extract_quantitative_data(reports)


    # Debugging: Check if quantitative_df is defined and its structure
    print(quantitative_df.columns)
    print(quantitative_df.head())


    # Visualize quantitative data
    visualize_quantitative_data(quantitative_df)


    # Print qualitative data for review
    qualitative_data = extract_qualitative_data(reports)
    for i, data in enumerate(qualitative_data):
        print(f"Qualitative Data {i+1}: {data}\n")


    # Optionally, save the data to CSV or JSON
    quantitative_df.to_csv('quantitative_data.csv', index=False)
    with open('qualitative_data.json', 'w') as f:
        json.dump(qualitative_data, f, indent=4)



if __name__ == "__main__":
    reports = [
        {'Report Name': 'Annual Report 2023', 'URL': 'https://online.pubhtml5.com/krpu/ryir/'},
        {'Report Name': 'Annual Report 2022', 'URL': 'https://cacj.georgia.gov/document/document/fy22-cacj-statewide-report/download'},
        {'Report Name': 'Annual Report 2021', 'URL': 'https://cacj.georgia.gov/document/document/fy21-cacj-statewide-report/download'},
        {'Report Name': 'Annual Report 2020', 'URL': 'https://cacj.georgia.gov/document/document/fy20-cacj-statewide-report/download'},
        {'Report Name': 'Annual Report 2019', 'URL': 'https://cacj.georgia.gov/document/data/fy19-cacj-statewide-report/download'}
    ]

    df = extract_quantitative_data(reports)

    if df.empty:
        print("No data extracted.")
    else:
        print(df)



Scraping report: Annual Report 2023


FINAL_FY23 CACJ Annual Report _Published












































No data found for Annual Report 2023 at https://online.pubhtml5.com/krpu/ryir/
Scraping report: Annual Report 2022
%âãÏÓ
           
<>/Filter/FlateDecode/ID[<194D99256300EA488357B6DEDE8B71D7><
No data found for Annual Report 2022 at https://cacj.georgia.gov/document/document/fy22-cacj-statewide-report/download
Scraping report: Annual Report 2021
%âãÏÓ
          
<>/Filter/FlateDecode/ID[]/Index[1337 25]/Info 1336 0 R/Length 86/Prev 441957/Root 1338 0 R/Size 1362/Type/XRef/W[1 3 1]>>stream
hÞbbd```b``¶‘s@$“5XÄÌ. ‘,Ÿ@$£X<
No data found for Annual Report 2021 at https://cacj.georgia.gov/document/document/fy21-cacj-statewide-report/download
Scraping report: Annual Report 2020
%âãÏÓ
<
No data found for Annual Report 2020 at https://cacj.georgia.gov/document/document/fy20-cacj-statewide-report/download
Scraping report: Annual Report 2019
%âãÏÓ
             
<>/Filter/Fl

In [75]:
# Assuming 'df' is your DataFrame
print(df.head())  # Prints the first few rows of the DataFrame


Empty DataFrame
Columns: []
Index: []
