# Install Required Libraries
Use pip to install missing libraries such as requests and BeautifulSoup.

In [25]:
# Install Required Libraries
# Ensure necessary libraries are installed
import subprocess
import sys

def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install libraries
try:
    import requests
except ImportError:
    install("requests")

try:
    from bs4 import BeautifulSoup
except ImportError:
    install("beautifulsoup4")

# Handle Missing Libraries
Check for missing libraries and provide instructions to install them.

In [26]:
# Handle Missing Libraries
# Function to check for missing libraries
def check_missing_libraries():
    missing_libraries = []
    try:
        import requests
    except ImportError:
        missing_libraries.append("requests")
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        missing_libraries.append("beautifulsoup4")
    
    if missing_libraries:
        print("The following libraries are missing:")
        for lib in missing_libraries:
            print(f"- {lib}")
        print("Use pip to install the missing libraries.")
    else:
        print("All required libraries are installed.")

check_missing_libraries()

All required libraries are installed.


# Process JSON Data
Load and process JSON data from the compliance_checks.json file.

In [27]:
# Process JSON Data
# Load and process JSON data
input_file = "/Users/apple/Desktop/test-1/compliance_checks.json"

try:
    with open(input_file, "r") as f:
        data = json.load(f)
        print("Loaded JSON data:")
        print(data)
except FileNotFoundError:
    print(f"Error: File not found at {input_file}")
except json.JSONDecodeError:
    print("Error: Failed to decode JSON data")

Loaded JSON data:


# web crawling
Implement error handling for missing response syntax and HTTP errors.

In [40]:
import json
import os
import requests
from bs4 import BeautifulSoup

# Define the path to the JSON file
json_file_path = '/Users/apple/Desktop/test-1/compliance_checks.json'

# Open and load the JSON data
with open(json_file_path, 'r') as json_file:
    json_data = json.load(json_file)
    print("JSON data successfully loaded.")

# Function to fetch response syntax
def fetch_response_syntax(service, function):
    service = service.strip()
    function = function.strip()
    url = f"https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/{service}/client/{function}.html".replace('%20', '')
    print(f"Fetching URL: {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the "Response Syntax" section
    response_syntax_section = soup.find('h3', string='Response Syntax')
    if response_syntax_section:
        pre_tag = response_syntax_section.find_next('div', class_='highlight-default notranslate').find('pre')
        if pre_tag:
            raw_content = pre_tag.text
            print(f"Raw content extracted: {raw_content[:100]}...")  # Log raw content for debugging
            return raw_content

    print(f"Response Syntax not found for {service}/{function}")
    return None

# Function to process JSON data and fetch response syntax
def process_compliance_checks(json_data):
    output_folder = os.path.join('/Users/apple/Desktop/test-1', 'output')
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for item in json_data:
        function_name = item.get('Function Name', '').strip()
        api_function = item.get('API function', '').strip()
        user_function = item.get('user function', '').strip()

        if not function_name or not api_function or not user_function:
            print(f"Skipping invalid entry: {item}")
            continue

        # Add error handling for cases where the expected string pattern is not found
        try:
            service = api_function.split("client = boto3.client(")[1].split(")")[0].strip("'")
        except IndexError:
            print(f"Error: Unable to extract service from API function: {api_function}")
            continue

        try:
            function = user_function.split('(')[0].strip()
        except IndexError:
            print(f"Error: Unable to extract function from user function: {user_function}")
            continue

        response_syntax = fetch_response_syntax(service, function)
        if response_syntax:
            output_file_name = f"{service}_{function_name}_{function}.json"
            output_file_path = os.path.join(output_folder, output_file_name)
            with open(output_file_path, 'w') as output_file:
                json.dump(response_syntax, output_file, indent=4)
            print(f"Saved response syntax to {output_file_path}")

# Process the compliance checks
process_compliance_checks(json_data)

# Add a function to analyze and log the structure of the HTML page
def analyze_html_structure(url):
    print(f"Analyzing HTML structure for URL: {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Log all <h2> tags and their following content
    print("Headers (h2) and their following content:")
    for header in soup.find_all('h2'):
        print(f"Header: {header.text.strip()}")
        next_element = header.find_next()
        while next_element and next_element.name != 'h2':
            print(f"  {next_element.name}: {next_element.text.strip()[:100]}...")
            next_element = next_element.find_next()

    # Log all <pre> tags
    print("Code blocks (<pre>):")
    for pre_tag in soup.find_all('pre'):
        print(f"  {pre_tag.text.strip()[:100]}...")

# Example usage
url = "https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2/client/describe_instances.html"
analyze_html_structure(url)

JSON data successfully loaded.
Fetching URL: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2/client/describe_instances.html


Raw content extracted: {
    'NextToken': 'string',
    'Reservations': [
        {
            'ReservationId': 'string',
...
Saved response syntax to /Users/apple/Desktop/test-1/output/ec2_ec2_instance_managed_by_ssm_describe_instances.json
Fetching URL: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2/client/describe_instances.html
Raw content extracted: {
    'NextToken': 'string',
    'Reservations': [
        {
            'ReservationId': 'string',
...
Saved response syntax to /Users/apple/Desktop/test-1/output/ec2_ec2_instance_older_than_specific_days_describe_instances.json
Fetching URL: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ssm/client/describe_instance_patch_states.html
Raw content extracted: {
    'InstancePatchStates': [
        {
            'InstanceId': 'string',
            'PatchGroup...
Saved response syntax to /Users/apple/Desktop/test-1/output/ssm_ssm_managed_compliant_patching_describe_instance_patc

KeyboardInterrupt: 