In [1]:
import json
from datetime import datetime
from urllib.parse import urljoin

import pytz
import requests
from bs4 import BeautifulSoup

# Function to fetch data from the website
def fetch_financial_services_data():
    json_data = []
    ist = pytz.timezone('Asia/Kolkata')
    main_url = "https://www.mass.gov/lists/selected-financial-services-laws"

    # Get the main page content
    response = requests.get(main_url)
    soup = BeautifulSoup(response.content, "html.parser")
    download_links = soup.find_all("div", class_="ma__download-link")

    data = []

    # Process each download link element
    for link in download_links:
        # Find the span element within the div
        span_element = link.find("span")

        if span_element:
            # Find the a tag within the span element
            a_tag = span_element.find("a")

            if a_tag and 'href' in a_tag.attrs:
                full_url = a_tag['href']

                # Add the URL to the data list
                data.append({
                    'url': full_url
                })

    # Process each URL
    for entry in data:
        site_url = entry['url']

        # Fetch the page content
        page_response = requests.get(site_url)
        page_soup = BeautifulSoup(page_response.content, "html.parser")

        # Find the h2 content inside the specified structure
        h2_content = page_soup.select_one("main .content .container .row .col-xs-12.col-md-8 h2")
        sub_heading = h2_content.get_text().strip() if h2_content else "Heading not found"

        # Find all p tags
        p_tags = page_soup.select("main .content .container .row .col-xs-12.col-md-8 p")

        # Check if there is at least one p tag
        if p_tags:
            text_content = '\n'.join([p.get_text().strip() for p in p_tags])

            # Get current IST datetime
            now_utc = datetime.now(pytz.utc)
            now_ist = now_utc.astimezone(ist)
            formatted_date_time_ist = now_ist.strftime("%m%d%y %H:%M:%S")

            # Create the JSON object
            text_object_json = {
                "text": text_content,
                "metadata": {
                    "date_downloaded": formatted_date_time_ist,
                    "site_url": site_url,
                    "extra_data": {
                        "heading": "Massachusetts General Laws (M.G.L.) applying to the financial services industry.",
                        "sub_heading": sub_heading,
                    }
                },
                "volunteer_id": "2121",
                "location": "Pune, Maharashtra, India"
            }

            # Append the JSON object to json_data list
            json_data.append(text_object_json)

        general_laws_list = page_soup.select("main .content .container .row .col-xs-12.col-md-8 .row .col-xs-12 ul")

        if general_laws_list:
            # Extract links from <u class="generalLawsList">
             for ul_tag in general_laws_list:
                li_tags = ul_tag.find_all("li")
                for li_tag in li_tags:
                    a_tag = li_tag.find("a")
                    if a_tag and 'href' in a_tag.attrs:
                        # Join the base URL with the relative URL
                        full_link = "https://malegislature.gov" + a_tag['href']

                        # Fetch the content of the link
                        link_response = requests.get(full_link)
                        link_soup = BeautifulSoup(link_response.content, "html.parser")

                        # Find the h2 content inside the specified structure
                        link_h2_content = link_soup.select_one("main .content .container .row .col-xs-12.col-md-8 h2")
                        link_sub_heading = link_h2_content.get_text().strip() if link_h2_content else "Heading not found"

                        # Find all p tags
                        link_p_tags = link_soup.select("main .content .container .row .col-xs-12.col-md-8 p")
                        link_text_content = '\n'.join([p.get_text().strip() for p in link_p_tags])

                        # If there is at least one p tag, create JSON object
                        if link_p_tags:
                            # Get current IST datetime
                            now_utc = datetime.now(pytz.utc)
                            now_ist = now_utc.astimezone(ist)
                            formatted_date_time_ist = now_ist.strftime("%m%d%y %H:%M:%S")

                            # Create the JSON object
                            text_object_json = {
                                "text": link_text_content,
                                "metadata": {
                                    "date_downloaded": formatted_date_time_ist,
                                    "site_url": full_link,
                                    "extra_data": {
                                        "heading": "Massachusetts General Laws (M.G.L.) applying to the financial services industry.",
                                        "sub_heading": sub_heading,
                                        "section": link_sub_heading
                                    }
                                },
                                "volunteer_id": "2121",
                                "location": "Pune, Maharashtra, India"
                            }

                            # Append the JSON object to json_data list
                            json_data.append(text_object_json)

    return json_data



In [2]:
# Fetch data and store in json_data list
json_data = fetch_financial_services_data()

# Print the json_data list
print(json_data)

[{'text': "Section 24. As used in sections 24 to 28, inclusive the following words shall have the following meanings, unless the context requires otherwise:—  \n''Commissioner'', the commissioner of banks.  \n''Consumer'', any natural person obligated or allegedly obligated to pay any debt.  \n''Creditor'', any person who offers or extends credit creating a debt or to whom a debt is owed, but the term shall not include a person to the extent that he receives an assignment or transfer of a debt in default solely for the purpose of facilitating collection of the debt for another.  \n''Debt'', any obligation or alleged obligation of a consumer to pay money arising out of a transaction in which the money, property, insurance, or services which are the subject of the transaction are primarily for personal, family, or household purposes, whether or not the obligation has been reduced to judgment.  \n''Debt collector'', any person who uses an instrumentality of interstate commerce or the mail

In [13]:
# Display the record at index 40 as a spot check of the scraped structure.
json_data[40]

{'text': 'Section 2. This chapter shall not apply to the following:  \n(a) Credit transactions involving extensions of credit primarily for business, commercial, or agricultural purposes, or to government or governmental agencies or instrumentalities, or to organizations.  \n(b) Transactions in securities or commodities accounts with a broker-dealer registered with the Securities and Exchange Commission.  \n(c) Credit transactions, other than those in which a security interest is or will be acquired in real property, or in personal property used or expected to be used as the principal dwelling of the consumer, in which the total amount financed exceeds twenty-five thousand dollars.  \n(d) Transactions under public utility tariffs, if the commissioner determines that the public utilities commission regulates the charges for such public utility services involved, the charges for delayed payment, and any discount allowed for early payment.  \n(e) Loans made, insured, or guaranteed pursuan

In [14]:
# Persist the scraped records as pretty-printed, non-ASCII-preserving UTF-8 JSON.
output_file = "Massachusetts_General_Laws_applying_to_the_financial_services_industry.json"
serialized_records = json.dumps(json_data, ensure_ascii=False, indent=4)
with open(output_file, mode="w", encoding="utf-8") as out_handle:
    out_handle.write(serialized_records)

In [15]:
# Read the file back through the same `output_file` path it was written to,
# rather than a hardcoded '/content/...' absolute path that only exists on Colab.
with open(output_file, "r", encoding="utf-8") as f:
    new_data = json.load(f)

In [17]:
# Number of records read back from the JSON file.
len(new_data)

679

In [3]:
# Number of records held in the in-memory json_data list.
print(len(json_data))

679
