In [13]:
import json
from datetime import datetime
from urllib.parse import urljoin

import pytz
import requests
from bs4 import BeautifulSoup

# Function to fetch data from the website
def fetch_state_finance_data():
    json_data = []
    ist = pytz.timezone('Asia/Kolkata')
    site_url = "https://malegislature.gov/Laws/GeneralLaws/PartI/TitleIII/Chapter29"
    page_response = requests.get(site_url)
    page_soup = BeautifulSoup(page_response.content, "html.parser")
    general_laws_list = page_soup.select("main .content .container .row .col-xs-12.col-md-8 .row .col-xs-12 ul")
    print(len(general_laws_list))
    if general_laws_list:
            # Extract links from <u class="generalLawsList">
             for ul_tag in general_laws_list:
                li_tags = ul_tag.find_all("li")
                for li_tag in li_tags:
                    a_tag = li_tag.find("a")
                    if a_tag and 'href' in a_tag.attrs:
                        # Join the base URL with the relative URL
                        full_link = "https://malegislature.gov" + a_tag['href']
                        # Fetch the content of the link
                        link_response = requests.get(full_link)
                        link_soup = BeautifulSoup(link_response.content, "html.parser")

                        # Find the h2 content inside the specified structure
                        link_h2_content = link_soup.select_one("main .content .container .row .col-xs-12.col-md-8 h2")
                        link_sub_heading = link_h2_content.get_text().strip() if link_h2_content else "Heading not found"
                        # Find all p tags
                        link_p_tags = link_soup.select("main .content .container .row .col-xs-12.col-md-8 p")
                        link_text_content = '\n'.join([p.get_text().strip() for p in link_p_tags])

                        # If there is at least one p tag, create JSON object
                        if link_p_tags:
                            # Get current IST datetime
                            now_utc = datetime.now(pytz.utc)
                            now_ist = now_utc.astimezone(ist)
                            formatted_date_time_ist = now_ist.strftime("%m%d%y %H:%M:%S")

                            # Create the JSON object
                            text_object_json = {
                                "text": link_text_content,
                                "metadata": {
                                    "date_downloaded": formatted_date_time_ist,
                                    "site_url": full_link,
                                    "extra_data": {
                                        "heading": "Massachusetts state finance laws(chapter 29)",
                                        "section": link_sub_heading
                                    }
                                },
                                "volunteer_id": "2121",
                                "location": "Pune, Maharashtra, India"
                            }

                            # Append the JSON object to json_data list
                            json_data.append(text_object_json)

    return json_data



In [14]:
# Fetch data and store in json_data list
json_data = fetch_state_finance_data()

# Preview only the first record: printing the whole list floods the cell
# output with the full text of every scraped section.
print(json_data[:1])

1
[{'text': "Section 1. All words and terms defined by section 1 of chapter 7C and appearing in this chapter, except for the phrases ''state agency'' and ''state authority'', shall have the meaning defined in that section, unless the context shall indicate another meaning or intent.  \nAs used in this chapter, the following words shall, unless the context clearly requires otherwise, have the following meanings:—  \n''Account'', a separate 8–digit number designated in the state accounting system to separately record budgetary, bond, federal or trust funds.  \n''Agency head'' or ''department head'', the administrative head of a state agency, department, board, bureau, office or division of the commonwealth who has been authorized through legislation to obligate and expend funds, comply with legislative mandates and make any certifications or approvals required under this chapter or other state or federal laws or regulations requiring an agency head certification or approval.  \n''Allotme

In [27]:
# Spot-check a single scraped record by position (index 39 is hard-coded and
# raises IndexError if fewer than 40 records were scraped).
json_data[39]

{'text': "Section 2GGGG. (a) There shall be established and set upon the books of the commonwealth a separate fund to be known as the Distressed Hospital Trust Fund to be expended, without further appropriation, by the health policy commission. The fund shall consist public and private sources such as gifts, grants and donations, interest earned on such revenues and any funds provided from other sources.  \nThe board of the health policy commission, as trustee, shall administer the fund and shall make expenditures from the fund consistent with this section; provided, however, that not more than 10 per cent of the amounts held in the fund in any 1 year shall be used by the commission for the combined cost of program administration, technical assistance to grantees or program evaluation.  \n(b) Revenues deposited in the fund that are unexpended at the end of the fiscal year shall not revert to the General Fund and shall be available for expenditure in the following fiscal year.  \n(c) Al

In [28]:
# Persist the scraped records to disk as pretty-printed UTF-8 JSON,
# keeping non-ASCII characters as-is instead of \u escapes.
output_file = "state_finance_laws_scrapping.json"
with open(output_file, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(json_data, ensure_ascii=False, indent=4))

In [29]:
# Read the saved file back in to confirm it parses as JSON.
with open('state_finance_laws_scrapping.json', mode="r", encoding="utf-8") as f:
    new_data = json.loads(f.read())

In [30]:
# Number of records read back from the saved JSON file.
len(new_data)

283

In [None]:
# Number of records currently held in memory.
# NOTE(review): the recorded output for this cell (679) does not match the 283
# records reloaded from the saved file above; json_data was presumably rebuilt
# by a later or out-of-order run. Restart the kernel and run all cells to confirm.
print(len(json_data))

679
