In [9]:
#%pip install requests beautifulsoup4 pymongo pandas

# Improved version 2 (also captures each link's title and date sent)

In [19]:
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd

from urllib.parse import urljoin

# Scrape the Annapolis listing page: read each table row's title, send date and
# link, fetch every linked message page, and store the resulting records in
# MongoDB (one document per row that has a link).

# MongoDB connection setup
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['Annapolis_New']  # Separate database so the old one is not overwritten
collection = db['PD_New']  # Separate collection name for the same reason

# Define the URL to be scraped
base_url = 'https://www.annapolis.gov/list.aspx?PRVMSG=253'

# Upper bound (seconds) on each HTTP request; without a timeout requests.get
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Fetch the listing page. Fail loudly on an HTTP error instead of parsing an
# error page and then reporting a misleading "table not found".
response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table containing the subjects and dates (first <table> on the page;
# adjust the selector if the page layout changes).
table = soup.find('table')
if table is None:
    raise ValueError("Could not find the table on the page.")

# Parse rows in the table
rows = table.find_all('tr')[1:]  # Skip the header row
print(f'Total rows found in table: {len(rows)}')

# Extract subject, date sent, and link for each row
records = []
for row in rows:
    cells = row.find_all('td')
    if len(cells) < 2:
        continue  # Need both the subject cell and the date cell
    link_tag = cells[0].find('a')
    href = link_tag.get('href') if link_tag else None
    if not href:
        continue  # Rows without a link have nothing to scrape
    records.append({
        'title': cells[0].get_text(strip=True),
        'date_sent': cells[1].get_text(strip=True),
        # urljoin resolves site-relative hrefs against the listing URL and
        # leaves already-absolute hrefs intact.
        'link': urljoin(base_url, href),
    })

print(f'Total records extracted from table: {len(records)}')

# Scrape content from each link and update the records
for record in records:
    link = record['link']
    print(f'Scraping: {link}')

    # Default to None so a failed fetch still yields a complete record.
    record['content'] = None
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page must not abort the run: nothing is written to
        # MongoDB until the loop finishes, so a crash here would lose it all.
        print(f'Failed to fetch {link}: {exc}')
        continue
    if response.status_code != 200:
        print(f'Failed to fetch {link}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove all script and style elements so only visible text remains
    for element in soup(["script", "style"]):
        element.extract()

    # Collapse every run of whitespace (including newlines) to a single space
    record['content'] = ' '.join(soup.get_text().split())

# Insert all records into MongoDB. insert_many rejects an empty list, so skip
# the call when nothing was extracted.
# NOTE: re-running this cell inserts the same records again; clear the
# collection first if duplicates are not wanted.
if records:
    collection.insert_many(records)

# Print completion message
print(f'Done! Inserted {len(records)} documents into the collection.')

# Convert to DataFrame for display or additional processing.
# insert_many adds an '_id' key to each dict in place, so that column appears
# in the frame as well.
df = pd.DataFrame(records)
print(df.head())

Total rows found in table: 2131
Total records extracted from table: 2131
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/247473
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/247344
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/247093
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/247086
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/247083
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/246948
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/246523
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/246401
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/246297
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/246188
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/246187
Scraping: https://www.annapolis.gov/CivicSend/ViewMessage/message/246128
Scraping: https://www.annapolis.gov/CivicSend/ViewM

In [3]:
import pymongo

# Report how many documents the scrape stored.
# Connect to the local MongoDB server (change the connection string, database
# name, or collection name here if yours differ).
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['Annapolis_New']
collection = db['PD_New']

# An empty filter matches every document in the collection.
document_count = collection.count_documents(filter={})
print(f"Total documents in collection: {document_count}")

Total documents in collection: 2131


# Original

In [None]:
from bs4 import BeautifulSoup
import requests
import pymongo
from pymongo import MongoClient
import pandas as pd

# Scrape every CivicSend message linked from the Annapolis listing page and
# store each page's cleaned text in MongoDB, one document per link.

# MongoDB connection setup
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['Annapolis']
collection = db['PD']

# When True, all existing records are deleted before scraping, so the
# "already in database" check below only guards against links that repeat
# within this run. Set to False to keep earlier documents and skip links that
# were scraped previously.
RESET_COLLECTION = True
if RESET_COLLECTION:
    collection.delete_many({})

# Define the URL to be scraped
base_url = 'https://www.annapolis.gov/list.aspx?PRVMSG=253'

# Upper bound (seconds) on each HTTP request; without a timeout requests.get
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Make the request and create BeautifulSoup object; stop on an HTTP error
# rather than silently scraping zero links from an error page.
response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find all <a> tags
links = soup.find_all('a')

print(f'Total links found: {len(links)}')  # Print the total number of <a> tags

# Keep only message links and prepend the site root. The prefix check
# guarantees each kept href starts with "/", so plain concatenation is safe.
message_prefix = "/CivicSend/ViewMessage/message/"
filtered_links = [
    "https://www.annapolis.gov" + href
    for href in (tag.get('href') for tag in links)
    if href and href.startswith(message_prefix)
]

print(f'Total filtered links: {len(filtered_links)}')  # Print the number of filtered links

print(f'Scraping {len(filtered_links)} links...')  # Print the number of links to be scraped

# For each link, scrape the data and save to MongoDB
for link in filtered_links:
    print(f'Scraping: {link}')  # Print the link being scraped

    # Check if the link already exists in the database
    existing_doc = collection.find_one({'scraping_link': link})
    if existing_doc:
        print(f'Skipping {link} - already in database')
        continue

    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Keep going: one unreachable page should not abort the whole scrape.
        print(f'Failed to fetch {link}: {exc}')
        continue
    if response.status_code != 200:
        # Do not store the text of an error page as if it were a message.
        print(f'Failed to fetch {link}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove all script and style elements so only visible text remains
    for element in soup(["script", "style"]):
        element.extract()

    # Collapse every run of whitespace (including newlines) to a single space
    cleaned_text = ' '.join(soup.get_text().split())

    # Add to MongoDB
    document = {'message': cleaned_text, 'scraping_link': link}
    collection.insert_one(document)

print('Done!')  # Print a completion message

# Print the total number of documents in the collection
print(f'Total documents in collection: {collection.count_documents({})}')