In [9]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
from collections import OrderedDict

# URL of the Reddit page with blacklisted websites
url = "https://www.reddit.com/r/gundeals/wiki/blacklisted_websites/"

# Request headers sent with every fetch.
# NOTE(review): presumably a browser-like User-Agent is used because the
# default python-requests one gets rejected or throttled -- confirm.
headers = {'User-Agent': 'Mozilla/5.0'}


def getdata(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Failing here prevents an error page from being parsed as if it
            were the wiki, which would silently produce an empty blocklist.
        requests.RequestException: On connection problems or timeout.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.text


# Fetch the HTML data and parse it
htmldata = getdata(url)
soup = BeautifulSoup(htmldata, 'html.parser')

# Extract the relevant sections of the HTML
banned_sites = []
scam_sites = []
compromised_sites = []

# Section name -> list that collects the sites found in that section.
sites_by_section = {
    "banned": banned_sites,
    "scam": scam_sites,
    "compromised": compromised_sites,
}

# Heading text marker -> section it opens. The first matching marker wins.
# A marker mapped to None closes the current section, so the tables that
# follow it are ignored.
section_markers = (
    ("Banned with prejudice", "banned"),
    ("Scam Websites", "scam"),
    ("Compromised Websites", "compromised"),
    ("Does not meet Subreddit Standards", None),
)

current_section = None

# Identify sections by their headings. A heading that matches no marker
# keeps the section opened by an earlier heading.
for heading in soup.find_all(['h1', 'h4']):
    heading_text = heading.get_text(strip=True)
    for marker, section in section_markers:
        if marker in heading_text:
            current_section = section
            break

    if current_section is None:
        continue

    # Only read a table that belongs to this heading: if another h1/h4 comes
    # before the next table, that table is handled when the loop reaches the
    # later heading. Otherwise a table-less heading would file the following
    # section's table under the wrong (stale) section.
    following = heading.find_next(['h1', 'h4', 'table'])
    if following is None or following.name != 'table':
        continue

    for row in following.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            # Rows without a second <td> cell carry no site value.
            continue
        # The site is read from the second column of each row.
        site = cells[1].get_text(strip=True)
        # Keep only values that look like a bare domain: non-empty,
        # containing a dot and no spaces.
        if site and '.' in site and ' ' not in site:
            sites_by_section[current_section].append(site)

# Drop repeated entries from each list, keeping the first occurrence of every
# site in its original position (OrderedDict keys are unique and ordered).
unique_banned_sites, unique_scam_sites, unique_compromised_sites = (
    list(OrderedDict.fromkeys(sites))
    for sites in (banned_sites, scam_sites, compromised_sites)
)

# Build the Adblock-Plus-style metadata header written at the top of the
# output file. "Last modified" is stamped with today's local date.
# NOTE(review): the "Version" field is emitted empty, and "Homepage" points at
# the "denylist" repository while "This URL"/"License" point at "clickbait" --
# confirm whether both are intended.
header = "".join(
    (
        "[Adblock Plus 2.0]\n",
        "! Version: \n",
        "! Title: Bad Ammo Sites\n",
        f"! Last modified: {datetime.now():%Y-%m-%d}\n",
        "! Expires: 365 days (update frequency)\n",
        "! Homepage: https://github.com/bluesky509/denylist\n",
        "! This URL: https://raw.githubusercontent.com/bluesky509/clickbait/master/badammo.txt\n",
        "! License: https://github.com/bluesky509/clickbait/blob/master/LICENSE.txt\n",
    )
)

# Save the formatted list to a file in the same directory
output_path = 'badammo.txt'

# (comment title, sites) pairs in the order they are written to the file.
sections = (
    ("Banned with prejudice", unique_banned_sites),
    ("Scam Websites", unique_scam_sites),
    ("Compromised Websites", unique_compromised_sites),
)

# The encoding is given explicitly so the file contents do not depend on the
# platform's default locale encoding.
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(header)
    for title, sites in sections:
        # Each section starts with a blank line and a "! <title>" comment,
        # followed by one site per line.
        file.write(f"\n! {title}\n")
        file.writelines(f"{site}\n" for site in sites)

print(f"Blacklist saved to {output_path}")

Blacklist saved to badammo.txt
