<a href="https://colab.research.google.com/github/Zoroaster-BGAE/Web-Scrapping-for-CFC/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Clean One

import requests
import json
from collections import defaultdict
from bs4 import BeautifulSoup

from urllib.parse import urljoin

# Scrape index webpage
url = 'https://www.cfcunderwriting.com'
# The timeout stops the cell hanging on a stalled connection, and
# raise_for_status() keeps an HTTP error page from being parsed as the site.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Find all externally loaded resources: every absolute src/href value that
# does not point back at the site itself.
soup = BeautifulSoup(response.text, 'lxml')
externals = set()
for tag in soup.find_all():
    src = tag.get('src')
    href = tag.get('href')
    if src and src.startswith('http') and not src.startswith(url):
        externals.add(src)
    if href and href.startswith('http') and not href.startswith(url):
        externals.add(href)

# Write list of externals to a JSON file. Sorting gives the file a stable
# order; iterating the set directly would reorder it from run to run.
with open('externals.json', 'w') as f1:
    json.dump(sorted(externals), f1)

# Find the URL of the privacy policy page. Only anchors that actually carry
# an href are considered, and surrounding whitespace in the link text is
# ignored so '  Privacy Policy ' still matches.
privacy_links = [
    a['href']
    for a in soup.find_all('a', href=True)
    if a.text.strip().lower() == 'privacy policy'
]
if not privacy_links:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(f'No "Privacy Policy" link found on {url}')

# Correction 1: make the link absolute. urljoin resolves relative,
# root-relative and protocol-relative hrefs against the index URL and leaves
# already-absolute URLs untouched. When several links match, the last one on
# the page is used.
privacy_policy_url = urljoin(url, privacy_links[-1])

# Scrape the privacy policy page
response_pp = requests.get(privacy_policy_url, timeout=30)
response_pp.raise_for_status()
# Name the parser explicitly (the same one used for the index page) rather
# than the generic 'html' feature, whose choice depends on what is installed.
soup_pp = BeautifulSoup(response_pp.text, 'lxml')

# Create a case-insensitive word frequency count over every text node.
# `string=True` is the current spelling of the older `text=True` filter.
word_counts = defaultdict(int)
for text in soup_pp.find_all(string=True):
    for word in text.split():
        word_counts[word.lower()] += 1

# Write the word frequency count to a JSON file
with open('word_counts.json', 'w') as f2:
    json.dump(word_counts, f2)


This code first retrieves the index webpage and then uses BeautifulSoup to parse the HTML. It then looks for all tags that have a src or href attribute, and adds any external URLs to a set. It then writes this set to a JSON file.


Next, the code searches the index page for a hyperlink whose text is "Privacy Policy" and reads that link's target URL, prefixing the site's address when the link is relative. It then retrieves the privacy policy page, parses its HTML with BeautifulSoup, builds a case-insensitive word frequency count from the page text, and writes that count to a JSON file.

In [None]:
#Ver 2

import requests
import re
import json
import collections
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# Host names that count as "the site itself" when classifying resources.
site_hosts = ('cfcunderwriting.com', 'www.cfcunderwriting.com')


def _is_external(link):
    """Return True when *link* names a host other than the site's own.

    Relative paths and non-network URLs (no host part) are treated as
    internal, since they cannot point to another server.
    """
    host = urlparse(link).hostname
    return host is not None and host not in site_hosts


# Scrape the index webpage hosted at cfcunderwriting.com
r = requests.get('http://cfcunderwriting.com', timeout=30)
r.raise_for_status()
# The URL the page was finally served from (after any redirects); relative
# links found on the page are resolved against it.
index_url = r.url

# Parse the HTML content
soup = BeautifulSoup(r.text, 'html.parser')

# Find all external resources (images, scripts, and fonts)
external_resources = []
for tag in soup.find_all():
    if tag.name in ['img', 'script', 'link']:
        src = tag.get('src')
        href = tag.get('href')
        if src and _is_external(src):
            external_resources.append(src)
        if href and _is_external(href):
            external_resources.append(href)

# Write the list of external resources to a JSON file
with open('external_resources.json', 'w') as f:
    json.dump(external_resources, f)

# Find the "Privacy Policy" page. Anchors without an href are skipped, and
# whitespace around the link text is ignored.
privacy_policy_url = None
for a in soup.find_all('a', href=True):
    if a.text.strip().lower() == 'privacy policy':
        # Correction 1: make the link absolute. urljoin leaves absolute URLs
        # alone and resolves relative ones against the index page's URL.
        privacy_policy_url = urljoin(index_url, a['href'])

# Scrape the "Privacy Policy" page (skipped when no such link was found)
if privacy_policy_url:
    r = requests.get(privacy_policy_url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract the visible text from the page
    text = soup.get_text()

    # Remove non-alphabetic characters and split the text into words.
    # Lower-casing first makes the count case-insensitive, so "The" and
    # "the" are tallied together.
    words = re.sub(r'[^a-zA-Z\s]', '', text.lower()).split()

    # Create a word frequency count
    word_count = collections.Counter(words)

    # Write the word frequency count to a JSON file
    with open('word_count.json', 'w') as f:
        json.dump(word_count, f)

This program uses the requests library to make HTTP requests to the website, the re library to remove non-alphabetic characters from the text, and the collections library to create a word frequency count. It also uses the BeautifulSoup library to parse the HTML content of the web pages.

In [None]:

# Re-parse the index page fetched earlier and collect every absolute src/href
# value that does not start with the site's own URL.
soup = BeautifulSoup(response.text, 'lxml')
externals = {
    link
    for element in soup.find_all()
    for link in (element.get('src'), element.get('href'))
    if link and link.startswith('http') and not link.startswith(url)
}

# Show how many distinct external URLs were found, followed by the URLs.
print(len(externals), externals)

15 {'https://fonts.googleapis.com/css?family=Montserrat:300,400,500,600,700', 'https://www.googletagmanager.com/ns.html?id=GTM-NGGN5FB', 'https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js', 'https://twitter.com/cfcunderwriting', 'https://ico.org.uk/your-data-matters/raising-concerns/', 'https://www.linkedin.com/company/cfc-underwriting-ltd/', 'https://www.cfcunderwriting.eu/', 'https://www.youtube.com/user/CFCUnderwriting', 'https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.1.3/css/bootstrap.min.css', 'https://www.google.com/recaptcha/api.js?render=6LemiyEaAAAAAGwb4nR8oX38fxyM36xjIGbwz6d4', 'https://support.google.com/analytics/answer/4597324?hl=en', 'https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.1.3/js/bootstrap.min.js', 'https://www.instagram.com/cfc_underwriting/', 'https://www.google.com/recaptcha/about/', 'https://www.facebook.com/cfcspecialistinsurance'}
