# <center> Web Scraping

In [1]:
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Creating a function to scrape the website data using BeautifulSoup
def extract_data(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    #print(str(soup))
    
    # Extract the title of the page
    title = soup.title.string if soup.title else "Title not found"
    
    # Extract contact number using a more general regex
    contact_pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    contact_match = re.search(contact_pattern, str(soup))
    contact_number = contact_match.group() if contact_match else "Contact number not found"
    
    # Extract the email using regex
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    email = re.search(email_pattern, str(soup))
    email = email.group() if email else "Email not found"
    
    # Extract the address
    address_pattern = r"\d+ [\w\W]+ [A-Z]{1}\d{1}[A-Z]{1} \d{1}[A-Z]{1}\d{1}"
    match = re.match(address_pattern, str(soup))
    address_elem = soup.find('p', class_='address')
    #print(address_elem)
    combined_address = address_elem.text.strip() if address_elem else (match.group() if match else "Address not found")
    #print(combined_address)
    
    if combined_address == "Address not found":
        
        try:
            script_tag = soup.find("script", type="application/ld+json")

            # Load the content of the script as JSON
            json_data = json.loads(script_tag.string)

            # Extract the address information
            address = json_data.get("address", {})
            address_items = list(address.items())
            address_items.pop(0)  # Remove the first key-value pair

            # Convert back to dictionary
            address = dict(address_items)
            combined_address = ' '.join(address.values())
            #print("++++++++++++++++++++++++++++++++++++++++", address)
        except:
            combined_address = 'Address not found'
    
    return {
        "Title": title,
        "Contact Number": contact_number,
        "Email": email,
        "Address": combined_address
    }


In [4]:
# Creating a list of all the college websites we need to scrape data from
url_list = [
            'https://www.lambtoncollege.ca',
            'https://www.senecapolytechnic.ca',
            'https://humber.ca',
            'https://alexandercollege.ca',
            'https://www.algonquincollege.com',
            'https://thecanadiancollege.ca',
            'https://www.conestogac.on.ca',
            'https://collegeboreal.ca',
            'https://ambrose.edu',
            'https://bowvalleycollege.ca',
            'https://www.nwpolytech.ca',
            'https://www.keyano.ca',
            'https://www.kingsu.ca',
            'https://lethpolytech.ca',
            'https://www.mhc.ab.ca'
            ]

In [5]:
# Making the DataFrame from each of the extracted website data
df = pd.DataFrame(columns=["Title", "Contact Number", "Email", "Address"])

for i in url_list:
    # Extract the data
    data = extract_data(i)

    new_row = {
        "Title": data["Title"],
        "Contact Number": data["Contact Number"],
        "Email": data["Email"],
        "Address": data["Address"]
    }
    
    df = df.append(new_row, ignore_index=True)

# The API was not able to scrape all the details from the websites as some websites have anti-scraping mechanisms in place that prevent data extraction

In [6]:
# Viewing the scraped data
df

Unnamed: 0,Title,Contact Number,Email,Address
0,Lambton College Home | Lambton College,519-542-7751,info@lambtoncollege.ca,1457 London Rd Sarnia ON N7S 6K4 CA
1,"Home - Seneca Polytechnic, Toronto, Canada",1 416.491.5050,Email not found,Address not found
2,Humber College - Humber Polytechnic,1727187009,Email not found,Address not found
3,Home – Alexander College,6044355815,info@alexandercollege.ca,"4805 Kingsway, Burnaby, BC, V5H 4T6Phone: 604-..."
4,Algonquin College,1703103158,Email not found,Address not found
5,Home - Canadian College for Higher Studies,1660013965,Email not found,Address not found
6,Your Ontario college for full and part-tim...,519-748-5220,Email not found,Kitchener ON N2G 4M4 299 Doon Valley Drive
7,Collège Boréal - L’éducation postsecondaire fr...,1-800-361-6673,liaison@collegeboreal.ca,Address not found
8,Ambrose University | Ambrose University,1 (403) 410-2000,Email not found,Address not found
9,Bow Valley College | Home,403-410-1400,info@bowvalleycollege.ca,Address not found
