# AAI614: Data Science & its Applications

*Notebook 3.1: Practice with Data Collections*

<a href="https://colab.research.google.com/github/harmanani/AAI614/blob/main/Week%203/Notebook3.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Source: Scraping with Python http://shop.oreilly.com/product/0636920034391.do

In [11]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Configure SSL context to ignore certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

# Function to scrape a webpage and print different types of data
def scrape_wikipedia_page(url):
    try:
        # Open the URL
        html = urlopen(url)
        # Parse the HTML using BeautifulSoup
        bs = BeautifulSoup(html, 'html.parser')

        # Extract all links (anchor tags) with an 'href' attribute
        print("Extracting all hyperlinks:")
        for link in bs.find_all('a'):
            if 'href' in link.attrs:
                href = link.attrs['href']
                # Check if it's a Wikipedia link (internal link)
                if href.startswith('/wiki/'):
                    print(f"Internal link: https://en.wikipedia.org{href}")
                else:
                    print(f"External link: {href}")

        # Extract all image URLs
        print("\nExtracting all image sources:")
        for img in bs.find_all('img'):
            if 'src' in img.attrs:
                print(img.attrs['src'])

        # Extract all paragraph texts
        print("\nExtracting all paragraph texts:")
        for paragraph in bs.find_all('p'):
            print(paragraph.get_text())

        # Extract specific content using CSS selectors (e.g., headers)
        print("\nExtracting all headings (h1, h2, h3):")
        for header in bs.select('h1, h2, h3'):
            print(header.get_text())

    except Exception as e:
        # Print an error message if something goes wrong
        print(f"An error occurred: {e}")

# URL to scrape
url = 'http://en.wikipedia.org/wiki/Kevin_Bacon'

# Call the scraping function with the specified URL
scrape_wikipedia_page(url)


Extracting all hyperlinks:
External link: #main-content
External link: http://www.sfda.gov.sa/en
External link: #
External link: /en/overview
External link: /en/authority-establishment
External link: /en/authority-membership
External link: /en/authority-activities
External link: /en/authority-management
External link: /en/board-of-directors
External link: /en/sfda-in-vision
External link: /en/sfda-ntp
External link: /en/sfda-hstp
External link: /en/sfda-nidlp
External link: /en/vision-and-mission
External link: /en/authority-values
External link: /en/annual-report
External link: /en/career-vision-and-mission
External link: /en/career-general-goals
External link: #
External link: /en/lists-categories?tags=All
External link: /en/lists-categories?tags=46
External link: /en/lists-categories?tags=1
External link: /en/lists-categories?tags=2
External link: /en/lists-categories?tags=3
External link: /en/lists-categories?tags=4
External link: /en/lists-categories?tags=6
External link: /en/list

## Retrieving Articles Only

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    print(link.attrs['href'])

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    print(link.attrs['href'])

## Random Walk

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now().strftime('%s'))
def getLinks(articleUrl):
    html = urlopen(f'http://en.wikipedia.org{articleUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

## Recursively crawling an entire site

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

## Collecting Data Across an Entire Site

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        #mw-parser-output
        bodyContent = bs.find('div', {'id':'bodyContent'}).find_all('p')
        if len(bodyContent):
            print(bodyContent[0])
        print(bs.find(id='ca-edit').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')

    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('/wiki/General-purpose_programming_language')

## Crawling across the Internet

In [None]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, url):
    netloc = urlparse(url).netloc
    scheme = urlparse(url).scheme
    internalLinks = set()
    for link in bs.find_all('a'):
        if not link.attrs.get('href'):
            continue
        parsed = urlparse(link.attrs['href'])
        if parsed.netloc == '':
            internalLinks.add(f'{scheme}://{netloc}/{link.attrs["href"].strip("/")}')
        elif parsed.netloc == netloc:
            internalLinks.add(link.attrs['href'])
    return list(internalLinks)

#Retrieves a list of all external links found on a page
def getExternalLinks(bs, url):
    netloc = urlparse(url).netloc
    externalLinks = set()
    for link in bs.find_all('a'):
        if not link.attrs.get('href'):
            continue
        parsed = urlparse(link.attrs['href'])
        if parsed.netloc != '' and parsed.netloc != netloc:
            externalLinks.add(link.attrs['href'])
    return list(externalLinks)

def getRandomExternalLink(startingPage):
    bs = BeautifulSoup(urlopen(startingPage), 'html.parser')
    externalLinks = getExternalLinks(bs, startingPage)
    if not len(externalLinks):
        print('No external links, looking around the site for one')
        internalLinks = getInternalLinks(bs, startingPage)
        return getRandomExternalLink(random.choice(internalLinks))
    else:
        return random.choice(externalLinks)

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print(f'Random external link is: {externalLink}')
    followExternalOnly(externalLink)


followExternalOnly('https://www.oreilly.com/')


## Collect all External Links from a Site

In [None]:
# Collects a list of all external URLs found on the site
allExtLinks = []
allIntLinks = []


def getAllExternalLinks(url):
    bs = BeautifulSoup(urlopen(url), 'html.parser')
    internalLinks = getInternalLinks(bs, url)
    externalLinks = getExternalLinks(bs, url)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.append(link)
            print(link)

    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.append(link)
            getAllExternalLinks(link)


allIntLinks.append('https://oreilly.com')
getAllExternalLinks('https://www.oreilly.com/')