# Web Scraping Using Beautiful Soup

### A.) Using Beautiful soup .findAll method webscraped the front page of Reddit. Get a list of all of the "timestamps"

In [37]:
pip install requests beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import Packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [3]:
user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246" 
,"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36 "
,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9 "
,"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1" 
,"Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"] 

headers = {
    'User-Agent': user_agent_list[2]}

In [4]:
URL = "https://www.cnn.com/markets/stocks/TSLA"

page = requests.get(URL, headers = headers)

In [5]:
page

<Response [200]>

In [5]:
print(type(page))

<class 'requests.models.Response'>


In [6]:
soup = BeautifulSoup(page.content, "html.parser")

In [10]:
print(soup.prettify())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('notebook', {
    'NotebookApp': {
        'iopub_data_rate_limit': 10000000,
        'rate_limit_window': 3.0
    }
})


{'NotebookApp': {'iopub_data_rate_limit': 10000000, 'rate_limit_window': 3.0}}

In [None]:
finding= soup.find_all('div')
finding

In [90]:
timestamps = soup.find_all('faceplate-timeago', attrs={'ts': True})
timestamps

[<faceplate-timeago class="whitespace-nowrap text-neutral-content-weak" format="short" ts="2024-04-23T18:59:07.984000+0000"></faceplate-timeago>,
 <faceplate-timeago class="whitespace-nowrap text-neutral-content-weak" format="short" ts="2024-04-23T11:18:16.278000+0000"></faceplate-timeago>,
 <faceplate-timeago class="whitespace-nowrap text-neutral-content-weak" format="short" ts="2024-04-23T11:16:43.383000+0000"></faceplate-timeago>]

In [91]:
#Extracting ts values

timestamp_list = [tag['ts'] for tag in timestamps]

timestamp_list

['2024-04-23T18:59:07.984000+0000',
 '2024-04-23T11:18:16.278000+0000',
 '2024-04-23T11:16:43.383000+0000']

### B.) Using the functions findChild, descendents, etc. located the post title, text and post time into a dataframe.

In [142]:
posts = soup.find_all('article')
posts

[<article aria-label="🛠️ Patch 01.000.300 ⚙️" class="w-full m-0">
 <shreddit-post author="Waelder" author-id="t2_kr3rj" class="block relative stickied theme-rpl cursor-pointer group bg-neutral-background focus-within:bg-neutral-background-hover hover:bg-neutral-background-hover xs:rounded-[16px] px-md py-2xs my-2xs nd:visible" comment-count="6012" content-href="https://www.reddit.com/r/Helldivers/comments/1cfuuwu/patch_01000300/" created-timestamp="2024-04-29T09:30:53.396000+0000" domain="self.Helldivers" feedindex="0" icon="https://styles.redditmedia.com/t5_cgnmm/styles/profileIcon_xm3h21s1peu71.png?width=64&amp;height=64&amp;frame=1&amp;auto=webp&amp;crop=64:64,smart&amp;s=93b2dd53ff49624b0e2c64079e1b58c628fe82a6" id="t3_1cfuuwu" is-desktop-viewport="" is-embeddable="" is-slim-card="" item-state="" moderation-verdict="" pdp-target="_self" permalink="/r/Helldivers/comments/1cfuuwu/patch_01000300/" post-title="🛠️ Patch 01.000.300 ⚙️" post-type="text" previous-actions-feature="" score="

In [146]:
# Initialize lists to store the extracted data
titles = []
texts = []
post_times = []

# Find all shreddit-post tags or whichever tag your posts are contained within
posts = soup.find_all('article')

for post in posts:
    title_tag = post.findChild('a', {'slot': 'title'})  
    if title_tag:
        titles.append(title_tag.text.strip())
    else:
        titles.append("No title found")

    # Descendants to get text, found in <p> tags
    text = ""
    for p_tag in post.descendants:
        if p_tag.name == 'p':
            text += p_tag.text.strip() + " "
    texts.append(text.strip() if text else "No text found")

    # Time in `faceplate-timeago`
    time_tag = post.findChild('faceplate-timeago')
    if time_tag:
        post_times.append(time_tag['ts'])
    else:
        post_times.append("No timestamp found")

# Create a DataFrame from the collected data
df = pd.DataFrame({
    'Title': titles,
    'Text': texts,
    'Post Time': post_times
})

df


Unnamed: 0,Title,Text,Post Time
0,🛠️ Patch 01.000.300 ⚙️,A subreddit dedicated to HELLDIVERS and HELLDI...,2024-04-29T09:30:53.396000+0000
1,"People above 30, what is something you regret ...",r/AskReddit is the place to ask and answer tho...,2024-04-29T13:07:00.349000+0000
2,A very real note passed to me by a customer at...,Home to the largest online community of foodse...,2024-04-29T01:48:29.215000+0000


# Using RegEx

### A.) Using RegEx, get all the urls of ladder faculty profiles for UCLA Economics

In [115]:
URL = "https://economics.ucla.edu/faculty/ladder"

In [131]:
#WITH REGEX (RE)

import requests
import re

URL = "https://economics.ucla.edu/faculty/ladder"

def fetch_faculty_profiles(URL):
    response = requests.get(URL, headers=headers)
    
    # Using RegEx
    links = re.findall(r'href="(https://economics.ucla.edu/person/[^"]+)"', response.text)
    
    # Using a set to remove duplicates
    unique_links = set(links)
    
    return list(unique_links)

faculty_profile_urls = fetch_faculty_profiles(URL)
faculty_profile_urls


['https://economics.ucla.edu/person/lee-e-ohanian/',
 'https://economics.ucla.edu/person/david-baqaee/',
 'https://economics.ucla.edu/person/michael-rubens/',
 'https://economics.ucla.edu/person/bernardo-s-silveira/',
 'https://economics.ucla.edu/person/michela-giorcelli/',
 'https://economics.ucla.edu/person/daniel-haanwinckel/',
 'https://economics.ucla.edu/person/maurizio-mazzocco/',
 'https://economics.ucla.edu/person/pablo-fajgelbaum/',
 'https://economics.ucla.edu/person/jay-lu/',
 'https://economics.ucla.edu/person/aaron-tornell/',
 'https://economics.ucla.edu/person/shuyang-sheng/',
 'https://economics.ucla.edu/person/kathleen-mcgarry/',
 'https://economics.ucla.edu/person/felipe-goncalves/',
 'https://economics.ucla.edu/person/adriana-lleras-muney/',
 'https://economics.ucla.edu/person/ariel-burstein/',
 'https://economics.ucla.edu/person/pierre-olivier-weill/',
 'https://economics.ucla.edu/person/simon-board/',
 'https://economics.ucla.edu/person/tomasz-sadzik/',
 'https://ec

### Webcrawl the links from A and use RegEx to get all the emails and phone numbers of ladder faculty profiles

In [134]:
import requests
from bs4 import BeautifulSoup
import re

# List of URLs
urls = ['https://economics.ucla.edu/person/lee-e-ohanian/',
 'https://economics.ucla.edu/person/david-baqaee/',
 'https://economics.ucla.edu/person/michael-rubens/',
 'https://economics.ucla.edu/person/bernardo-s-silveira/',
 'https://economics.ucla.edu/person/michela-giorcelli/',
 'https://economics.ucla.edu/person/daniel-haanwinckel/',
 'https://economics.ucla.edu/person/maurizio-mazzocco/',
 'https://economics.ucla.edu/person/pablo-fajgelbaum/',
 'https://economics.ucla.edu/person/jay-lu/',
 'https://economics.ucla.edu/person/aaron-tornell/',
 'https://economics.ucla.edu/person/shuyang-sheng/',
 'https://economics.ucla.edu/person/kathleen-mcgarry/',
 'https://economics.ucla.edu/person/felipe-goncalves/',
 'https://economics.ucla.edu/person/adriana-lleras-muney/',
 'https://economics.ucla.edu/person/ariel-burstein/',
 'https://economics.ucla.edu/person/pierre-olivier-weill/',
 'https://economics.ucla.edu/person/simon-board/',
 'https://economics.ucla.edu/person/tomasz-sadzik/',
 'https://economics.ucla.edu/person/alexander-bloedel/',
 'https://economics.ucla.edu/person/till-von-wachter/',
 'https://economics.ucla.edu/person/denis-chetverikov/',
 'https://economics.ucla.edu/person/jinyong-hahn/',
 'https://economics.ucla.edu/person/juliana-londono-velez/',
 'https://economics.ucla.edu/person/sule-ozler/',
 'https://economics.ucla.edu/person/joao-guerreiro/',
 'https://economics.ucla.edu/person/daniel-clark/',
 'https://economics.ucla.edu/person/zhipeng-liao/',
 'https://economics.ucla.edu/person/will-rafey/',
 'https://economics.ucla.edu/person/oleg-itskhoki/',
 'https://economics.ucla.edu/person/martha-bailey/',
 'https://economics.ucla.edu/person/hugo-hopenhayn/',
 'https://economics.ucla.edu/person/rodrigo-pinto/',
 'https://economics.ucla.edu/person/jonathan-vogel/',
 'https://economics.ucla.edu/person/john-asker/',
 'https://economics.ucla.edu/person/natalie-bau/',
 'https://economics.ucla.edu/person/ichiro-obara/',
 'https://economics.ucla.edu/person/martin-b-hackmann/',
 'https://economics.ucla.edu/person/dora-costa/',
 'https://economics.ucla.edu/person/andrew-atkeson/',
 'https://economics.ucla.edu/person/yotam-shem-tov/',
 'https://economics.ucla.edu/person/moritz-meyer-ter-vehn/',
 'https://economics.ucla.edu/person/rosa-liliana-matzkin/',
 'https://economics.ucla.edu/person/gary-d-hansen/',
 'https://economics.ucla.edu/person/saki-bigio/',
 'https://economics.ucla.edu/person/andres-santos/'
]

def extract_contact_info(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Regex patterns for email and phone numbers
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    phone_pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    
    # Searching text for emails and phone numbers
    text = soup.get_text()
    emails = re.findall(email_pattern, text)
    phones = re.findall(phone_pattern, text)
    
    return {'url': url, 'emails': emails, 'phones': phones}

# Map the function over the list of URLs
contact_info = [extract_contact_info(url) for url in urls]

# Print the extracted information
for info in contact_info:
    print(info)


{'url': 'https://economics.ucla.edu/person/lee-e-ohanian/', 'emails': ['ohanian@econ.ucla.edu'], 'phones': []}
{'url': 'https://economics.ucla.edu/person/david-baqaee/', 'emails': ['baqaee@econ.ucla.edu'], 'phones': []}
{'url': 'https://economics.ucla.edu/person/michael-rubens/', 'emails': [], 'phones': []}
{'url': 'https://economics.ucla.edu/person/bernardo-s-silveira/', 'emails': ['silveira@econ.ucla.edu'], 'phones': []}
{'url': 'https://economics.ucla.edu/person/michela-giorcelli/', 'emails': ['mgiorcelli@econ.ucla.edu'], 'phones': []}
{'url': 'https://economics.ucla.edu/person/daniel-haanwinckel/', 'emails': ['haanwinckel@econ.ucla.edu'], 'phones': []}
{'url': 'https://economics.ucla.edu/person/maurizio-mazzocco/', 'emails': ['mmazzocc@econ.ucla.edu'], 'phones': ['(310) 825-6682']}
{'url': 'https://economics.ucla.edu/person/pablo-fajgelbaum/', 'emails': ['pfajgelbaum@econ.ucla.edu'], 'phones': ['(310) 794-7241']}
{'url': 'https://economics.ucla.edu/person/jay-lu/', 'emails': ['jay@