# Web Scraping and Web Crawling Using Beautiful Soup

### A.) Web-scrape the front page of Reddit and get a list of all of the "timestamps"

In [37]:
pip install requests beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [22]:
# Import Packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [23]:
# Pool of browser User-Agent strings to present to the sites being scraped.
# Some entries end with a trailing space; they are kept exactly as written.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9 ",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
]

# Request headers sent with every call below; the third entry (index 2) is the one used.
chosen_user_agent = user_agent_list[2]
headers = {'User-Agent': chosen_user_agent}

In [24]:
URL = "https://www.reddit.com"

# A timeout stops the cell from hanging forever if the server never answers
# (requests waits indefinitely by default). raise_for_status() fails fast on a
# 4xx/5xx reply instead of letting the cells below parse an error page.
page = requests.get(URL, headers=headers, timeout=10)
page.raise_for_status()

In [138]:
# Display the response object; its repr includes the HTTP status code.
page

<Response [200]>

In [139]:
# Show the class of the object returned by requests.get().
print(type(page))

<class 'requests.models.Response'>


In [26]:
# Parse the raw response bytes with Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [27]:
# Print only the first 2,000 characters of the indented HTML: dumping the whole
# document floods the notebook output and buries the cells that follow.
print(soup.prettify()[:2000])

In [89]:
# Collect every <span> element in the parsed page and display the matches.
finding = soup.find_all(name="span")
finding

[<span class="flex items-center justify-center">
 <span class="flex items-center gap-xs">Skip to main content</span>
 </span>,
 <span class="flex items-center gap-xs">Skip to main content</span>,
 <span class="flex items-center justify-center">
 <span class="flex"><svg fill="currentColor" height="20" icon-name="menu-outline" rpl="" viewbox="0 0 20 20" width="20" xmlns="http://www.w3.org/2000/svg">
 <path d="M19 10.625H1v-1.25h18v1.25Zm0-7.875H1V4h18V2.75ZM19 16H1v1.25h18V16Z"></path>
 </svg></span>
 </span>,
 <span class="flex"><svg fill="currentColor" height="20" icon-name="menu-outline" rpl="" viewbox="0 0 20 20" width="20" xmlns="http://www.w3.org/2000/svg">
 <path d="M19 10.625H1v-1.25h18v1.25Zm0-7.875H1V4h18V2.75ZM19 16H1v1.25h18V16Z"></path>
 </svg></span>,
 <span>Open navigation</span>,
 <span class="pr-0 s:pr-xs flex items-center"><svg height="32" viewbox="0 0 216 216" width="32" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">

In [28]:
# Select the <faceplate-timeago> elements that carry a `ts` attribute
# (ts=True matches any tag where the attribute is present, whatever its value).
timestamps = soup.find_all("faceplate-timeago", ts=True)
timestamps

[<faceplate-timeago class="whitespace-nowrap text-neutral-content-weak" format="short" ts="2024-04-30T10:10:57.504000+0000"></faceplate-timeago>,
 <faceplate-timeago class="whitespace-nowrap text-neutral-content-weak" format="short" ts="2024-04-30T13:16:59.386000+0000"></faceplate-timeago>,
 <faceplate-timeago class="whitespace-nowrap text-neutral-content-weak" format="short" ts="2024-04-30T06:17:54.976000+0000"></faceplate-timeago>]

In [29]:
# Read the `ts` attribute value off each matched tag.
timestamp_list = [
    timeago_tag["ts"]
    for timeago_tag in timestamps
]

timestamp_list

['2024-04-30T10:10:57.504000+0000',
 '2024-04-30T13:16:59.386000+0000',
 '2024-04-30T06:17:54.976000+0000']

### B.) Using the functions `findChild`, `descendants`, etc., locate the post title, text, and post time and collect them into a DataFrame.

In [30]:
# Grab every <article> element; each one wraps a single post's markup.
posts = soup.find_all(name="article")
posts

[<article aria-label="do you guys really take at least one shower per day?" class="w-full m-0">
 <shreddit-post author="Top-Yoghurt-9416" author-id="t2_3uca90qx" class="block relative cursor-pointer group bg-neutral-background focus-within:bg-neutral-background-hover hover:bg-neutral-background-hover xs:rounded-[16px] px-md py-2xs my-2xs nd:visible" comment-count="6678" content-href="https://www.reddit.com/r/NoStupidQuestions/comments/1cgp9iy/do_you_guys_really_take_at_least_one_shower_per/" created-timestamp="2024-04-30T10:10:57.504000+0000" domain="self.NoStupidQuestions" feedindex="0" icon="https://www.redditstatic.com/avatars/defaults/v2/avatar_default_1.png" id="t3_1cgp9iy" is-desktop-viewport="" is-embeddable="" item-state="" moderation-verdict="" pdp-target="_self" permalink="/r/NoStupidQuestions/comments/1cgp9iy/do_you_guys_really_take_at_least_one_shower_per/" post-title="do you guys really take at least one shower per day?" post-type="text" previous-actions-feature="" score="

In [146]:
# Parallel lists holding one entry per post; they become the DataFrame columns.
titles = []
texts = []
post_times = []

# Each post is wrapped in an <article> element (the <shreddit-post> custom
# element sits inside it), so <article> is the unit we iterate over.
posts = soup.find_all('article')

for post in posts:
    # Title: the first <a slot="title"> below the article.
    title_tag = post.findChild('a', {'slot': 'title'})
    titles.append(title_tag.text.strip() if title_tag is not None else "No title found")

    # Text: walk all descendants and keep the text of every <p> tag.
    # Plain text nodes have no tag name, hence the getattr default.
    paragraphs = [
        node.text.strip()
        for node in post.descendants
        if getattr(node, 'name', None) == 'p'
    ]
    # Drop empty paragraphs so they neither add double spaces nor mask the
    # "No text found" fallback when every <p> is blank.
    text = " ".join(paragraph for paragraph in paragraphs if paragraph)
    texts.append(text or "No text found")

    # Time: the `ts` attribute of the first <faceplate-timeago> tag.
    # has_attr() avoids a KeyError if the tag exists but lacks `ts`.
    time_tag = post.findChild('faceplate-timeago')
    if time_tag is not None and time_tag.has_attr('ts'):
        post_times.append(time_tag['ts'])
    else:
        post_times.append("No timestamp found")

# Create a DataFrame from the collected data (one row per post).
df = pd.DataFrame({
    'Title': titles,
    'Text': texts,
    'Post Time': post_times
})

df


Unnamed: 0,Title,Text,Post Time
0,🛠️ Patch 01.000.300 ⚙️,A subreddit dedicated to HELLDIVERS and HELLDI...,2024-04-29T09:30:53.396000+0000
1,"People above 30, what is something you regret ...",r/AskReddit is the place to ask and answer tho...,2024-04-29T13:07:00.349000+0000
2,A very real note passed to me by a customer at...,Home to the largest online community of foodse...,2024-04-29T01:48:29.215000+0000


# Using RegEx

### A.) Using RegEx, get all the URLs of the ladder faculty profiles for UCLA Economics

In [115]:
# Listing page for the ladder faculty; note this rebinds URL, which earlier pointed at Reddit.
URL = "https://economics.ucla.edu/faculty/ladder"

In [131]:
#WITH REGEX (RE)

import requests
import re

# Same value as assigned in the previous cell.
URL = "https://economics.ucla.edu/faculty/ladder"

def fetch_faculty_profiles(URL, timeout=10):
    """Return the unique faculty profile URLs linked from a listing page.

    Args:
        URL: Address of the page to download and scan for profile links.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``https://economics.ucla.edu/person/...`` URLs found in
        double-quoted ``href`` attributes, de-duplicated and kept in the
        order in which they first appear in the page.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
    """
    # `headers` is the notebook-level request header dict defined earlier.
    response = requests.get(URL, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Using RegEx. The dots in the host name are escaped so that "." matches
    # only a literal dot rather than any character.
    links = re.findall(r'href="(https://economics\.ucla\.edu/person/[^"]+)"', response.text)

    # dict.fromkeys removes duplicates while preserving first-seen order, so the
    # result is the same on every run (a plain set has no defined order).
    return list(dict.fromkeys(links))

# Scrape the listing page at URL and display the profile links that were found.
faculty_profile_urls = fetch_faculty_profiles(URL)
faculty_profile_urls


['https://economics.ucla.edu/person/lee-e-ohanian/',
 'https://economics.ucla.edu/person/david-baqaee/',
 'https://economics.ucla.edu/person/michael-rubens/',
 'https://economics.ucla.edu/person/bernardo-s-silveira/',
 'https://economics.ucla.edu/person/michela-giorcelli/',
 'https://economics.ucla.edu/person/daniel-haanwinckel/',
 'https://economics.ucla.edu/person/maurizio-mazzocco/',
 'https://economics.ucla.edu/person/pablo-fajgelbaum/',
 'https://economics.ucla.edu/person/jay-lu/',
 'https://economics.ucla.edu/person/aaron-tornell/',
 'https://economics.ucla.edu/person/shuyang-sheng/',
 'https://economics.ucla.edu/person/kathleen-mcgarry/',
 'https://economics.ucla.edu/person/felipe-goncalves/',
 'https://economics.ucla.edu/person/adriana-lleras-muney/',
 'https://economics.ucla.edu/person/ariel-burstein/',
 'https://economics.ucla.edu/person/pierre-olivier-weill/',
 'https://economics.ucla.edu/person/simon-board/',
 'https://economics.ucla.edu/person/tomasz-sadzik/',
 'https://ec