In [None]:
# Loading libraries
import requests
import re
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
from tqdm import tqdm

# Download the upcoming-workshops page and keep the response body as a text string
url = 'https://carpentries.org/workshops/upcoming-workshops/'
upcoming_response = requests.get(url)
req = upcoming_response.text

In [None]:
# Remove every newline together with the whitespace around it, trim both ends,
# and preview the first 1000 characters of the result
newline_run = re.compile(r'\s*\n\s*')
cleaned_req = newline_run.sub('', req).strip()
print(cleaned_req[:1000])

In [None]:
# Build a parse tree from the cleaned HTML with Python's built-in 'html.parser'
soup = BeautifulSoup(cleaned_req, 'html.parser')

# Gather every <h3> element, report how many were found, then print each
# heading's text next to its zero-based position in the list
h3_by_tag = soup.find_all(name='h3')
h3_count = len(h3_by_tag)
print("Number of h3 elements found: ", h3_count)
for n, h3 in enumerate(h3_by_tag):
    print(f"Workshop #{n} - {h3.get_text()}")

In [None]:
# An alternative: select elements by their "class" attribute instead of the h3 tag
title_classes = "title text-base md:text-[1.75rem] leading-[2.125rem] font-semibold"
h3_by_class = soup.find_all(class_=title_classes)


In [None]:
# Take the first element matched by class, step up to its parent element,
# and print that parent as indented HTML
first_h3 = h3_by_class[0]
div_firsth3 = first_h3.parent
print(div_firsth3.prettify())

In [None]:
# Describe the first workshop: the host name and link are read from the <h3>
# inside the div, the remaining fields from the div's own data-* attributes
first_heading = div_firsth3.find('h3')
dict_workshop = {
    'host': first_heading.get_text(),
    'link': first_heading.find('a').get('href'),
    'curriculum': div_firsth3.get('data-curriculum'),
    'country': div_firsth3.get('data-country'),
    'format': div_firsth3.get('data-meeting'),
    'program': div_firsth3.get('data-program'),
}

In [None]:
# Select every <div> whose class attribute is exactly "p-8 mb-5 border"
divs = soup.find_all('div', class_="p-8 mb-5 border")

# Build one record per div: host/link from its <h3>, the rest from data-* attributes
workshop_list = []
for item in divs:
    heading = item.find('h3')
    dict_workshop = {
        'host': heading.get_text(),
        'link': heading.find('a').get('href'),
        'curriculum': item.get('data-curriculum'),
        'country': item.get('data-country'),
        'format': item.get('data-meeting'),
        'program': item.get('data-program'),
    }
    workshop_list.append(dict_workshop)

# One row per workshop, one column per record key
upcoming_workshops_df = pd.DataFrame(workshop_list)

In [None]:
# Download the past-workshops page and parse it with BeautifulSoup.
# NOTE(review): unlike the upcoming-workshops page, this HTML is parsed without
# the newline/whitespace collapse, so 'host' text may keep surrounding
# whitespace -- confirm whether that difference is intended.
url_past = 'https://carpentries.org/workshops/past-workshops/'
req_past = requests.get(url_past).text
soup_past = BeautifulSoup(req_past, 'html.parser')

# Select every <div> whose class attribute is exactly "p-8 mb-5 border"
divs_past = soup_past.find_all('div', class_="p-8 mb-5 border")

# Record key -> data-* attribute of the div that supplies its value
data_attributes = {
    'curriculum': 'data-curriculum',
    'country': 'data-country',
    'format': 'data-meeting',
    'program': 'data-program',
}

# Start from an empty list and add one record per workshop div
workshop_list = []
for item in divs_past:
    title = item.find('h3')
    dict_workshop = {
        'host': title.get_text(),
        'link': title.find('a').get('href'),
    }
    for field, attribute in data_attributes.items():
        dict_workshop[field] = item.get(attribute)
    workshop_list.append(dict_workshop)

# Turn the list of records into a DataFrame
pastworkshops_df = pd.DataFrame(workshop_list)

print('Total number of workshops in the table: ', pastworkshops_df.shape[0])

# Count workshops per country and show the five most frequent
top_countries = pastworkshops_df['country'].value_counts().head()
print('Top 5 of countries by number of workshops held: \n',
      top_countries)

In [None]:
from time import sleep
# Demonstrate sleep(): execution pauses for five seconds between the two prints
pause_seconds = 5
print('First')
sleep(pause_seconds)
print('Second')

In [None]:
# Take the link stored in row label 0 of the upcoming-workshops table and show it
first_url = upcoming_workshops_df.loc[0, 'link']
print("URL we are visiting: ", first_url)

# Download that page, collapse newlines plus surrounding whitespace, and parse it.
# Note: this rebinds req, cleaned_req and soup to the single-workshop page.
req = requests.get(first_url).text
cleaned_req = re.compile(r'\s*\n\s*').sub('', req).strip()
soup = BeautifulSoup(markup=cleaned_req, features='html.parser')

In [None]:
# .loc slices by label and includes the end label, so on the default integer
# index this selects the links in rows 0 through 5 (six rows when available)
urls = upcoming_workshops_df.loc[:5, 'link'].tolist()

In [None]:
def _meta_content(page, name):
    """Return the ``content`` attribute of ``<meta name=...>`` in ``page``.

    ``page.find`` returns None when no matching tag exists; in that case this
    returns None instead of raising AttributeError, so one page that lacks a
    tag does not abort the whole scraping loop.
    """
    tag = page.find('meta', attrs={'name': name})
    if tag is None:
        return None
    return tag.get('content')


# Names of the <meta> tags read from each workshop page; each becomes a column
meta_fields = ['startdate', 'enddate', 'language', 'latlng', 'instructor', 'helper']

list_of_workshops = []

for item in tqdm(urls):
    # Download the page, collapse newlines plus surrounding whitespace, and parse it
    req = requests.get(item).text
    cleaned_req = re.sub(r'\s*\n\s*', '', req).strip()
    soup = BeautifulSoup(cleaned_req, 'html.parser')

    # Start the record with the page URL, then add one entry per metadata field
    # (None when the page does not provide that <meta> tag)
    dict_w = {'link': item}
    for field in meta_fields:
        dict_w[field] = _meta_content(soup, field)

    list_of_workshops.append(dict_w)

    # Pause three seconds before requesting the next page
    sleep(3)

# One row per visited URL; missing metadata shows up as None
extradata_upcoming_df = pd.DataFrame(list_of_workshops)
    

In [None]:
# Request the first workshop page again, keeping the whole response object
# (not just its text) so the HTTP status code can be read and printed
response = requests.get(url=first_url)
status_code = response.status_code
print(status_code)

In [None]:
# Template for branching on the HTTP status code: 200 means the request succeeded.
# Each branch needs at least one statement -- a branch holding only comments is a
# syntax error -- so `pass` stands in until real logic is written.
if status_code == 200:
    # proceed with scraping
    pass
else:
    # handle or skip this URL
    pass