# Import Packages

In [1]:
# Packages for part 1 (for sending requests)
import numpy as np # for the missing-value marker (np.nan) used while scraping
import pandas as pd # for general data processing
import requests # for scraping data from websites
from bs4 import BeautifulSoup # for converting scraped data into structured html

# Packages for part 2 (for data storage, not needed here)
# import shutil # High level operation on files (example copying and removing)
# import time # For timing and measuring progress of download
# import numpy as np # For rounding digits
# import datetime #For measuring time
# import pytz #For defining timezone
# import os # to examine local directory
# import zipfile # for unzipping files

# Finding Events URL from Main Page URL

In [100]:
# Define the main page URL; the later cells request this page and search its
# links for one that leads to the events page
url = "https://typebooks.ca"

In [101]:
# Send a GET request to the main page and keep the response.
# A timeout is passed explicitly: without one, `requests` can wait indefinitely
# on an unresponsive server, which would hang the notebook on Run All.
response = requests.get(
    url = url,
    auth = None,
    params = None,
    timeout = 30 # seconds
)

In [102]:
# Inspect the outcome of the request: 2XX is OK, 4XX is an error
status_line = f"Response status code: {response.status_code}, status: {response.reason}"
print(status_line)

# Alternative check through the response's `ok` flag
print(response.ok)

Response status code: 200, status: OK
True


In [104]:
# Show which class the object returned by requests.get() belongs to
type(response)

requests.models.Response

In [103]:
# Preview the first 300 characters of the response body
response.text[:300]

'<!doctype html>\n<html class="supports-no-js" lang="en">\n<head>\n  <meta charset="utf-8">\n  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n  <meta name="viewport" content="width=device-width,initial-scale=1">\n  <meta name="theme-color" content="">\n  <link rel="canonical" href="https://'

In [105]:
# Parse the response body into a structured HTML tree.
# `response.text` is the page as one unorganized string;
# 'html.parser' names the parser BeautifulSoup uses to organize it.
soup = BeautifulSoup(markup = response.text, features = 'html.parser')

In [106]:
# Show which class the parsed document belongs to
type(soup)

bs4.BeautifulSoup

In [107]:
# Collect every `a` (anchor) element; the `href` attribute of each is read
# in a later cell to obtain the URLs the page links to
url_tags = soup.find_all(name = 'a')

In [110]:
# Report how many anchor tags were found, then preview the first five
tag_count = len(url_tags)
print(f"Number of URL tags:{tag_count}")
print("\n") # one call emitting the same two blank lines as two empty prints
print(url_tags[:5])

Number of URL tags:185


[<a class="in-page-link visually-hidden skip-link" href="#MainContent">Skip to content</a>, <a class="medium-up--hide" href="/search">
<svg aria-hidden="true" class="icon icon-search" focusable="false" role="presentation" viewbox="0 0 20 20"><path d="M18.64 17.02l-5.31-5.31c.81-1.08 1.26-2.43 1.26-3.87C14.5 4.06 11.44 1 7.75 1S1 4.06 1 7.75s3.06 6.75 6.75 6.75c1.44 0 2.79-.45 3.87-1.26l5.31 5.31c.45.45 1.26.54 1.71.09.45-.36.45-1.17 0-1.62zM3.25 7.75c0-2.52 1.98-4.5 4.5-4.5s4.5 1.98 4.5 4.5-1.98 4.5-4.5 4.5-4.5-1.98-4.5-4.5z" fill="#444"></path></svg>
</a>, <a href="/account/login" id="customer_login_link">Log in</a>, <a href="/account/register" id="customer_register_link">Sign up</a>, <a class="site-header__cart" href="/cart">
<svg aria-hidden="true" class="icon icon-cart" focusable="false" role="presentation" viewbox="0 0 20 20"><path d="M18.936 5.564c-.144-.175-.35-.207-.55-.207h-.003L6.774 4.286c-.272 0-.417.089-.491.18-.079.096-.16.263-.094.585l2.016 5.705

In [111]:
# Build the list of extracted urls by reading the `href` attribute
# of every anchor tag with the tag's get method
url_list = [anchor.get('href') for anchor in url_tags]

In [112]:
# Visual examination of the urls extracted from tags
print("Extracted urls from tags:")

# Visually examine first 5 urls extracted from tags.
# The loop variable is deliberately NOT called `url`: that name holds the main
# page URL defined earlier, and reusing it here would overwrite it, so
# re-running the request cell afterwards would fetch the wrong address.
total_urls = len(url_list)
for position, extracted_url in enumerate(url_list[:5], start = 1):
    print(f"URL {position} / {total_urls} : {extracted_url}")

Extracted urls from tags:
URL 1 / 185 : #MainContent
URL 2 / 185 : /search
URL 3 / 185 : /account/login
URL 4 / 185 : /account/register
URL 5 / 185 : /cart


In [114]:
# Wrap the extracted urls in a single-column DataFrame so that the pandas
# str.contains() method can be applied to them
url_df = pd.DataFrame(data = url_list, columns = ['extracted_url'])

# Preview the first rows of the extracted urls
display(url_df.head())

Unnamed: 0,extracted_url
0,#MainContent
1,/search
2,/account/login
3,/account/register
4,/cart


In [125]:
# Keywords used to recognise urls that lead to an events page
keywords = [
    "event",
    "calendar",
    "program",
    "series",
    "upcoming",
]

# Combine the keywords into one regex alternation (keyword1|keyword2|...)
search_term = "|".join(keywords)
search_term

'event|calendar|program|series|upcoming'

In [126]:
# Build a boolean mask marking the urls that contain any of the keywords.
# The urls are lower-cased first so the match is case-insensitive.
# `na = False` makes a missing url count as "no match"; without it the mask
# would contain NaN for such rows and could not be used for boolean indexing.
cond1 = (
    url_df['extracted_url']
    .str.lower()
    .str.contains(search_term, na = False)
)

In [127]:
# Show only the rows flagged by the keyword mask (all columns)
url_df.loc[cond1, :]

Unnamed: 0,extracted_url
25,/pages/%F0%9F%92%8C-events


## Scraping from Events Page (Shopify)

In [43]:
# Define base URL
base_url = "https://typebooks.ca"

# Links that matched the events keywords
event_links = url_df.loc[cond1, "extracted_url"]

# Fail with a readable message when the keyword filter matched nothing
# (IndexError is what indexing the empty result would have raised anyway)
if event_links.empty:
    raise IndexError(f"No link on {base_url} matched the events keywords")

# Define events URL from the first match
# (assumes the extracted link is a site-relative path starting with "/")
events_url = base_url + event_links.iloc[0]
print(events_url)

https://typebooks.ca/pages/%F0%9F%92%8C-events


In [44]:
# Send a new request for events_url and parse the page.
# The timeout keeps an unresponsive server from hanging the notebook.
with requests.get(url = events_url, timeout = 30) as response:
    # Stop on a 4XX/5XX answer instead of silently parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

The element inspector in Google Chrome was used to find the elements that contain the calendar information on this Shopify-style page.

In [52]:
# Locate the Shopify section that holds the page content,
# then collect every paragraph inside it
events_section = soup.find("section", id = "shopify-section-tapita-main-page")
tags = events_section.find_all("p")

# Examine the second paragraph tag
print(tags[1])

<p data-mce-fragment="1" data-mce-style="text-align: center;" style="text-align: center;"><span data-mce-fragment="1" data-mce-style="font-weight: 400;" style="font-weight: 400;">📚Tuesday, October 10 <a href="https://www.instagram.com/p/Cxf52cSLoW-/" target="_blank">BOOK LAUNCH</a>: <em>Mudflowers</em> by Aley Waterman // 7 PM at Type Queen</span><span data-mce-fragment="1" data-mce-style="font-weight: 400;" style="font-weight: 400;"><br data-mce-fragment="1"/></span></p>


In [128]:
# Define information to collect from scraping.
# NOTE(review): the name `dict` shadows the Python builtin; it is kept only
# because the following cell reads the data under this name. Consider renaming
# it in both cells together.
dict = {
    "date":[],
    "link":[],
    "title":[],
    "text":[]
}

# Walk the tags once and append exactly one value per field for every tag, so
# the four lists always have the same length. When the element a field is read
# from is absent, np.nan is stored so the row can be removed later with dropna().
# Explicit `is None` checks replace the former bare `except:` blocks, which
# would also have hidden unrelated errors.
for tag in tags:
    anchor = tag.find("a")    # link element; the date is the node just before it
    emphasis = tag.find("em") # title element; the remaining text is the node just after it

    if anchor is None:
        dict["date"].append(np.nan)
        dict["link"].append(np.nan)
    else:
        dict["date"].append(anchor.previous_sibling)
        dict["link"].append(anchor.get("href"))

    if emphasis is None:
        dict["title"].append(np.nan)
        dict["text"].append(np.nan)
    else:
        dict["title"].append(emphasis.text)
        dict["text"].append(emphasis.next_sibling)

In [131]:
# Assemble the scraped fields into a table and discard every row with a missing value
pd.DataFrame(data = dict).dropna(axis = 0, how = "any")

Unnamed: 0,date,link,title,text
1,"📚Tuesday, October 10",https://www.instagram.com/p/Cxf52cSLoW-/,Mudflowers,by Aley Waterman // 7 PM at Type Queen
2,"📚Wednesday, October 11",https://www.instagram.com/p/CxnoqeXuhD2/,Furniture Music,by Gail Scott // 7 PM at Type Queen
3,"📚Wednesday, October 18",https://www.instagram.com/p/CxvSFh-uqQM/,Burn Diary,by Joshua Chris Bouchard // 7 PM at Type Queen
6,"📚Tuesday, October 24",https://www.instagram.com/p/CxYGrLwOChF/,Waiting for Tomorrow,by with Susan Yoon // 7 PM at Type Queen
7,"📚Wednesday, October 25",https://www.instagram.com/typebooks/?img_index=1,There is No Blue,by Martha Baillie // 7 PM at Type Queen
