In [1]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import regex as re
from tqdm import tqdm
import pandas as pd
import time 
from dotenv import load_dotenv
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from urllib3.exceptions import InsecureRequestWarning
import warnings

load_dotenv()
warnings.simplefilter(action='ignore', category=(FutureWarning, DeprecationWarning, InsecureRequestWarning))

In [2]:
def mount_session():
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
        (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.163 Chrome/80.0.3987.163 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,pt;q=0.7'
    }

    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    return session, headers

def fix_base_prefix(url):
    prefix = 'https://exrx.net/Lists/'
    if url.startswith(prefix):
        return url
    return prefix + url

def fix_index_prefix(url):
    prefix = 'https://exrx.net/'
    if not url.startswith(prefix):
        url = url.replace("../../", prefix)
    return url

def scrape_base(url):
    def filter(to_filter):
        pattern = re.compile(r'.*ExList\/[A-Za-z]+\#*[A-Za-z]+')

        if re.search(pattern, str(to_filter)):
            return True 
        return False
    
    session, headers = mount_session()
    
    page = session.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(page.content, 'html.parser')

    links = []
    for link in soup.find_all('a'):
        if filter(link):
            links.append(link.get('href'))

    return links

def scrape_instance(url):
    def filter(to_filter):
        pattern = re.compile(r'.*WeightExercises\/[A-Za-z]+\/[A-Za-z]+')

        if re.search(pattern, str(to_filter)):
            return True 
        return False
    
    session, headers = mount_session()

    links = []
    try:
        page = session.get(url, headers=headers, verify=False)
        soup = BeautifulSoup(page.content, 'html.parser')

        for link in soup.find_all('a'):
            if filter(link):
                links.append(link.get('href'))
    except requests.exceptions.ConnectionError:
        time.sleep(3)
    
    return links

In [3]:
base_url = 'https://exrx.net/Lists/Directory'
links = scrape_base(base_url)
standardized_links = [fix_base_prefix(link) for link in links]

In [4]:
total_indexes = []
with tqdm(total=len(standardized_links)) as pbar:
    for link in standardized_links:
        total_indexes.extend(scrape_instance(link))
        pbar.update(1)
total_indexes = list(set(total_indexes))

  0%|          | 0/55 [00:00<?, ?it/s]

100%|██████████| 55/55 [00:09<00:00,  5.56it/s]


In [5]:
standardized_indexes = [fix_index_prefix(link) for link in total_indexes]

In [6]:
def account_login(url, driver):
    uName = os.getenv("uName")
    uPassword = os.getenv("uPassword")

    driver.get(url)
    driver.find_element("name", "uName").send_keys(uName)
    driver.find_element("name", "uPassword").send_keys(uPassword)
    driver.find_element("name", "uPassword").send_keys(Keys.RETURN)

def mount_driver():
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument(
        '--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
        (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.163 Chrome/80.0.3987.163 Safari/537.36')
    options.add_argument('user-data-dir=C:/Users/bhava/Documents/GitHub/Workout-Planner/profile')
    driver = webdriver.Chrome(options=options)

    account_login('https://exrx.net/login', driver)

    return driver

In [30]:
df = pd.DataFrame(columns=["exercise", "utility", "mechanics", "force", "target_muscles", "synergist_muscles", "stabilizer_muscles", "dynamic_stabilizer_muscles", "antagonist_stabilizer_muscles"])

def remove_and_replace(muscles):
    if len(muscles) in [0, 1]:
        return muscles

    cleaned_muscles = []
    reversed_muscles = muscles[::-1]

    embedded = reversed_muscles[0]
    for i in range(len(reversed_muscles)):
        if i == 0:
            cleaned_muscles.append(reversed_muscles[i])
            continue

        cleaned_muscles.append(reversed_muscles[i].replace(embedded, ""))
        embedded = reversed_muscles[i]
    return cleaned_muscles[::-1]

def extract_muscles(soup, section_name):
    muscle_ul = soup.find("strong", text=section_name)
    muscles = []

    try:
        muscle_ul = muscle_ul.next_element.next_element
        for i in muscle_ul.find_all("li"):
            muscles.append(i.text)
    except (AttributeError, TypeError):
        muscles.append(muscle_ul.text if muscle_ul else '')
    muscles = remove_and_replace(muscles)
    return muscles

def extract_exercise(soup):
    try:
        exercise = soup.find("h1").text
    except AttributeError:
        exercise = ""
    return exercise

def extract_classifications(soup):
    classifications = []

    try:
        table = soup.find("table")
        rows = table.find_all("tr")

        for row in rows:
            cells = row.find_all("td")
            for cell in cells:
                classifications.append(cell.text.replace(":", ""))
        classifications = remove_and_replace(classifications)
    except AttributeError:
        classifications = ["", "", "", "", "", ""]
    return classifications[:6]

def scrape_info(link, driver):
    session, headers = mount_session()
    page = session.get(link, headers=headers, verify=False)

    soup = BeautifulSoup(page.content, "html.parser")
    title = extract_exercise(soup)
    if not title:
        return None
    
    if title == "Premium Content":
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        title = extract_exercise(soup)

    classifications = extract_classifications(soup)

    target_muscles = extract_muscles(soup, "Target")
    synergist_muscles = extract_muscles(soup, "Synergists")
    stabilizer_muscles = extract_muscles(soup, "Stabilizers")
    dynamic_stabilizer_muscles = extract_muscles(soup, "Dynamic Stabilizers")
    antagonist_stabilizer_muscles = extract_muscles(soup, "Antagonist Stabilizers")

    data = {
        "exercise": title,
        "utility": classifications[1],
        "mechanics": classifications[3],
        "force": classifications[5],
        "target_muscles": target_muscles,
        "synergist_muscles": synergist_muscles,
        "stabilizer_muscles": stabilizer_muscles,
        "dynamic_stabilizer_muscles": dynamic_stabilizer_muscles,
        "antagonist_stabilizer_muscles": antagonist_stabilizer_muscles
    }

    return data

In [34]:
data_list = []
driver = mount_driver()
with tqdm(total=len(standardized_indexes)) as pbar:
    for link in standardized_indexes:
        data = scrape_info(link, driver)
        if data:
            data_list.append(data)
        pbar.update(1)
driver.quit()

100%|██████████| 1467/1467 [1:35:43<00:00,  3.91s/it]


In [35]:
df = pd.DataFrame(data_list)
df.to_csv("exrx.csv", index=False)
df.head(n=10)

Unnamed: 0,exercise,utility,mechanics,force,target_muscles,synergist_muscles,stabilizer_muscles,dynamic_stabilizer_muscles,antagonist_stabilizer_muscles
0,Barbell Side Lunge,Auxiliary,Compound,Push,[Gluteus Maximus],"[Quadriceps, Adductor Magnus (lead leg), Adduc...",[Erector Spinae],"[Hamstrings, Gastrocnemius]",[]
1,Dumbbell Decline Bench Press,Basic or Auxiliary,Compound,Push,"[Pectoralis Major, Sternal]","[Pectoralis Major, Clavicular, Deltoid, Anteri...",[],[Dynamic Stabilizers],[]
2,Lever Incline Bench Press,Basic or Auxiliary,Compound,Push,"[Pectoralis Major, Clavicular]","[Pectoralis Major, Sternal, Deltoid, Anterior,...",[],"[Biceps Brachii, Short Head]",[]
3,Cable Incline Chest Press,Basic or Auxiliary,Compound,Push,"[Pectoralis Major, Clavicular]","[Pectoralis Major, Sternal, Deltoid, Anterior,...",[],[Dynamic Stabilizers],[]
4,Cable One Arm Split Squat,Auxiliary,Compound,Push,[Quadriceps],"[Gluteus Maximus, Adductor Magnus, Soleus]",[Erector Spinae],[Dynamic Stabilizers],[]
5,Incline Twisting Crunch,Auxiliary,Isolated,Pull,[Obliques],"[Rectus Abdominis, Psoas Major]",[Tibialis Anterior],[],[]
6,Barbell Rear Delt Row,Basic or Auxiliary,Compound,Pull,"[Deltoid, Posterior]","[Infraspinatus, Teres Minor, Deltoid, Lateral,...",[Erector Spinae],[Dynamic Stabilizers],"[Rectus Abdominis, Obliques]"
7,Lever Deadlift (plate loaded),Basic,Compound,Pull,[Target],"[Gluteus Maximus, Adductor Magnus, Quadriceps,...","[Trapezius, Middle]","[Hamstrings, Gastrocnemius]","[Rectus Abdominis, Obliques]"
8,Dumbbell Incline Curl,Basic or Auxiliary,Isolated,Pull,[Biceps Brachii],"[Brachialis, Brachioradialis]","[Deltoid, Anterior]",[],[]
9,Cable Lying Row,Basic or Auxiliary,Compound,Pull,"[Back, General]","[Trapezius, Middle, Trapezius, Lower, Rhomboid...",[],[Dynamic Stabilizers],[]
