In [9]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import regex as re
from tqdm import tqdm
import pandas as pd
import time 
import pickle as pkl
import os
from urllib3.exceptions import InsecureRequestWarning
import warnings

# Register one "ignore" filter per category. The documented signature of
# warnings.simplefilter takes a single Warning subclass; handing it a tuple
# relies on undocumented behaviour of the filter lookup.
for _category in (FutureWarning, DeprecationWarning, InsecureRequestWarning):
    warnings.simplefilter(action='ignore', category=_category)

In [2]:
def fix_base_prefix(url):
    """Return ``url`` anchored under the archived exrx.net ``Lists/`` directory.

    A URL that already begins with the archive prefix is returned unchanged;
    anything else gets the prefix prepended as-is.
    """
    archive_root = 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/'
    return url if url.startswith(archive_root) else archive_root + url

def fix_index_prefix(url):
    """Return ``url`` as an absolute URL on ``https://web.archive.org/``.

    Args:
        url: An href that is either already absolute on web.archive.org or
            relative to the site root (with or without a leading slash).

    Returns:
        str: ``url`` unchanged when it already starts with the archive
        origin, otherwise the origin followed by ``url`` without its leading
        slashes.
    """
    prefix = 'https://web.archive.org/'
    if url.startswith(prefix):
        return url
    # A root-relative href such as '/web/<timestamp>/...' would otherwise
    # produce 'https://web.archive.org//web/...'.
    return prefix + url.lstrip('/')

def scrape_base(url, timeout=30):
    """Collect the hrefs of the exercise-list links on the page at ``url``.

    An ``<a>`` tag is kept when its serialized HTML contains ``ExList/``
    followed by letters, optional ``#`` characters and more letters.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: The ``href`` value of every matching anchor, in document
        order.
    """
    pattern = re.compile(r'.*ExList\/[A-Za-z]+\#*[A-Za-z]+')

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # The pattern is tested against the whole tag, so an anchor without
        # an href can still match; skip it instead of returning None.
        if href is not None and pattern.search(str(anchor)):
            links.append(href)

    return links

def mount_session():
    """Create a ``requests`` session with connection retries, plus request headers.

    The same adapter, configured with ``Retry(connect=3, backoff_factor=0.5)``,
    is mounted for both ``http://`` and ``https://``.

    Returns:
        tuple: ``(session, headers)``. The headers are returned separately and
        are not installed on the session, so callers pass them on each
        request.
    """
    headers = {
        # Adjacent literals instead of a backslash continuation inside the
        # string, which embedded the next line's indentation in the header.
        'User-Agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Ubuntu Chromium/80.0.3987.163 '
            'Chrome/80.0.3987.163 Safari/537.36'
        ),
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,pt;q=0.7'
    }

    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    return session, headers

def scrape_instance(url, timeout=30):
    """Collect the hrefs of the exercise-page links on the page at ``url``.

    An ``<a>`` tag is kept when its serialized HTML contains
    ``WeightExercises/<letters>/<letters>``. The request is made with
    certificate verification disabled (``verify=False``).

    Args:
        url: Address of the list page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: The ``href`` value of every matching anchor. The list is
        empty when the page could not be fetched; in that case a
        ``RuntimeWarning`` names the URL and the function pauses for three
        seconds before returning.
    """
    pattern = re.compile(r'.*WeightExercises\/[A-Za-z]+\/[A-Za-z]+')
    session, headers = mount_session()

    links = []
    # Close the session on exit so its pooled connections are released.
    with session:
        try:
            page = session.get(url, headers=headers, verify=False, timeout=timeout)
            soup = BeautifulSoup(page.content, 'html.parser')

            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                # The pattern is tested against the whole tag, so an anchor
                # without an href can still match; skip it.
                if href is not None and pattern.search(str(anchor)):
                    links.append(href)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as exc:
            # Best effort: report the skipped page instead of hiding it, then
            # pause before the caller moves on to the next URL.
            warnings.warn("Skipping {}: {}".format(url, exc), RuntimeWarning)
            time.sleep(3)

    return links

In [3]:
# Archived snapshot of the exrx.net exercise directory page.
base_url = 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/Directory'

# Raw hrefs of the list pages, followed by the same hrefs with the archive
# prefix applied.
links = scrape_base(base_url)
standardized_links = list(map(fix_base_prefix, links))

In [10]:
total_indexes = []
if os.path.exists('total_indexes.pkl'):
    # NOTE: unpickling can execute arbitrary code; only reuse a cache file
    # that this notebook wrote itself.
    with open('total_indexes.pkl', 'rb') as f:
        total_indexes = pkl.load(f)
else:
    # tqdm wraps the list being iterated, so the bar total always matches it.
    for link in tqdm(standardized_links):
        total_indexes.extend(scrape_instance(link))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # cached list comes out the same on every run (set ordering does not).
    total_indexes = list(dict.fromkeys(total_indexes))
    # Context manager so the cache file is flushed and closed.
    with open('total_indexes.pkl', 'wb') as f:
        pkl.dump(total_indexes, f)

In [12]:
# Make every scraped exercise href absolute on web.archive.org.
standardized_indexes = list(map(fix_index_prefix, total_indexes))

In [13]:
pattern = r'https://exrx\.net/(.+)'
# Rebuild the live-site URL from each archived URL. Links that do not contain
# the exrx.net origin are dropped instead of raising AttributeError on a
# failed match.
index_matches = (re.search(pattern, link) for link in standardized_indexes)
links = ['https://exrx.net/' + match.group(1) for match in index_matches if match is not None]

In [141]:
# Empty frame listing the columns of the scraped dataset. A later cell
# rebinds `df` to the frame built from the scraped records.
df = pd.DataFrame(
    columns=[
        "exercise",
        "utility",
        "mechanics",
        "force",
        "target_muscles",
        "synergist_muscles",
        "stabilizer_muscles",
    ],
)

def remove_and_replace(muscles):
    """Strip from each entry the text of the entry that follows it.

    Every element except the last has all occurrences of its successor's
    original text removed; the last element is kept as-is. Lists with zero or
    one element are returned unchanged (the same object).

    Args:
        muscles: List of strings.

    Returns:
        list[str]: A list of the same length with the embedded successor
        text removed.
    """
    if len(muscles) < 2:
        return muscles

    # Pair each entry with its successor; the replacement always uses the
    # successor's original (unstripped) text.
    stripped = [
        current.replace(following, "")
        for current, following in zip(muscles, muscles[1:])
    ]
    stripped.append(muscles[-1])
    return stripped

def extract_muscles(soup, section_name):
    """Return the muscle names listed under the ``section_name`` heading.

    Looks up the ``<strong>`` tag whose string equals ``section_name``, steps
    two elements forward in document order and collects the text of every
    ``<li>`` found there. The collected texts are then passed through
    ``remove_and_replace``.

    Args:
        soup: Parsed page (BeautifulSoup object).
        section_name: Heading text to search for, e.g. ``"Target"``.

    Returns:
        list[str]: The list-item texts. When the heading or the list cannot
        be reached, a one-element list holding the text of whichever element
        was reached, or ``''`` if there was none.
    """
    # `string=` is the current name of the filter formerly spelled `text=`.
    muscle_ul = soup.find("strong", string=section_name)
    muscles = []

    try:
        muscle_ul = muscle_ul.next_element.next_element
        for item in muscle_ul.find_all("li"):
            muscles.append(item.text)
    except (AttributeError, TypeError):
        # Heading missing, or the element reached is not a tag with list items.
        muscles.append(muscle_ul.text if muscle_ul else '')
    return remove_and_replace(muscles)

def extract_exercise(soup):
    """Return the text of the page's first ``<h1>``, or ``""`` if it has none."""
    try:
        return soup.find("h1").text
    except AttributeError:
        # soup.find returned None (no <h1>), so there is no .text to read.
        return ""

def extract_classifications(soup):
    """Return the first six cell texts of the page's first ``<table>``.

    Cell texts are taken row by row with every ``:`` removed, then passed
    through ``remove_and_replace``.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        list[str]: Always exactly six strings. Missing positions (no table,
        or a table with fewer than six cells) are filled with ``""``.
    """
    try:
        table = soup.find("table")
        classifications = [
            cell.text.replace(":", "")
            for row in table.find_all("tr")
            for cell in row.find_all("td")
        ]
        classifications = remove_and_replace(classifications)
    except AttributeError:
        # No table on the page (soup.find returned None).
        classifications = []

    # Pad before truncating so callers can always index positions 0-5, even
    # when the table holds fewer than six cells.
    return (classifications + [""] * 6)[:6]

def scrape_info(link, timeout=30):
    """Download one exercise page and extract its record.

    The request is made with certificate verification disabled
    (``verify=False``).

    Args:
        link: URL of the exercise page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict | None: A record with the keys ``exercise``, ``utility``,
        ``mechanics``, ``force``, ``target_muscles``, ``synergist_muscles``
        and ``stabilizer_muscles``; ``None`` when the page has no ``<h1>``
        title or could not be fetched.
    """
    session, headers = mount_session()

    # Close the session on exit so its pooled connections are released.
    with session:
        try:
            page = session.get(link, headers=headers, verify=False, timeout=timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Previously execution fell through and built the record from
            # unbound locals; report "no data" to the caller instead.
            time.sleep(3)
            return None

        soup = BeautifulSoup(page.content, "html.parser")

        title = extract_exercise(soup)
        if title == "":
            return None

        # Guarantee six entries so the fixed positions below always exist.
        classifications = (extract_classifications(soup) + [""] * 6)[:6]

        return {
            "exercise": title,
            # Values sit at the odd positions of the classification cells.
            "utility": classifications[1],
            "mechanics": classifications[3],
            "force": classifications[5],
            "target_muscles": extract_muscles(soup, "Target"),
            "synergist_muscles": extract_muscles(soup, "Synergists"),
            "stabilizer_muscles": extract_muscles(soup, "Stabilizers"),
        }

In [142]:
data_list = []
premium_list = []
# tqdm wraps the list that is actually iterated, so the bar total cannot
# drift from the number of links processed.
for link in tqdm(links):
    data = scrape_info(link)
    # Exercise is either behind a pay wall or does not exist
    if data is None or data['exercise'] == 'Premium Content':
        premium_list.append(link)
    else:
        data_list.append(data)

100%|██████████| 1406/1406 [47:36<00:00,  2.03s/it] 


In [144]:
# Context manager so the pickle file is flushed and closed.
with open("premium_list.pkl", "wb") as f:
    pkl.dump(premium_list, f)

# One row per scraped exercise, written without the index column.
df = pd.DataFrame(data_list)
df.to_csv("exrx.csv", index=False)
df.head(n=10)

Unnamed: 0,exercise,utility,mechanics,force,target_muscles,synergist_muscles,stabilizer_muscles
0,Weighted Back Raise (on hyperextension apparatus),Auxiliary,Isolated,Pull,[Hamstrings],"[Gluteus Maximus, Adductor Magnus]","[Erector Spinae, Latissimus Dorsi, Teres Major..."
1,Barbell Side Split Squat,Auxiliary,Compound,Push,[Quadriceps],"[Gluteus Maximus, Adductor Magnus (lead leg), ...","[Erector Spinae, Gluteus Medius, Gluteus Minimus]"
2,Machine-assisted Chest Dip,Basic,Compound,Push,"[Pectoralis Major, Sternal]","[Deltoid, Anterior, Triceps Brachii, Pectorali...","[Trapezius, Lower]"
3,Weighted Hanging Leg Raise,Auxiliary,Isolated,Pull,[Iliopsoas],"[Tensor Fasciae Latae, Pectineus, Sartorius, A...","[Rectus Abdominis, Obliques]"
4,Suspended Inverted Row,Basic or Auxiliary,Compound,Pull,"[Back, General]","[Trapezius, Middle, Trapezius, Lower, Rhomboid...","[Erector Spinae, Gluteus Maximus, Hamstrings]"
5,Cable Incline Shoulder Raise,Auxiliary,Isolated,Push,[Serratus Anterior],"[Pectoralis Major, Clavicular]","[Deltoid, Anterior, Triceps Brachii]"
6,Cable One Arm Bent-over Row,Auxiliary,Compound,Pull,"[Back, General]","[Trapezius, Middle, Trapezius, Lower, Rhomboid...","[Obliques, Hamstrings, Gluteus Maximus, Tricep..."
7,Bench Dip (heels on floor),Basic,Compound,Push,[Triceps Brachii],"[Deltoid, Anterior, Pectoralis Major, Sternal,...","[Trapezius, Lower]"
8,Dumbbell Incline Fly,Auxiliary,Isolated,Push,"[Pectoralis Major, Clavicular]","[Pectoralis Major, Sternal, Deltoid, Anterior,...","[Biceps Brachii, Brachialis, Triceps Brachii, ..."
9,Hanging Leg Raise (with ab straps),Auxiliary,Isolated,Pull,[Iliopsoas],"[Tensor Fasciae Latae, Pectineus, Sartorius, A...","[Rectus Abdominis, Obliques]"
