In [255]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import regex as re
from tqdm import tqdm
import pandas as pd
import time 
import pickle as pkl
from urllib3.exceptions import InsecureRequestWarning
import warnings

warnings.simplefilter(action='ignore', category=(FutureWarning, DeprecationWarning, InsecureRequestWarning))

In [206]:
def fix_base_prefix(url):
    prefix = 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/'
    if url.startswith(prefix):
        return url
    return prefix + url

def fix_index_prefix(url):
    prefix = 'https://web.archive.org/'
    if url.startswith(prefix):
        return url
    return prefix + url

def scrape_base(url):
    def filter(to_filter):
        pattern = re.compile(r'.*ExList\/[A-Za-z]+\#*[A-Za-z]+')

        if re.search(pattern, str(to_filter)):
            return True 
        return False
    
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    links = []
    for link in soup.find_all('a'):
        if filter(link):
            links.append(link.get('href'))

    return links

def mount_session():
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
        (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.163 Chrome/80.0.3987.163 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,pt;q=0.7'
    }

    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    return session, headers

def scrape_instance(url):
    def filter(to_filter):
        pattern = re.compile(r'.*WeightExercises\/[A-Za-z]+\/[A-Za-z]+')

        if re.search(pattern, str(to_filter)):
            return True 
        return False
    
    session, headers = mount_session()

    links = []
    try:
        page = session.get(url, headers=headers, verify=False)
        soup = BeautifulSoup(page.content, 'html.parser')

        for link in soup.find_all('a'):
            if filter(link):
                links.append(link.get('href'))
    except requests.exceptions.ConnectionError:
        time.sleep(5)
    
    return links

In [63]:
base_url = 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/Directory'

links = scrape_base(base_url)
standardized_links = [fix_base_prefix(link) for link in links]

['https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/NeckWt', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/NeckWt#Sternocleidomastoid', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/NeckWt#Splenius', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ShouldWt', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ShouldWt#Anterior', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ShouldWt#Lateral', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ShouldWt#Posterior', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ShouldWt#Supraspinatus', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ArmWt', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ArmWt#Triceps', 'https://web.archive.org/web/20230920212653/https://exrx.net/Lists/ExList/ArmWt#Bic

In [122]:
total_indexes = []
with tqdm(total=len(links)) as pbar:
    for link in standardized_links:
        total_indexes.extend(scrape_instance(link))
        pbar.update(1)
total_indexes = list(set(total_indexes))

100%|██████████| 55/55 [06:08<00:00,  6.71s/it]


In [134]:
standardized_indexes = [fix_index_prefix(link) for link in total_indexes]

In [183]:
pattern = r'https://exrx\.net/(.+)'
links = ['https://exrx.net/' + re.search(pattern, link).group(1) for link in standardized_indexes]

In [251]:
df = pd.DataFrame(columns=["exercise", "utility", "mechanics", "force", "target_muscles", "synergist_muscles", "stabilizer_muscles"])

def extract_muscles(soup, section_name):
    muscle_ul = soup.find("strong", text=section_name)
    muscles = []
    try:
        muscle_ul = muscle_ul.next_element.next_element
        for i in muscle_ul.find_all("li"):
            muscles.append(i.text)
    except (AttributeError, TypeError):
        muscles.append(muscle_ul.text if muscle_ul else '')
    return muscles

def extract_exercise(soup):
    try:
        exercise = soup.find("h1").text
    except AttributeError:
        exercise = ""
    return exercise

def extract_table(soup):
    try:
        table = soup.find_all("table")
    except AttributeError:
        table = ""
    return table

def scrape_info(link):
    session, headers = mount_session()

    try:
        page = session.get(link, headers=headers, verify=False)
        soup = BeautifulSoup(page.content, "html.parser")

        title = extract_exercise(soup)
        if title == "":
            return None

        table = extract_table(soup)

        classifications = []
        for row in table:
            row = row.find_all("tr")
            for td in row:
                td = td.find_all("td")
                for i in td:
                    classifications.append(i.text.replace(":", ""))

        if not classifications:
            classifications = ["", "", "", "", "", ""]

        target_muscles = extract_muscles(soup, "Target")
        synergist_muscles = extract_muscles(soup, "Synergists")
        stabilizer_muscles = extract_muscles(soup, "Stabilizers")
    except requests.exceptions.ConnectionError:
        time.sleep(3)

    data = {
        "exercise": title,
        "utility": classifications[1],
        "mechanics": classifications[3],
        "force": classifications[5],
        "target_muscles": target_muscles,
        "synergist_muscles": synergist_muscles,
        "stabilizer_muscles": stabilizer_muscles
    }

    return data

In [253]:
data_list = []
premium_list = []
with tqdm(total=len(standardized_indexes)) as pbar:
    for link in links:
        data = scrape_info(link)
        # Exercise is either behind a pay wall or does not exist
        if data is None or data['exercise'] == 'Premium Content':
            premium_list.append(link)
        else:
            data_list.append(data)
        pbar.update(1)

100%|██████████| 1493/1493 [53:38<00:00,  2.16s/it] 


In [257]:
df = pd.DataFrame(data_list)
df.to_csv("new_exrx.csv", index=False)
df.head(n=10)
pkl.dump(premium_list, open("premium_list.pkl", "wb"))

Unnamed: 0,exercise,utility,mechanics,force,target_muscles,synergist_muscles,stabilizer_muscles
0,Lever Single Leg Split V-Squat,AuxiliaryMechanicsCompoundForcePush,CompoundForcePush,Push,[Gluteus Maximus],"[QuadricepsAdductor MagnusSoleus, Adductor Mag...",[Gluteus MediusGluteus MinimusQuadratus Lumbor...
1,Lever Squat (plate loaded),BasicMechanicsCompoundForcePush,CompoundForcePush,Push,[Quadriceps],"[Gluteus MaximusAdductor MagnusSoleus, Adducto...","[Erector SpinaeTrapezius, UpperTrapezius, Midd..."
2,Side Bridge Hip Abduction,AuxiliaryMechanicsIsolatedForcePush,IsolatedForcePush,Push,[Tensor Fasciae LataeGluteus MediusGluteus Min...,[ObliquesQuadratus lumborumPsoas majorIliocast...,[Latissimus dorsiPectoralis minorLevator Scapu...
3,Sled One Hand Grip,Basic or AuxiliaryMechanicsIsolatedForcePull,IsolatedForcePull,Pull,[Wrist Flexors],[None],[No significant stabilizers]
4,Barbell Decline Bench Press,Basic or AuxiliaryMechanicsCompoundForcePush,CompoundForcePush,Push,"[Pectoralis Major, Sternal]","[Pectoralis Major, ClavicularDeltoid, Anterior...",[]
5,Dumbbell Lateral Step-up,AuxiliaryMechanicsCompoundForcePush,CompoundForcePush,Push,[Quadriceps],[Gluteus MaximusAdductor MagnusSoleusGastrocne...,[Erector SpinaeGluteus MediusGluteus MinimusQu...
6,Weighted Twisting Crunch,AuxiliaryMechanicsIsolatedForcePull,IsolatedForcePull,Pull,[Obliques],"[Rectus AbdominisPsoas major, Psoas major]",[No significant stabilizers]
7,Lever Lying Rear Lateral Raise,AuxiliaryMechanicsIsolatedForcePull,IsolatedForcePull,Pull,"[Deltoid, Posterior]","[InfraspinatusTeres MinorTrapezius, MiddleTrap...",[No significant stabilizers]
8,Cable Bent-over Triceps Extension,Basic or AuxiliaryMechanicsIsolatedForcePush,IsolatedForcePush,Push,[Triceps Brachii],[None],"[Deltoid, AnteriorPectoralis Major, Clavicular..."
9,Cable Rear Lunge,AuxiliaryMechanicsCompoundForcePush,CompoundForcePush,Push,[Quadriceps],"[Gluteus MaximusAdductor MagnusSoleus, Adducto...","[Erector SpinaeTrapezius, UpperTrapezius, Lowe..."
