In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
def update_page(page_num, timeout=5):
    """Download one page of the medex generics listing and parse it.

    Args:
        page_num: Page number interpolated into the ``page`` query parameter.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    url = f"https://medex.com.bd/generics?page={page_num}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Name a concrete parser. The generic "html" feature lets bs4 pick whichever
    # installed parser it ranks best, so the parse tree could differ between
    # machines; "html.parser" ships with Python and is always available.
    return BeautifulSoup(response.text, "html.parser")

In [3]:
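# Scratch prototype of the per-entry parsing used in the scraping loop further down; kept for reference only.
# paragraphs = soup.find_all('a', class_='hoverable-block darker')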
# for p in paragraphs:
#     # print(p.text)
#     link = p.get("href")
#     # text = p.text.strip()
#     title_div = p.find("div", class_="col-xs-12 data-row-top dcind-title")
#     title = title_div.text.strip() if title_div else "No title"
    
#     avail_brand_div = p.find("div", class_="col-xs-12 dcind")
#     avail_brand = avail_brand_div.text.replace(" available brands", "").strip() if avail_brand_div else "N/A"

In [4]:
# Accumulates one dict per listing entry as the listing scrape runs.
data = []
# Listing page number the scrape starts from.
page = 1

In [23]:
# Walk the listing one page at a time; the first page without entries ends the scrape.
while True:
    soup = update_page(page)
    paragraphs = soup.find_all('a', class_='hoverable-block darker')

    if not paragraphs:
        break

    for index, p in enumerate(paragraphs):
        # "Index" is the entry's position within its own page (restarts at 0 on each page).
        record = {"Index": index}

        title_div = p.find("div", class_="col-xs-12 data-row-top dcind-title")
        if title_div:
            record["Title"] = title_div.text.strip()
        else:
            record["Title"] = "No title"

        avail_brand_div = p.find("div", class_="col-xs-12 dcind")
        if avail_brand_div:
            # Keep only the count from text such as "<n> available brands".
            record["Available Brands"] = avail_brand_div.text.replace(" available brands", "").strip()
        else:
            record["Available Brands"] = "N/A"

        record["Link"] = p.get("href")
        data.append(record)

    page += 1

In [24]:
import pandas as pd
# One row per scraped listing entry; the columns come from the record keys.
medicine_types = pd.DataFrame(data=data)

In [26]:
# Reload the listing saved by an earlier run, if there is one. On a fresh
# checkout the CSV has not been written yet, so keep whatever `medicine_types`
# is already in memory instead of stopping with FileNotFoundError.
TITLES_CSV = 'data/generics/all_title.csv'
if os.path.exists(TITLES_CSV):
    medicine_types = pd.read_csv(TITLES_CSV)
else:
    print(f"{TITLES_CSV} not found; keeping the in-memory medicine_types")

In [27]:
# Turn the brand counts into numbers; values that cannot be parsed become NaN
# because of errors='coerce'.
brand_counts = pd.to_numeric(medicine_types["Available Brands"], errors='coerce')
medicine_types["Available Brands"] = brand_counts

In [9]:
import os

# Create the output folder on first use (no error if it is already there),
# then write the listing without the DataFrame index column.
output_dir = "data/generics"
os.makedirs(output_dir, exist_ok=True)

medicine_types.to_csv("data/generics/all_title.csv", index=False)

All the generic titles have been scraped. Now we only need to fetch the medicines (brands) listed under each generic.

In [28]:
def generic_brands_page(url, timeout=5):
    """Download the page at ``url`` and parse it.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Name a concrete parser. The generic "html" feature lets bs4 pick whichever
    # installed parser it ranks best, so the parse tree could differ between
    # machines; "html.parser" ships with Python and is always available.
    return BeautifulSoup(response.text, "html.parser")

In [29]:
# Collects one detail record per generic page as the detail scrape appends to it.
nested_data = list()

In [42]:
def fetch_from_main(avail_brands, brand_list):
    """Append one brand record per usable container in ``avail_brands``.

    Each container is searched for its first ``a.hoverable-block.brand-item``
    anchor and, inside that, a ``li.data-row``. Containers missing either one
    are skipped. Any individual field whose ``div`` is absent is recorded as
    ``"N/A"``.

    Args:
        avail_brands: Iterable of parsed elements to inspect.
        brand_list: List that receives the records; it is mutated in place.

    Returns:
        None.
    """
    # Output key -> class of the <div> inside the row that carries its text.
    field_classes = (
        ("Name", "data-row-top"),
        ("Strength", "data-row-strength"),
        ("Company", "data-row-company"),
        ("Price", "packages-wrapper"),
    )

    for container in avail_brands:
        anchor = container.find("a", class_="hoverable-block brand-item")
        row = anchor.find("li", class_="data-row") if anchor else None
        if not row:
            continue

        record = {}
        for key, css_class in field_classes:
            cell = row.find("div", class_=css_class)
            record[key] = cell.text.strip() if cell else "N/A"
        record["Link"] = anchor.get("href")

        brand_list.append(record)

In [43]:
def fetch_from_view_more(brand_page_link, brand_list):
    """Append the brands listed on a generic's "brand-names" page to ``brand_list``.

    The page at ``{brand_page_link}/brand-names`` is fetched with
    ``generic_brands_page`` and its ``table.gg-table.bindex-table`` is read row
    by row (``tr.brand-row``). Nothing is appended when the table is missing.
    A row with a missing cell gets ``"N/A"`` for that field instead of raising,
    matching the placeholder used by ``fetch_from_main``.

    Args:
        brand_page_link: URL of the generic's page (without the
            ``/brand-names`` suffix).
        brand_list: List that receives the records; it is mutated in place.

    Returns:
        None.
    """
    view_more_url = f"{brand_page_link}/brand-names"
    print("View More URL:", view_more_url)

    # ---- load the "view more" page and locate the brands table ----
    soup_in_tables = generic_brands_page(view_more_url)
    avail_brands_in_tables = soup_in_tables.find('table', class_="table gg-table bindex-table")
    if not avail_brands_in_tables:
        return

    def cell_text(cell):
        # Missing cell -> same placeholder as the main-page scraper.
        return cell.get_text(strip=True) if cell is not None else "N/A"

    # Not every parser/markup combination yields a <tbody>; search the table
    # itself in that case rather than failing on None.
    rows_root = avail_brands_in_tables.find('tbody')
    if rows_root is None:
        rows_root = avail_brands_in_tables

    for row in rows_root.find_all('tr', class_='brand-row'):
        cells = row.find_all('td')

        brand_list.append({
            "Name": cell_text(row.find('td', {'data-col': 'name'})),
            # Strength and company are addressed by position: third and fourth <td>.
            "Strength": cell_text(cells[2]) if len(cells) > 2 else "N/A",
            "Company": cell_text(cells[3]) if len(cells) > 3 else "N/A",
            "Price": cell_text(row.find('td', {'data-col': 'price'})).replace('\n', ' '),
            "Link": row.get('data-href'),
        })
    # ---- fetched the data from the "view more" page ----

In [44]:
# Expected number of generic pages to scrape; handed to tqdm as the total so
# the bar can show a percentage and an ETA.
# NOTE(review): presumably the row count of the listing -- confirm against len(medicine_types).
total_pages = 1622
pbar = tqdm(total=total_pages, desc="Scraping pages")

Scraping pages: 0it [00:00, ?it/s]

In [45]:
# Generics with at most this many brands are read from the generic page itself;
# anything above goes through the separate "brand-names" page.
# NOTE(review): 44 is carried over unchanged -- confirm it matches the site's inline limit.
MAIN_PAGE_BRAND_LIMIT = 44

# Output column -> id of the <div> that heads the corresponding page section.
SECTION_IDS = {
    "Description": "description",
    "Indications": "indications",
    "Composition": "composition",
    "Pharmacology": "mode_of_action",
    "Dosage & Administrations": "dosage",
    "Interaction": "interaction",
    "Contraindications": "contraindications",
    "Side Effects": "side_effects",
    "Pregnancy & Lactation": "pregnancy_cat",
    "Use in Special Populations": "pediatric_uses",
    "Precautions & Warnings": "precautions",
    "Overdose Effects": "overdose_effects",
    "Therapeutic Class": "drug_classes",
    "Reconstitution": "reconstitution",
    "Storage Conditions": "storage_conditions",
}


def get_section_text(soup, section_id):
    """Return the text of the ``div.ac-body`` that follows ``div#<section_id>``.

    Args:
        soup: Parsed generic page.
        section_id: ``id`` attribute of the section's heading ``div``.

    Returns:
        The stripped body text, or ``"N/A"`` when either the heading or a
        following ``div.ac-body`` cannot be found.
    """
    section = soup.find("div", id=section_id)
    if not section:
        return "N/A"
    body = section.find_next("div", class_="ac-body")
    return body.get_text(strip=True) if body else "N/A"


try:
    for brand_page_link, parent_title, available_brands in zip(
        medicine_types["Link"],
        medicine_types["Title"],
        medicine_types["Available Brands"],
    ):
        # Rows without a link cannot be fetched.
        if pd.isna(brand_page_link):
            continue

        soup = generic_brands_page(brand_page_link)

        brand_list = []
        if available_brands <= MAIN_PAGE_BRAND_LIMIT:
            fetch_from_main(soup.select('div.available-brands'), brand_list)
        else:
            # Also taken when the count is NaN, since NaN <= limit is False.
            fetch_from_view_more(brand_page_link, brand_list)

        record = {
            "Title": parent_title,
            "Available Brands": available_brands,
            "Link": brand_page_link,
            "Brands": brand_list,
        }
        for column, section_id in SECTION_IDS.items():
            record[column] = get_section_text(soup, section_id)

        nested_data.append(record)
        pbar.update(1)
finally:
    # Close the bar even when the scrape is interrupted or a request fails.
    pbar.close()

Scraping pages: 9it [00:08,  1.43it/s]

View More URL: https://medex.com.bd/generics/3/aceclofenac/brand-names


Scraping pages: 31it [00:24,  1.47it/s]

KeyboardInterrupt: 

In [None]:
# Persist the per-generic records. The "Brands" column holds a list of dicts
# per row, which ends up in the CSV as that list's string representation.
nested_df = pd.DataFrame(data=nested_data)
nested_df.to_csv("data/generics/detailed_generic_data.csv", index=False)

In [None]:
# Preview only the first few records; displaying the whole list would flood the cell output.
nested_data[:3]