### Thai FDA Registry: Scraping translated data

### https://porta.fda.moph.go.th/FDA_SEARCH_ALL/MAIN/SEARCH_CENTER_MAIN.aspx

In [1]:
#!pip install plyer
#from plyer import notification
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import geckodriver_autoinstaller
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
from datetime import datetime as date
import pandas as pd, numpy as np
import warnings as ws
ws.filterwarnings("ignore")
import re, os, time, glob
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException
from selenium.webdriver.common.keys import Keys
import pickle
from typing import List
# Create the output directory. exist_ok=True replaces the separate existence
# check, so the call is safe to re-run and has no check-then-create race.
directory = "Fetched Data"
os.makedirs(directory, exist_ok=True)
# Initialize Firefox options.
options = Options()
# Pin the Firefox binary only when the default Windows install path is present;
# on any other machine leave binary_location unset so Selenium locates Firefox itself.
firefox_binary = r'C:\Program Files\Mozilla Firefox\firefox.exe'
if os.path.exists(firefox_binary):
    options.binary_location = firefox_binary
# Install a geckodriver build for the local Firefox (as reported in the cell output).
geckodriver_autoinstaller.install()


def clean_str(string):
    """Normalise whitespace in a piece of scraped text.

    Non-breaking spaces (``\\xa0``) are turned into ordinary spaces, every
    newline and tab is removed outright (not replaced by a space), and
    leading/trailing whitespace is trimmed.

    Parameters
    ----------
    string : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Translation table that deletes newline and tab characters.
    drop_breaks = str.maketrans('', '', '\n\t')
    trimmed = string.replace('\xa0', " ").strip()
    return trimmed.translate(drop_breaks).strip()

Firefox version:  122.0
Available geckodriver version:  v0.34.0


In [2]:
## Use the IVD Gateway major-player list as the source of company-name search keywords
major_players = pd.read_csv("IVD_gateway_major_players_actual.csv")
# Drop missing names so the string handling below never sees a NaN.
players = major_players.player_name.dropna().astype(str).tolist()
## Split company names delimited by '-' into separate keywords.
## Every fragment is stripped (previously only fragments of hyphenated names
## were), and empty fragments such as the tail of "Name-" are skipped.
keywords = [
    fragment.strip()
    for name in players
    for fragment in name.split('-')
    if fragment.strip()
]

In [3]:
# Remove repeated keywords, keeping the first occurrence of each in its
# original position (dict keys preserve insertion order).
keywords_indexed = list(dict.fromkeys(keywords))
# Set of every distinct keyword, kept under the same name as before.
seen = set(keywords_indexed)
len(keywords_indexed)

345

In [4]:
# Listing page of a page-translation Firefox add-on. It is opened right after
# the browser starts — presumably so the add-on can be installed by hand in
# this session before scraping (TODO confirm).
TRANSLATE_ADDON_URL = 'https://addons.mozilla.org/en-US/firefox/addon/traduzir-paginas-web/?utm_source=addons.mozilla.org&utm_medium=referral&utm_content=search'

# Browser session shared by the cells that call load_search / extract_page_data.
driver = webdriver.Firefox(options=options)
driver.get(TRANSLATE_ADDON_URL)

In [7]:
# Results store: the search loop assigns catch[keyword] = list of collected tables.
# Re-running this cell resets it to empty.
catch = {}

In [None]:
# NOTE: load_search and extract_page_data are defined in a cell further down
# this notebook; that cell must have been run before this one.
print("Searching FDA TH using keywords...")

# Index into keywords_indexed from which this run starts.
RESUME_FROM = 201
# Rows shown per result page; load_search picks the page-size option it calls
# "fifty" (assumed to mean 50 rows per page — confirm against the site).
PAGE_SIZE = 50

for keyword in keywords_indexed[RESUME_FROM:]:
    # Skip keywords that already have stored results and keywords of three
    # characters or fewer. Skipping *before* touching `catch` matters: the
    # previous version overwrote existing entries with an empty list.
    if keyword in catch or len(keyword) <= 3:
        continue

    print(keyword)
    collector = []
    try:
        driver, total_results = load_search(keyword, driver)
        if total_results == 'No records':
            print(f"No results for {keyword}.")
        else:
            # First (or only) page of results.
            _, table = extract_page_data(driver)
            collector.append(table)

            if total_results != 'One Page':
                # total_results is the text of the grid's info element; its
                # first whitespace-separated token is treated as the total
                # number of rows (same assumption the original code made).
                total_items = int(total_results.strip().split()[0])
                # Ceiling division without importing math.
                total_pages = -(-total_items // PAGE_SIZE)
                wait = WebDriverWait(driver, 10)
                # Page 1 is already collected, so "next" is clicked
                # total_pages - 1 times; clicking more would re-scrape the
                # last page.
                for _ in range(total_pages - 1):
                    next_btn = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'rgPageNext')))
                    next_btn.click()
                    # extract_page_data returns (flag, table); keep only the
                    # table so every collector entry has the same type.
                    _, table = extract_page_data(driver)
                    collector.append(table)
                    time.sleep(1)
    except Exception:
        print(f"Error for {keyword}")
        # Bare raise keeps the original traceback; the loop stops here.
        raise

    catch[keyword] = collector
    
    

Searching FDA TH using keywords...
Robonik
No results for Robonik.
Sacace
No results for Sacace.
Sakura


In [5]:
def load_search(keyword, driver):
    """Run a keyword search on the Thai FDA search page and report the result size.

    The function loads the search page, tries to close the start-up modal,
    clicks the ``ContentPlaceHolder1_R_LIST`` input and a row of the category
    table, types ``keyword`` into the search box and clicks the search button.
    When the result grid reports more than one page, the page-size combo box
    is switched to its third option (assumed to be 50 rows per page — confirm
    against the site) and the grid is inspected again.

    Parameters
    ----------
    keyword : str
        Text typed into the search box.
    driver : selenium.webdriver.remote.webdriver.WebDriver
        Active browser session; it is navigated by this call.

    Returns
    -------
    tuple
        ``(driver, status)`` where ``status`` is ``'No records'``,
        ``'One Page'``, or the text of the grid's ``rgInfoPart`` element.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException, TimeoutException
        If an expected page element cannot be found or does not become
        clickable in time.
    """
    driver.get('https://porta.fda.moph.go.th/FDA_SEARCH_ALL/MAIN/SEARCH_CENTER_MAIN.aspx')
    driver.maximize_window()
    time.sleep(2)

    # Best effort: the modal may not be shown, so a failure here is only logged.
    try:
        pop_close = driver.find_element(By.XPATH, '//*[@id="myModal2"]/div/div/div[3]/button')
        pop_close.click()
    except Exception as e:
        print(e, keyword)
    time.sleep(4)

    driver.find_element(By.XPATH, "//div[@class = 'col-lg-2 col-md-2']/table/tbody/tr/td/input[@id = 'ContentPlaceHolder1_R_LIST']").click()
    time.sleep(4)

    # NOTE(review): an XPath that starts with "//" is evaluated from the
    # document root even when called on an element, so this lookup is NOT
    # limited to the checkbox list. Left unchanged because switching to
    # ".//tbody/tr[7]" could select a different row — confirm the intended row first.
    driver.find_element(By.ID, 'ContentPlaceHolder1_CheckBoxList1').find_element(By.XPATH, "//tbody/tr[7]").click()
    time.sleep(4)

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)

    wait = WebDriverWait(driver, 10)
    search_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ContentPlaceHolder1_btn_search"]')))
    input_box = driver.find_element(By.XPATH, '//*[@id="ContentPlaceHolder1_txt_search"]')
    input_box.send_keys(keyword)
    # Click through JavaScript rather than WebElement.click().
    driver.execute_script("arguments[0].click();", search_btn)
    time.sleep(2)

    flag, _ = extract_page_data(driver)
    if flag == 'No results':
        return driver, 'No records'
    if flag == 'One Page':
        return driver, 'One Page'

    # More than one page: open the page-size combo box and pick its third entry.
    totals = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ContentPlaceHolder1_RadGrid1_ctl00_ctl03_ctl01_PageSizeComboBox_Input"]')))
    totals.click()
    fifty = driver.find_element(By.XPATH, '//*[@id="ContentPlaceHolder1_RadGrid1_ctl00_ctl03_ctl01_PageSizeComboBox_DropDown"]/div/ul/li[3]')
    fifty.click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # With the larger page size everything may now fit on a single page.
    flag, _ = extract_page_data(driver)
    if flag == 'One Page':
        return driver, 'One Page'

    total_res_count = driver.find_element(By.CLASS_NAME, 'rgInfoPart').text
    return driver, total_res_count
    
def extract_page_data(driver):
    """Read the result grid (``rgMasterTable``) from the page the driver is showing.

    Parameters
    ----------
    driver : selenium.webdriver.remote.webdriver.WebDriver
        Browser session positioned on a search-result page.

    Returns
    -------
    tuple
        ``(flag, table)``:

        * ``('No results', <bs4 table tag>)`` when the grid text contains
          ``'No records to display.'``;
        * ``('Found', DataFrame)`` when an ``rgInfoPart`` element is present;
        * ``('One Page', DataFrame)`` when no ``rgInfoPart`` element exists.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the grid does not appear within 30 seconds.
    """
    # Local import: the notebook's import cell does not bring in io.
    from io import StringIO

    time.sleep(1)
    wait = WebDriverWait(driver, 30)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the grid BEFORE snapshotting the page source; taking the
    # snapshot first could capture a page that does not contain the table yet.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'rgMasterTable')))
    # Explicit parser, matching the 'html.parser' used elsewhere in this notebook.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'class': 'rgMasterTable'})
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    if 'No records to display.' in table.text:
        return 'No results', table

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    table = pd.read_html(StringIO(str(table)))[0]
    try:
        # Only the presence of the element matters here; its content is not used.
        driver.find_element(By.CLASS_NAME, 'rgInfoPart')
    except NoSuchElementException:
        return 'One Page', table
    return 'Found', table
        
        
        

In [33]:
# Ad-hoc check: run extract_page_data on whatever page the driver currently shows.
extract_page_data(driver)

('One Page',
       ประเภทผลิตภัณฑ์        CAT_NO ใบสำคัญ/ใบอนุญาต  \
 0     เครื่องมือแพทย์  40230;005702       CHE5602513   
 1     เครื่องมือแพทย์  40240;005712       CHE5602513   
 2     เครื่องมือแพทย์  45560;005400       CHE5602513   
 3     เครื่องมือแพทย์  45840;009948       CHE5602513   
 4     เครื่องมือแพทย์  45950;009930       CHE5602513   
 5     เครื่องมือแพทย์  60300;110082       CHE5602513   
 6     เครื่องมือแพทย์  60400;110074       CHE5602513   
 7   medical equipment  61470;110075       CHE5602513   
 8   medical equipment  61480;110076       CHE5602513   
 9   medical equipment  62380;110072       CHE5602513   
 10  medical equipment  63390;110073       CHE5602513   
 11  medical equipment  64370;110071       CHE5602513   
 12  medical equipment  65570;110077       CHE5602513   
 13  medical equipment        005702       CHE6101357   
 14  medical equipment        005712       CHE6101357   
 15  medical equipment        005400       CHE6101357   
 16  medical equip

In [87]:
# Close the browser and end the WebDriver session.
driver.quit()