# Selenium

In [1]:
import time
import pandas as pd
from tqdm import tqdm
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.common.by import By
from difflib import SequenceMatcher
import os

path_to_chromedriver = '/Users/zqh/Documents/RIETI_new/RIETI_AI_Survey/output_data/geckodriver'
out_path = '/Users/zqh/Documents/RIETI_new/RIETI_AI_Survey/output_data/result/'

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def get_clickable_text(elements):
    all_clickable_tag = [i.text for i in elements]
    all_clickable_tag = [i for i in all_clickable_tag if i != '' and not i.isspace()]
    #return list(set(all_clickable_tag + all_clickable_cls))
    return list(set(all_clickable_tag))


class WebPage():
    """
    A model of a webpage
    """
    def __init__(self, url, browser):
        self.url = url
        #self.url_domain = urlparse(url).netloc
        try:
            #self.browser = get_driver()
            browser.get(url)
            time.sleep(4)
            #self.current_url = browser.current_url.copy()
            #self.url_domain = urlparse(self.current_url).netloc
            self.browser_start = 1
            if url == 'http://mltrons.com/':
                self.browser_start = None
        except Exception:
            self.browser_start = None
            self.current_url = None
            
    @property
    def homepage(self):
        return browser.current_url
            
    @property
    def all_clickable(self):
        all_clickable_tag = browser.find_elements(By.TAG_NAME,"a")
        all_clickable_tag = [i.text for i in all_clickable_tag]
        all_clickable_tag = [i for i in all_clickable_tag if i != '' and not i.isspace()]
        return list(set(all_clickable_tag))
    
    
    def URLandContent(self, browser):
        if self.browser_start is None:
            page_urls, page_texts, page_urls_all = [], [], []
            return page_urls, page_texts, page_urls_all
        else:
            page_urls, page_texts, page_urls_all = self._URLandContent(browser)
            return page_urls, page_texts, page_urls_all
    
    def _URLandContent(self, browser):
        if len(self.all_clickable) >= 1:
            page_urls_click, page_texts_click, page_urls_all_click = self.URLandContentByClick(browser)
        else:
            page_urls_click, page_texts_click, page_urls_all_click = [], [], []
            self.homepage_current_url = self.homepage
            self.homepage_content = browser.find_element_by_xpath("/html/body").text
        browser.get(self.url)
        time.sleep(5)
        frames = browser.find_elements(By.TAG_NAME, 'frame')
        frames = [frame.get_attribute('src') for frame in frames]
        frames = [frame for frame in frames if frame.startswith('http')]
        if len(frames) >= 1:
            page_urls_frame, page_texts_frame, page_urls_all_frame = self.URLandContentFrame(browser, frames)
        else:
            page_urls_frame, page_texts_frame, page_urls_all_frame = [], [], []
        browser.get(self.url)
        time.sleep(5)
        frames = browser.find_elements(By.TAG_NAME, 'iframe')
        frames = [frame.get_attribute('src') for frame in frames]
        frames = [frame for frame in frames if frame.startswith('http')]
        if len(frames) >= 1:
            page_urls_iframe, page_texts_iframe, page_urls_all_iframe = self.URLandContentIFrame(browser, frames)
        else:
            page_urls_iframe, page_texts_iframe, page_urls_all_iframe = [], [], []
        page_urls = page_urls_click + page_urls_frame + page_urls_iframe
        page_texts = page_texts_click + page_texts_frame + page_texts_iframe
        page_urls_all = page_urls_all_click + page_urls_all_frame + page_urls_all_iframe
        return page_urls, page_texts, page_urls_all
        
    def URLandContentByClick(self, browser):
        page_texts = []
        page_urls = []
        page_urls_all = []
        all_clickable = self.all_clickable
        self.homepage_current_url = self.homepage
        self.url_domain = urlparse(self.homepage_current_url).netloc
        self.homepage_content = browser.find_element_by_xpath("/html/body").text
        
        for element in all_clickable:
            print(element)
            try:
                element = browser.find_element_by_link_text(element)
                browser.execute_script("arguments[0].target='_self'", element)
                browser.execute_script("arguments[0].click();", element)
                time.sleep(10)
                current_page_url = browser.current_url
                page_urls_all.append(current_page_url)
                if similar(urlparse(current_page_url).netloc, self.url_domain) >= 0.75:
                    page_texts.append(browser.find_element_by_xpath("/html/body").text)
                    page_urls.append(current_page_url)
                #browser.back()
                browser.get(self.homepage_current_url)
                time.sleep(5)
            except Exception:
                pass
        return page_urls, page_texts, page_urls_all
    
    
    def URLandContentFrame(self, browser, frames):
        page_texts = []
        page_urls = []
        page_urls_all = []
        #frames = browser.find_elements(By.TAG_NAME, 'frame')
        #frames = [frame.get_attribute('src') for frame in frames]
        for frame in frames:
            browser.get(frame)
            time.sleep(5)
            all_clickable_frame = get_clickable_text(browser.find_elements(By.TAG_NAME, "a"))
            for elem in all_clickable_frame:
                try:
                    element = browser.find_element_by_link_text(elem)
                    current_page_url = element.get_attribute('href')
                    page_urls_all.append(current_page_url)
                    browser.execute_script("arguments[0].target='_self'", element)
                    browser.execute_script("arguments[0].click();", element)
                    time.sleep(5)
                    if similar(urlparse(current_page_url).netloc, urlparse(frame).netloc) >= 0.75:
                        page_texts.append(browser.find_element_by_xpath("/html/body").text)
                        page_urls.append(current_page_url)
                    browser.get(frame)
                    time.sleep(5)
                except Exception:
                    pass
        return page_urls, page_texts, page_urls_all
    
    def URLandContentIFrame(self, browser, frames):
        page_texts = []
        page_urls = []
        page_urls_all = []
        #frames = browser.find_elements(By.TAG_NAME, 'iframe')
        #frames = [frame.get_attribute('src') for frame in frames]
        #frames = [frame for frame in frames if frame.startswith('http')]
        for frame in frames:
            browser.get(frame)
            time.sleep(5)
            all_clickable_frame = get_clickable_text(browser.find_elements(By.TAG_NAME, "a"))
            for elem in all_clickable_frame:
                try:
                    element = browser.find_element_by_link_text(elem)
                    current_page_url = element.get_attribute('href')
                    page_urls_all.append(current_page_url)
                    browser.execute_script("arguments[0].target='_self'", element)
                    browser.execute_script("arguments[0].click();", element)
                    time.sleep(5)
                    if similar(urlparse(current_page_url).netloc, urlparse(frame).netloc) >= 0.75:
                        page_texts.append(browser.find_element_by_xpath("/html/body").text)
                        page_urls.append(current_page_url)
                    browser.get(frame)
                    time.sleep(5)
                except Exception:
                    pass
        return page_urls, page_texts, page_urls_all
    
#%%
data_redo_lang = pd.read_csv('/Users/zqh/Documents/RIETI_new/RIETI_AI_Survey/output_data/crawl_test.csv')
data_redo_lang['hojin_id'] = data_redo_lang['hojin_id'].astype(int)
data_redo_lang = data_redo_lang.dropna()
hojin_ids = data_redo_lang['hojin_id']
url_webs = data_redo_lang['url_web']

result_data = []

for hojin_id, url_web in tqdm(zip(hojin_ids, url_webs)):
    hojin_id = int(hojin_id)
    try:
        browser = webdriver.Firefox(executable_path = path_to_chromedriver)
        browser.set_page_load_timeout(15)
        browser.set_script_timeout(15)
        UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15'
        options = webdriver.FirefoxOptions()
        options.add_argument('--user-agent=' + UA)
        #options.add_argument("--headless")
        try:
            web = WebPage(url_web, browser)
            time.sleep(4)
            temp = web.all_clickable
            page_urls, page_texts, page_urls_all = web.URLandContent(browser)
            homepage_content = web.homepage_content
            
            content = {
            'hojin_id': hojin_id,
            'homepage_content': homepage_content,
            'page_content': page_texts
        }
        except Exception:
            content = {
            'hojin_id': hojin_id,
            'homepage_content': None,
            'page_content': None
        }

        browser.close()
        result_data.append(content)

    except Exception:
        print("error")

results_df = pd.DataFrame(result_data)
results_df.to_csv('/Users/zqh/Desktop/test.csv', index=False, encoding='utf_8_sig')

0it [00:00, ?it/s]

決算公告
プライバシーポリシー
情報セキュリティポリシー
お知らせ一覧
保険商品、取扱保険会社
お問い合わせ
保険部門関連「各保険会社への事故・災害等の連絡先」
事業所一覧
社是・経営理念
新卒採用情報
第160期「貸借対照表」「損益計算書」を公開しました
キャリア採用情報
Web購買
事業内容紹介
採用情報
「金融商品の販売等に関する法律」に基づく勧誘方針
会社概要
関工商事株式会社
「Web購買」は登録した会社様の専用サイトです。一般のお客様はお問い合せからお気軽にご連絡ください。
個人情報保護方針
障がい者採用情報
トップメッセージ
関電工グループ
沿革


1it [07:12, 432.37s/it]

サイトマップ
恒温槽　流し台
第24回日本言語聴覚学会　機器展示ご来場御礼
フットケア・ハンドケア商品
製品カタログ
第90回 日本消化器内視鏡技師学会　機器展示　ご来場御礼
新製品「点滴モニタリングシステム・モニドロップ」
電動式人工喉頭
移乗システム「ロールボード」
日本麻酔科学会　第70回学術集会　機器展示　ご来場御礼
第9回日本医療安全学会学術総会　機器展示　ご来場御礼
センシンメディカルについて
ステンレス製品 (特注承ります)
商品紹介サイト（外部サイト）
硬性挿管用喉頭鏡「エアトラック」
日本臨床麻酔学会　第42回大会　機器展示　ご来場御礼
関連リンク
個人情報保護方針
製品情報
適格請求書発行事業者登録番号のお知らせ
ホーム


2it [13:14, 390.94s/it]

サイトマップ
プライバシーポリシー
日本語
トップページ
English
卓上内視鏡洗浄消毒器EFEWD
News
2
お問い合わせ
お役⽴ち資料
企業情報
価格改定
ISO 認証・許可証
医工連携・研究について
News一覧へ
福祉機器
披裂軟骨内転術用持針器 Ｋスティッチ
採用情報
鼻孔プロテクター
ショールーム
会社概要
難治性めまいへの弛まぬ挑戦
ファーストスコープ０８型
「夏季休業および出荷のご案内」
3
透明性ガイドライン
医療機器
年末年始休業および出荷のご案内
1
トップメッセージ
ZAOSONiCの鼻科手術への応用
製品情報
会社案内
4
沿革
クリスタルアート


3it [23:43, 474.60s/it]


# Request

In [46]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from tqdm import tqdm
import difflib
import numpy as np

df = pd.read_csv('/Users/zqh/Documents/RIETI_new/RIETI_AI_Survey/output_data/crawl_test.csv')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

def extract_links_from_webpage(soup, base_url, main_domain):
    links = []
    
    for link in soup.find_all('a'):
        href = link.get('href')
        full_url = urljoin(base_url, href)
        
        if is_valid_url(full_url):
            similarity = url_similarity(main_domain, full_url)
            if similarity > 0.7:
                links.append(full_url)
    
    return links

def get_webpage_and_links_text(url):
    if not url or url != url:
        return None
    
    try:
        response = requests.get(url, timeout=15, headers=headers)
        response.raise_for_status()
        
        if response.status_code == 403:
            print(f"HTTP 403 Forbidden for URL: {url}")
            return None
        
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # homepage (1st layer)
        first_layer_content = soup.get_text()
        main_domain = urlparse(url).netloc
        
        # 2nd layers
        second_layer_links = extract_links_from_webpage(soup, url, main_domain)
        second_layer_content = ""
        filtered_links = list(set(np.unique(second_layer_links)) - set(main_domain))
        
        for link in filtered_links:
            try:
                response = requests.get(link, timeout=15, headers=headers)
                response.raise_for_status()
                
                if response.status_code == 403:
                    print(f"HTTP 403 Forbidden for URL: {link}")
                else:
                    second_layer_content += BeautifulSoup(response.text, 'html.parser').get_text()
            except Exception as e:
                print(f"无法获取网页内容：{str(e)}")
        
        return first_layer_content + second_layer_content
    except Exception as e:
        print(f"无法获取网页内容：{str(e)}")
        return None

def url_similarity(url1, url2):
    return difflib.SequenceMatcher(None, url1, url2).ratio()

def is_valid_url(url):
    parsed_url = urlparse(url)
    return bool(parsed_url.scheme) and bool(parsed_url.netloc)

for index, row in tqdm(df.iterrows()):
    df.loc[index, 'content'] = get_webpage_and_links_text(row['url_web'])

1it [00:00,  3.73it/s]


In [47]:
df

Unnamed: 0,hojin_id,url_web,content
0,1210000000.0,http://www.koshinmilk.co.jp/environmental.html,\n\n\n\n\n\n\n\n\n環境保全活動 - コーシン乳業株式会社\n\n\n\n\...


In [8]:
import time
import pandas as pd
from tqdm import tqdm
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.common.by import By
from difflib import SequenceMatcher
import os

path_to_chromedriver = '/Users/zqh/Documents/RIETI_new/RIETI_AI_Survey/output_data/geckodriver'
out_path = '/Users/zqh/Documents/RIETI_new/RIETI_AI_Survey/output_data/result/'

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def get_clickable_text(elements):
    all_clickable_tag = [i.text for i in elements]
    all_clickable_tag = [i for i in all_clickable_tag if i != '' and not i.isspace()]
    #return list(set(all_clickable_tag + all_clickable_cls))
    return list(set(all_clickable_tag))


class WebPage():
    """
    A model of a webpage
    """
    def __init__(self, url, browser):
        self.url = url
        #self.url_domain = urlparse(url).netloc
        try:
            #self.browser = get_driver()
            browser.get(url)
            time.sleep(4)
            #self.current_url = browser.current_url.copy()
            #self.url_domain = urlparse(self.current_url).netloc
            self.browser_start = 1
            if url == 'http://mltrons.com/':
                self.browser_start = None
        except Exception:
            self.browser_start = None
            self.current_url = None
            
    @property
    def homepage(self):
        return browser.current_url
            
    @property
    def all_clickable(self):
        all_clickable_tag = browser.find_elements(By.TAG_NAME,"a")
        all_clickable_tag = [i.text for i in all_clickable_tag]
        all_clickable_tag = [i for i in all_clickable_tag if i != '' and not i.isspace()]
        return list(set(all_clickable_tag))
    
    
    def URLandContent(self, browser):
        if self.browser_start is None:
            page_urls, page_texts, page_urls_all = [], [], []
            return page_urls, page_texts, page_urls_all
        else:
            page_urls, page_texts, page_urls_all = self._URLandContent(browser)
            return page_urls, page_texts, page_urls_all
    
    def _URLandContent(self, browser):
        if len(self.all_clickable) >= 1:
            page_urls_click, page_texts_click, page_urls_all_click = self.URLandContentByClick(browser)
        else:
            page_urls_click, page_texts_click, page_urls_all_click = [], [], []
            self.homepage_current_url = self.homepage
            self.homepage_content = browser.find_element_by_xpath("/html/body").text
        browser.get(self.url)
        time.sleep(5)
        frames = browser.find_elements(By.TAG_NAME, 'frame')
        frames = [frame.get_attribute('src') for frame in frames]
        frames = [frame for frame in frames if frame.startswith('http')]
        if len(frames) >= 1:
            page_urls_frame, page_texts_frame, page_urls_all_frame = self.URLandContentFrame(browser, frames)
        else:
            page_urls_frame, page_texts_frame, page_urls_all_frame = [], [], []
        browser.get(self.url)
        time.sleep(5)
        frames = browser.find_elements(By.TAG_NAME, 'iframe')
        frames = [frame.get_attribute('src') for frame in frames]
        frames = [frame for frame in frames if frame.startswith('http')]
        if len(frames) >= 1:
            page_urls_iframe, page_texts_iframe, page_urls_all_iframe = self.URLandContentIFrame(browser, frames)
        else:
            page_urls_iframe, page_texts_iframe, page_urls_all_iframe = [], [], []
        page_urls = page_urls_click + page_urls_frame + page_urls_iframe
        page_texts = page_texts_click + page_texts_frame + page_texts_iframe
        page_urls_all = page_urls_all_click + page_urls_all_frame + page_urls_all_iframe
        return page_urls, page_texts, page_urls_all
        
    def URLandContentByClick(self, browser):
        page_texts = []
        page_urls = []
        page_urls_all = []
        all_clickable = self.all_clickable
        self.homepage_current_url = self.homepage
        self.url_domain = urlparse(self.homepage_current_url).netloc
        self.homepage_content = browser.find_element_by_xpath("/html/body").text
        
        for element in all_clickable:
            print(element)
            try:
                element = browser.find_element_by_link_text(element)
                browser.execute_script("arguments[0].target='_self'", element)
                browser.execute_script("arguments[0].click();", element)
                time.sleep(10)
                current_page_url = browser.current_url
                page_urls_all.append(current_page_url)
                if similar(urlparse(current_page_url).netloc, self.url_domain) >= 0.75:
                    page_texts.append(browser.find_element_by_xpath("/html/body").text)
                    page_urls.append(current_page_url)
                #browser.back()
                browser.get(self.homepage_current_url)
                time.sleep(5)
            except Exception:
                pass
        return page_urls, page_texts, page_urls_all
    
    
    def URLandContentFrame(self, browser, frames):
        page_texts = []
        page_urls = []
        page_urls_all = []
        #frames = browser.find_elements(By.TAG_NAME, 'frame')
        #frames = [frame.get_attribute('src') for frame in frames]
        for frame in frames:
            browser.get(frame)
            time.sleep(5)
            all_clickable_frame = get_clickable_text(browser.find_elements(By.TAG_NAME, "a"))
            for elem in all_clickable_frame:
                try:
                    element = browser.find_element_by_link_text(elem)
                    current_page_url = element.get_attribute('href')
                    page_urls_all.append(current_page_url)
                    browser.execute_script("arguments[0].target='_self'", element)
                    browser.execute_script("arguments[0].click();", element)
                    time.sleep(5)
                    if similar(urlparse(current_page_url).netloc, urlparse(frame).netloc) >= 0.75:
                        page_texts.append(browser.find_element_by_xpath("/html/body").text)
                        page_urls.append(current_page_url)
                    browser.get(frame)
                    time.sleep(5)
                except Exception:
                    pass
        return page_urls, page_texts, page_urls_all
    
    def URLandContentIFrame(self, browser, frames):
        page_texts = []
        page_urls = []
        page_urls_all = []
        #frames = browser.find_elements(By.TAG_NAME, 'iframe')
        #frames = [frame.get_attribute('src') for frame in frames]
        #frames = [frame for frame in frames if frame.startswith('http')]
        for frame in frames:
            browser.get(frame)
            time.sleep(5)
            all_clickable_frame = get_clickable_text(browser.find_elements(By.TAG_NAME, "a"))
            for elem in all_clickable_frame:
                try:
                    element = browser.find_element_by_link_text(elem)
                    current_page_url = element.get_attribute('href')
                    page_urls_all.append(current_page_url)
                    browser.execute_script("arguments[0].target='_self'", element)
                    browser.execute_script("arguments[0].click();", element)
                    time.sleep(5)
                    if similar(urlparse(current_page_url).netloc, urlparse(frame).netloc) >= 0.75:
                        page_texts.append(browser.find_element_by_xpath("/html/body").text)
                        page_urls.append(current_page_url)
                    browser.get(frame)
                    time.sleep(5)
                except Exception:
                    pass
        return page_urls, page_texts, page_urls_all
    
#%%
data_redo_lang = pd.read_csv('/Users/zqh/Documents/RIETI_new/RIETI_AI_Survey/output_data/crawl_test.csv')
data_redo_lang['hojin_id'] = data_redo_lang['hojin_id'].astype(int)
data_redo_lang = data_redo_lang.dropna()
hojin_ids = data_redo_lang['hojin_id']
url_webs = data_redo_lang['url_web']

for hojin_id, url_web in tqdm(zip(hojin_ids, url_webs)):
    hojin_id = int(hojin_id)
    try:
        browser = webdriver.Firefox(executable_path = path_to_chromedriver)
        browser.set_page_load_timeout(15)
        browser.set_script_timeout(15)
        UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15'
        options = webdriver.FirefoxOptions()
        options.add_argument('--user-agent=' + UA)
        #options.add_argument("--headless")
        try:
            web = WebPage(url_web, browser)
            time.sleep(4)
            temp = web.all_clickable
            page_urls, page_texts, page_urls_all = web.URLandContent(browser)
            homepage_content = web.homepage_content
            combined_page_texts = '\n'.join(page_texts)
            full_content = f'{homepage_content}\n\n{combined_page_texts}'
            
            hojin_id_str = str(hojin_id)
            output_file = os.path.join(out_path, f'{hojin_id_str}.txt')
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(full_content)

        except Exception as e:
            print(f"error:{str(e)}")
            
        browser.close()

    except Exception as e:
        print(f"error:{str(e)}")

0it [00:00, ?it/s]

＞＞詳細はお知らせをご覧ください。
貸会議室・貸ホール
お知らせ一覧
営業所紹介
会計事務所博覧会2023に出展します。
おすすめ情報一覧
OA機器・オフィスレイアウト
登録商標・商標
最上楽農園　夏の新メニュー　第二弾　ＮＥＷ！
総合問い合わせ
会計事務所のITサポート
ホーム
採用情報
【ホームページが生まれ変わる！作成支援サービスをご利用ください】ゆりかご倶楽部ニュース8月号
他のサービス
会計事務所のオフィスプランニング
会長挨拶
【税理士実践塾】専門知識が身に付く実践セミナーのご案内
サイトマップ
開業をお考えの方へ
総合支援サイト
個人情報保護方針　及び　特定個人情報取扱基本方針
会社概要
エッサムファミリー会 会報 2023年10月号を公開しました。
貸会議室
注意喚起：当社社員を騙る不審なメールにご注意ください
台風7号の影響によるお荷物のお届けについて
財務・税務システム
税理士業務書式文例集(令和5年改訂版)のご案内
過去の情報を見る>>>
沿革
おトクなキャンペーン一覧
情報セキュリティ基本方針
コロナ対策　オフィスの創り方　冊子
事務用品
エッサムファミリー会
会計士・税理士向け総合支援情報サイト
会長経歴・講演履歴


1it [10:35, 635.05s/it]

error:Message: Unable to locate element: /html/body



2it [10:47, 268.65s/it]

スキップしてコンテンツに移動する
the button’s
INFORMATION
INFORMATION
個人情報保護方針
■新規会員登録はこちら■
COMPANY
SAMPLE BOOK
SAMPLE BOOK
お問合せ(幸徳ボタン)
検索
BLOG
BLOG
お問合せ(the button's)
PRODUCTS
特定商取引表記
ページ内検索


3it [15:15, 305.21s/it]
