# モジュール

In [15]:
import pandas as pd
import re
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 30)

In [19]:
kihon_df_list = []
suii_code_df_list = []

for i in tqdm(range(1300, 9998), desc="Processing"):
    
    try:
        # スクレイピング対象のURL
        url = f'https://kabutan.jp/stock/?code={str(i)}'

        # URLからデータを取得
        response = requests.get(url)
        response.encoding = 'utf-8'

        # BeautifulSoupでHTMLを解析
        soup = BeautifulSoup(response.text, 'html.parser')

        # テキスト要素を抽出
        text_elements = soup.get_text()

        # 不要な空白や改行を除外
        clean_text = re.sub(r'\s+', ' ', text_elements).strip() 

        # 空白区切りで単語に分ける
        words_list = clean_text.split()

        if "大株主" not in words_list:
            continue

        # それぞれの単語のインデックスを取得
        realtime_index = words_list.index("リアルタイムに変更")
        hikaku_index = words_list.index("比較される銘柄")

        # インデックス番号の間にあるワードを抽出
        taishaku_info = words_list[realtime_index + 1 : hikaku_index]

        # 最初の3要素を抜き出す
        first_four = taishaku_info[:3]

        # 「業績」というワードの次の要素を抜き出す
        index_of_gyouseki = taishaku_info.index('業績')
        gyousyu = taishaku_info[index_of_gyouseki + 1]

        # 最後の6要素を抜き出す
        last_six = taishaku_info[-6:]
        last_six.remove("時価総額")

        # 全てを一つのリストに格納
        cleaned_taishaku_info = first_four + [gyousyu] + last_six

        columns = ['証券コード', '銘柄', '市場', '業種', 'PER', 'PBR', '利回り', '信用倍率', '時価総額']

        taishaku_df = pd.DataFrame([cleaned_taishaku_info], columns=columns)

        # 目的の単語
        gonitisen = "5日線"
        happyoubi = "発表日"
        tansin = "直近の決算短信"

        if gonitisen in words_list:
            index_gonitisen = words_list.index(gonitisen)
            trend_list = words_list[index_gonitisen + 4 : index_gonitisen + 8]  # インデックス+5から+8までを含む
        else:
            trend_list = []

        if happyoubi in words_list and tansin in words_list:
            index_happyoubi = words_list.index(happyoubi)
            index_tansin = words_list.index(tansin)
            if index_happyoubi < index_tansin:
                suii_list = words_list[index_happyoubi + 1 : index_tansin] 
            else:
                suii_list = []
        else:
            suii_list = []
            
        suii_list = [x for x in suii_list if x not in ('予', 'I')]

        trend_df = pd.DataFrame([trend_list], columns=['5日線', '25日線', '75日線', '200日線'])

        # suii_listの長さを7の倍数に調整する
        while len(suii_list) % 7 != 0:
            suii_list.append(None)  # 足りない部分をNoneで埋める
        suii_array = np.array(suii_list).reshape(-1, 7)

        # データフレームに変換し、列名を設定
        suii_df = pd.DataFrame(suii_array, columns=['決算期', '売上高', '経常益', '最終益', '１株益', '１株配', '発表日'])

        kihon_df = pd.concat([taishaku_df, trend_df], axis=1)
        kihon_df_list.append(kihon_df)

        shouken_code = pd.DataFrame({'証券コード': np.repeat(kihon_df['証券コード'].values, len(suii_df))})
        # suii_dfとkihon_df_repeatedを結合
        suii_code_df = pd.concat([shouken_code.reset_index(drop=True), suii_df.reset_index(drop=True)], axis=1)
        suii_code_df_list.append(suii_code_df)
        
    except Exception as e:
        print(f"An error occurred for code {i}: {e}")
        continue  # エラーが発生した場合は次のループに進む

DATA_DIR = '/Users/yuseiito/Desktop/stock/data/'
all_kihon_df = pd.concat(kihon_df_list, ignore_index=True)
all_suii_code_df = pd.concat(suii_code_df_list, ignore_index=True)
all_kihon_df.to_csv(f'{DATA_DIR}/basic_info.csv', index=False, encoding='utf-8-sig')
all_suii_code_df.to_csv(f'{DATA_DIR}/performance_trend.csv', index=False, encoding='utf-8-sig')


Processing:  33%|███▎      | 2898/8698 [08:05<18:01,  5.36it/s] 

An error occurred for code 4197: 4 columns passed, passed data had 0 columns


Processing:  40%|████      | 3513/8698 [09:52<11:04,  7.80it/s]

An error occurred for code 4811: 4 columns passed, passed data had 0 columns


Processing:  41%|████▏     | 3598/8698 [10:05<14:22,  5.91it/s]

An error occurred for code 4896: 4 columns passed, passed data had 0 columns


Processing:  49%|████▉     | 4297/8698 [11:44<11:25,  6.42it/s]

An error occurred for code 5595: 4 columns passed, passed data had 0 columns
An error occurred for code 5596: 4 columns passed, passed data had 0 columns


Processing:  49%|████▉     | 4298/8698 [11:44<11:48,  6.21it/s]

An error occurred for code 5597: 4 columns passed, passed data had 0 columns


Processing:  49%|████▉     | 4301/8698 [11:44<10:30,  6.97it/s]

An error occurred for code 5599: 4 columns passed, passed data had 0 columns


Processing:  50%|████▉     | 4318/8698 [11:47<10:01,  7.28it/s]

An error occurred for code 5616: 4 columns passed, passed data had 0 columns


Processing:  50%|████▉     | 4320/8698 [11:48<11:42,  6.23it/s]

An error occurred for code 5618: 4 columns passed, passed data had 0 columns
An error occurred for code 5619: 4 columns passed, passed data had 0 columns


Processing:  50%|████▉     | 4323/8698 [11:48<11:02,  6.60it/s]

An error occurred for code 5621: 4 columns passed, passed data had 0 columns


Processing:  52%|█████▏    | 4548/8698 [12:18<09:46,  7.08it/s]

An error occurred for code 5845: 4 columns passed, passed data had 0 columns


Processing:  52%|█████▏    | 4559/8698 [12:20<11:18,  6.10it/s]

An error occurred for code 5858: 4 columns passed, passed data had 0 columns


Processing:  53%|█████▎    | 4569/8698 [12:21<09:57,  6.91it/s]

An error occurred for code 5867: 4 columns passed, passed data had 0 columns
An error occurred for code 5868: 4 columns passed, passed data had 0 columns


Processing:  53%|█████▎    | 4571/8698 [12:21<10:52,  6.32it/s]

An error occurred for code 5869: 4 columns passed, passed data had 0 columns
An error occurred for code 5870: 4 columns passed, passed data had 0 columns


Processing:  53%|█████▎    | 4573/8698 [12:21<09:59,  6.88it/s]

An error occurred for code 5871: 4 columns passed, passed data had 0 columns


Processing:  53%|█████▎    | 4589/8698 [12:23<09:07,  7.51it/s]

An error occurred for code 5888: 4 columns passed, passed data had 0 columns


Processing:  53%|█████▎    | 4590/8698 [12:24<12:46,  5.36it/s]

An error occurred for code 5889: 4 columns passed, passed data had 0 columns


Processing:  53%|█████▎    | 4593/8698 [12:24<11:36,  5.90it/s]

An error occurred for code 5891: 4 columns passed, passed data had 0 columns
An error occurred for code 5892: 4 columns passed, passed data had 0 columns


Processing:  60%|██████    | 5227/8698 [14:16<10:20,  5.60it/s]

An error occurred for code 6525: 4 columns passed, passed data had 0 columns


Processing:  91%|█████████ | 7872/8698 [21:29<02:27,  5.59it/s]

An error occurred for code 9170: 4 columns passed, passed data had 0 columns


Processing:  91%|█████████▏| 7937/8698 [21:38<02:05,  6.05it/s]

An error occurred for code 9235: 4 columns passed, passed data had 0 columns
An error occurred for code 9236: 4 columns passed, passed data had 0 columns


Processing:  91%|█████████▏| 7939/8698 [21:39<02:16,  5.56it/s]

An error occurred for code 9237: 4 columns passed, passed data had 0 columns
An error occurred for code 9238: 4 columns passed, passed data had 0 columns


Processing: 100%|██████████| 8698/8698 [23:42<00:00,  6.11it/s]
