In [16]:
import sys,os
sys.path.append("..")
import django
django.setup()
import datetime
import pandas as pd
import requests
from io import StringIO
from crawlers.finlab.data_process_tools import last_month

In [20]:
class CrawlMonthlyRevnueTW:
    """Crawler for the monthly revenue tables (台股月營收資訊) on mops.twse.com.tw.

    One HTML page is downloaded per sub-market listed in ``self.sub_market`` and
    the pages are merged into a single DataFrame with English column names.

    Attributes:
        date: Run date; any object exposing ``.year`` and ``.month``.  The page
            that is requested is the one for ``last_month(date)``, while the
            ``date`` column of the result is the 10th of ``date``'s own month.
        target_name: Human-readable name of the data set.
        sub_market: Sub-market path segments used to build the page URLs.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
    """

    # Browser-like User-Agent sent with every request.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/39.0.2171.95 Safari/537.36'}

    def __init__(self, date, timeout=30):
        """Store the run date and the crawl configuration.

        Args:
            date: Run date (``datetime.date`` / ``datetime.datetime`` or similar).
            timeout: Per-request timeout in seconds.  Without one, a stalled
                server would block ``crawl_main`` indefinitely.
        """
        self.date = date
        self.target_name = "台股月營收資訊"
        self.sub_market = ["sii", "otc", "rotc"]
        self.timeout = timeout

    @staticmethod
    def _build_url(market, url_date):
        """Return the page URL for one sub-market and month.

        The year in the URL is the Gregorian year minus 1911.
        """
        return 'https://mops.twse.com.tw/nas/t21/{}/t21sc03_{}_{}.html'.format(
            market, url_date.year - 1911, url_date.month)

    def _parse_market_page(self, html_text):
        """Turn one downloaded page into a frame indexed by (stock_id, date).

        Raises whatever pandas raises when the page holds no usable table; the
        caller treats any exception as a failed crawl.
        """
        tables = pd.read_html(StringIO(html_text))

        # Either the first table already holds everything (more than 500 rows),
        # or the data is spread over several tables of 6 to 11 columns.
        if tables[0].shape[0] > 500:
            df = tables[0].copy()
        else:
            df = pd.concat([t for t in tables if 5 < t.shape[1] <= 11])

        if isinstance(df.columns, pd.MultiIndex):
            # Two-level header: the second level carries the column names.
            df.columns = df.columns.get_level_values(1)
        else:
            # No parsed header: keep the first ten columns and promote the first
            # row whose first cell is '公司代號' to be the header.  The row is
            # selected with a boolean mask so that index labels and positions
            # are never mixed up.
            df = df[list(range(0, 10))].copy()
            df.columns = df[df[0] == '公司代號'].iloc[0]

        # Rows without a numeric monthly revenue (repeated headers, separators)
        # and the '合計' rows are dropped.
        df['當月營收'] = pd.to_numeric(df['當月營收'], errors='coerce')
        df = df[df['當月營收'].notnull()]
        # .copy() so that the column assignment below works on an independent
        # frame instead of a slice (avoids SettingWithCopyWarning).
        df = df[df['公司代號'] != '合計'].copy()

        df['date'] = datetime.date(self.date.year, self.date.month, 10)

        df = df.rename(columns={'公司代號': 'stock_id'})
        return df.set_index(['stock_id', 'date'])

    def crawl_main(self):
        """Download and merge the monthly revenue tables of every sub-market.

        Returns:
            A DataFrame with columns ``stock_id``, ``date``, ``stock_name``,
            the revenue columns and ``note``; or ``None`` when any sub-market
            page cannot be downloaded or parsed (the error is printed), or when
            ``self.sub_market`` is empty.
        """
        # last_month is a project helper; presumably it returns the month
        # preceding self.date — only its .year / .month are used here.
        url_date = last_month(self.date)
        frames = []
        for market in self.sub_market:
            url = self._build_url(market, url_date)
            try:
                r = requests.get(url, headers=self._HEADERS, timeout=self.timeout)
                # Surface HTTP errors as such instead of as a parse failure.
                r.raise_for_status()
                r.encoding = 'big5'
                frames.append(self._parse_market_page(r.text))
            except Exception as e:
                # All-or-nothing: one failing sub-market aborts the whole crawl.
                print(e)
                print('**WARRN: Pandas cannot find any table in the HTML file')
                return None

        if not frames:
            # pd.concat raises on an empty list.
            return None

        df = pd.concat(frames)
        if '備註' not in df.columns:
            df['備註'] = None
        # Everything between the first column and the last one is coerced to
        # numbers; unparsable cells become NaN.
        df.iloc[:, 1:-1] = df.iloc[:, 1:-1].apply(lambda s: pd.to_numeric(s, errors='coerce'))
        df = df[df['公司名稱'] != '總計']
        # Replace missing values with None.
        # NOTE(review): some pandas versions keep NaN in float columns here —
        # confirm against the installed version if None is required downstream.
        df = df.where(pd.notnull(df), None)
        # NOTE(review): '去年累計營收' is mapped to "cm_last_month_rev"; the name
        # looks inconsistent with the source column but is kept unchanged so the
        # output schema stays the same — confirm against the consumer.
        df = df.rename(columns={'公司名稱': "stock_name", "當月營收": "this_month_rev",
                                '上月營收': "last_month_rev", "去年當月營收": "last_year_rev",
                                '上月比較增減(%)': "cp_last_month_rev", "去年同月增減(%)": "cp_last_year_rev",
                                '當月累計營收': "cm_this_month_rev", "去年累計營收": "cm_last_month_rev",
                                '前期比較增減(%)': "cp_cm_rev", "備註": "note",
                                })
        df = df.reset_index()

        return df

In [22]:
# Build a crawler for a run date of 2020-04-01 and run it; the call is the
# cell's last expression, so the notebook renders whatever it returns.
C = CrawlMonthlyRevnueTW(date=datetime.datetime(year=2020, month=4, day=1))
C.crawl_main()

Unnamed: 0,stock_id,date,stock_name,this_month_rev,last_month_rev,last_year_rev,cp_last_month_rev,cp_last_year_rev,cm_this_month_rev,cm_last_month_rev,cp_cm_rev,note
0,1101,2020-04-10,台泥,9473250,5000692,10876929,89.43,-12.9,21976083,25356331,-13.33,-
1,1102,2020-04-10,亞泥,5077457,2851105,8318969,78.08,-38.96,13138881,19390605,-32.24,-
2,1103,2020-04-10,嘉泥,179493,150813,166883,19.01,7.55,495334,452391,9.49,-
3,1104,2020-04-10,環泥,479717,414075,437140,15.85,9.73,1253758,1187296,5.59,-
4,1108,2020-04-10,幸福,418037,390288,313093,7.1,33.51,1104720,834347,32.4,-
5,1109,2020-04-10,信大,488590,199776,541582,144.56,-9.78,1165081,1360434,-14.35,-
6,1110,2020-04-10,東泥,146412,115270,147819,27.01,-0.95,387944,417039,-6.97,-
7,1201,2020-04-10,味全,1423557,1108658,1598137,28.4,-10.92,3974342,4540400,-12.46,-
8,1203,2020-04-10,味王,591294,539918,514903,9.51,14.83,1675725,1502182,11.55,-
9,1210,2020-04-10,大成,6610246,6001214,6130106,10.14,7.83,19100652,18278107,4.5,-


In [None]:
# PyCharm test: put the parent directory on sys.path and initialise Django
# before star-importing everything from crawlers.models.
# NOTE(review): the two star-imports at the bottom are commented out, so names
# they would provide are not available to later cells.
import sys,os
sys.path.append("..")
import django
django.setup()
from crawlers.models import *
# from crawlers.finlab.pioneers import *
# from crawlers.finlab.import_tools import *

In [None]:
# Calls add_to_sql with the CompanyBasicInfoTW name and a DataFrame `df`;
# presumably this writes `df` into that model's table — verify.
# NOTE(review): in the cells visible here no notebook-level `df` is assigned
# (the only `df` is a local inside CrawlMonthlyRevnueTW.crawl_main), and the
# `crawlers.finlab.import_tools` star-import is commented out, so `add_to_sql`
# is in scope only if `crawlers.models` exports it. Confirm both names are
# defined before running this cell on a fresh kernel.
add_to_sql(CompanyBasicInfoTW, df)