In [1]:
import os
from numpy.lib.arraysetops import ediff1d
import requests
import time
import pandas as pd
import traceback
import prettytable as pt

import pickle


In [2]:
# Display configuration: show every column and every row when a frame is printed
# (None disables pandas' truncation limits).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:

def Requests(url,headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36 Edg/89.0.774.50'}):
    """GET ``url`` with a 10 second timeout, retrying when the request fails.

    Args:
        url: Address to fetch.
        headers: HTTP headers sent with the request. The default dict is shared
            between calls; it is only read here, never modified.

    Returns:
        The ``requests.Response`` of the first successful attempt. After the
        sixth consecutive failure an empty ``requests.Response()`` is returned
        instead of raising, so callers must be prepared for a response that
        carries no body.
    """
    max_errors = 5
    error_time = 0
    while True:
        try:
            return requests.get(url,headers=headers,timeout=10)
        # Only network/HTTP-level failures are retried. A bare ``except`` would
        # also swallow KeyboardInterrupt and make the scrape impossible to stop.
        except requests.exceptions.RequestException:
            error_time += 1
            print("reconnect：",error_time)
            # Give up before sleeping: there is no next attempt to wait for.
            if error_time > max_errors:
                return requests.Response()
            time.sleep(2)


In [4]:
def Int(num_str):
    """Convert ``num_str`` to ``int``, falling back to 0 when it is not convertible.

    Args:
        num_str: Value to convert, typically a string field from a scraped row.

    Returns:
        ``int(num_str)``, or 0 when the value is empty, non-numeric, ``None``
        or a non-finite float.
    """
    try:
        return int(num_str)
    # Only conversion failures map to 0; anything else (e.g. KeyboardInterrupt)
    # propagates instead of being silently turned into a count of zero.
    except (TypeError, ValueError, OverflowError):
        return 0

In [5]:
# General function to scrape the announcement data from Eastmoney
def get_data(page_size=50000):
    """Download the announcement list from Eastmoney and cache it as a pickle.

    Pages are requested one after another (at most 999) until the response
    reports a ``TotalPage`` of 0. Each row of a page is a ``|``-separated
    string whose field positions are looked up in the page's ``FieldName``
    header.

    Args:
        page_size: Maximum number of announcements requested per page.

    Returns:
        A DataFrame indexed by meeting date and sorted by it, with the columns
        ``code``, ``name``, ``no. of attending institutions``, ``meeting date``,
        ``announcement date``, ``year`` and ``month``. The same frame is written
        to ``all_data.pkl`` in the working directory.

    Raises:
        Whatever ``r.json()`` raises when a response body is not valid JSON
        (for example the empty response returned after repeated failures).
    """
    columns = ["code","name","no. of attending institutions","meeting date","announcement date"]
    # Rows are collected in a plain list and turned into a frame once at the end.
    # Growing a DataFrame with ``df.loc[n] = ...`` copies the data on every row
    # and upcasts the integer institution counts to float.
    records = []
    for i in range(1,1000):
        url = f"http://datainterface3.eastmoney.com/EM_DataCenter_V3/api/JGDYHZ/GetJGDYMX?tkn=eastmoney&secuCode=&sortfield=0&sortdirec=1&pageNum={i}&pageSize={page_size}&cfg=jgdyhz&p=3&pageNo={i}"
        r = Requests(url)
        js = r.json()
        page = js["Data"][0]
        if page["TotalPage"] == 0: #go through the whole database
            break
        print(f"{i*page_size}announcements scrapped.....\r",end="")

        # Field order is taken from the response itself rather than assumed.
        header = page["FieldName"].split(",")
        code_index = header.index("SCode")
        name_index = header.index("SName")
        No_index = header.index("OrgSum")
        meet_date_index = header.index("StartDate")
        notice_date_index = header.index("NoticeDate")

        for row in page["Data"]:
            row_list = row.split("|")
            records.append([
                row_list[code_index],
                row_list[name_index],
                Int(row_list[No_index]),
                pd.Timestamp(row_list[meet_date_index]),
                row_list[notice_date_index],
            ])
            print(f"reading{len(records) - 1}announcement.....\r",end="")

    df = pd.DataFrame(records, columns=columns)
    # Guarantees a datetime column (and therefore a DatetimeIndex) even when no
    # rows were downloaded, so the .year/.month accessors below always exist.
    df["meeting date"] = pd.to_datetime(df["meeting date"])
    df.index = df["meeting date"]
    print("")
    df["year"] = df.index.year
    df["month"] = df.index.month
    df = df.sort_index()
    with open("all_data.pkl","wb") as f:
        pickle.dump(df,f)
    return df

In [6]:
# Task 2: sum the number of attending institutions per month for a given stock and year
def get_scode(df,scode,year):
    """Print the monthly total of attending institutions for one stock and year.

    All rows of ``df`` matching ``scode`` and ``year`` are first saved to
    ``{scode}_{year}annual meetings.csv``; then a 12-row table (one row per
    calendar month, 0 for months without meetings) is printed.

    Args:
        df: Frame with at least the columns ``code``, ``name``, ``year``,
            ``month`` and ``no. of attending institutions``.
        scode: Stock code, compared for equality against the ``code`` column.
        year: Year to report; converted with ``int()`` for filtering and shown
            in the table exactly as passed in.

    Raises:
        ValueError: If ``year`` cannot be converted to ``int``.
    """
    df_son = df[(df["code"] == scode) & (df["year"]==int(year))]
    #save all meetings from a given year
    df_son.to_csv(f"{scode}_{year}annual meetings.csv",encoding="utf_8_sig")

    months = list(range(1,13))
    monthly_sum = df_son["no. of attending institutions"].groupby(df_son["month"]).sum()
    # .iloc is an explicit positional lookup; ``series[0]`` on a date-indexed
    # Series relies on a deprecated fallback. A stock without meetings in the
    # requested year yields an empty name and an all-zero table, not an IndexError.
    name = df_son["name"].iloc[0] if not df_son.empty else ""

    result_df = pd.DataFrame(
        {
            "code": scode,
            "name": name,
            "year": year,
            "month": months,
            # reindex adds the months without meetings as NaN, fillna turns them
            # into 0; assignment-based, so it also works under Copy-on-Write.
            "sum": monthly_sum.reindex(months).fillna(0).to_numpy(),
        },
        index=pd.Index(months, name="month"),
    )

    #output table
    tb = pt.PrettyTable()
    for col in result_df.columns:
        tb.add_column(col, list(result_df[col]))
    print(tb)
    

In [7]:
#task 1: present all meetings on one single day
def get_day(df,date):
    """Print a table of all rows of ``df`` whose index equals ``date``.

    Args:
        df: Frame indexed by meeting date that contains the five announcement
            columns listed below.
        date: Value compared against the index (``main`` passes the
            ``YYYYMMDD`` string typed by the user).
    """
    wanted = ["code","name","no. of attending institutions","meeting date","announcement date"]
    same_day = df.loc[df.index == date, wanted]

    table = pt.PrettyTable()
    # The index (meeting date) goes first, followed by the data columns in order.
    table.add_column('dates',same_day.index)
    for header in wanted:
        table.add_column(header, list(same_day[header]))
    print(table)


In [None]:
def main():
    """Interactive entry point: load (or download) the data, then answer queries.

    The data comes from ``all_data.pkl`` when that file exists, otherwise from
    ``get_data()``. The user is then prompted repeatedly:

    * ``code+year`` (e.g. ``601139+2021``) prints the monthly table for a stock,
    * an 8-character date (``YYYYMMDD``) prints all meetings on that day,
    * ``q``, ``quit`` or ``exit`` (or EOF / Ctrl-C at the prompt) ends the loop.

    Errors raised while answering a query are printed as a traceback and the
    loop continues with the next prompt.
    """
    print("*"*5,"Eastmoney data scraper","*"*5)
    if os.path.exists("all_data.pkl"):
        print("Data loaded")
        # NOTE: pickle.load can execute arbitrary code; only load a cache file
        # that this program wrote itself.
        with open("all_data.pkl","rb") as f:
            df = pickle.load(f)
    else:
        print("Downloading...")
        df = get_data()

    print("Enter q to quit.")
    while True:
        try:
            input_str = input("Input date(YYYYMMDD) to scrap all meetings today or code+year(xxxxxx+YYYY) for meetings of a specific stock in a year:")
        except (EOFError, KeyboardInterrupt):
            # A closed stdin would otherwise raise EOFError on every iteration
            # and spin forever inside the generic handler below.
            print("")
            break

        input_str = input_str.strip()
        if input_str.lower() in ("q","quit","exit"):
            break

        try:
            if "+" in input_str:
                parts = input_str.split("+")
                scode = parts[0].strip()
                year = parts[1].strip()
                print(f"scrapping all meetings for {scode} on {year}")
                get_scode(df,scode,year)
            elif len(input_str) == 8:
                print(f"scraping all meetings on {input_str}")
                get_day(df,input_str)
            else:
                print("Unrecognised input: expected YYYYMMDD or code+year, e.g. 20210804 or 601139+2021")
        except Exception:
            # Keep the prompt alive after a failed query; show what went wrong.
            print(traceback.format_exc())

# Start the interactive prompt only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

***** Eastmoney data scraper *****
Data loaded


Input date(YYYYMMDD) to scrap all meetings today or code+year(xxxxxx+YYYY) for meetings of a specific stock in a year: 601139+2021


scrapping all meetings for 601139 on 2021
+--------+----------+------+-------+-----+
|  code  |   name   | year | month | sum |
+--------+----------+------+-------+-----+
| 601139 | 深圳燃气 | 2021 |   1   | 5.0 |
| 601139 | 深圳燃气 | 2021 |   2   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   3   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   4   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   5   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   6   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   7   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   8   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   9   | 0.0 |
| 601139 | 深圳燃气 | 2021 |   10  | 0.0 |
| 601139 | 深圳燃气 | 2021 |   11  | 0.0 |
| 601139 | 深圳燃气 | 2021 |   12  | 0.0 |
+--------+----------+------+-------+-----+


Input date(YYYYMMDD) to scrap all meetings today or code+year(xxxxxx+YYYY) for meetings of a specific stock in a year: 300750+2021


scrapping all meetings for 300750 on 2021
+--------+----------+------+-------+------+
|  code  |   name   | year | month | sum  |
+--------+----------+------+-------+------+
| 300750 | 宁德时代 | 2021 |   1   | 0.0  |
| 300750 | 宁德时代 | 2021 |   2   | 0.0  |
| 300750 | 宁德时代 | 2021 |   3   | 0.0  |
| 300750 | 宁德时代 | 2021 |   4   | 0.0  |
| 300750 | 宁德时代 | 2021 |   5   | 34.0 |
| 300750 | 宁德时代 | 2021 |   6   | 0.0  |
| 300750 | 宁德时代 | 2021 |   7   | 0.0  |
| 300750 | 宁德时代 | 2021 |   8   | 0.0  |
| 300750 | 宁德时代 | 2021 |   9   | 0.0  |
| 300750 | 宁德时代 | 2021 |   10  | 0.0  |
| 300750 | 宁德时代 | 2021 |   11  | 0.0  |
| 300750 | 宁德时代 | 2021 |   12  | 0.0  |
+--------+----------+------+-------+------+


Input date(YYYYMMDD) to scrap all meetings today or code+year(xxxxxx+YYYY) for meetings of a specific stock in a year: 20210807


scraping all meetings on 20210807
+-------+------+------+-------------------------------+--------------+-------------------+
| dates | code | name | no. of attending institutions | meeting date | announcement date |
+-------+------+------+-------------------------------+--------------+-------------------+
+-------+------+------+-------------------------------+--------------+-------------------+


Input date(YYYYMMDD) to scrap all meetings today or code+year(xxxxxx+YYYY) for meetings of a specific stock in a year: 20210804


scraping all meetings on 20210804
+---------------------+--------+----------+-------------------------------+---------------------+-------------------+
|        dates        |  code  |   name   | no. of attending institutions |     meeting date    | announcement date |
+---------------------+--------+----------+-------------------------------+---------------------+-------------------+
| 2021-08-04 00:00:00 | 001203 | 大中矿业 |              1.0              | 2021-08-04 00:00:00 |     2021-08-05    |
| 2021-08-04 00:00:00 | 002860 |  星帅尔  |              4.0              | 2021-08-04 00:00:00 |     2021-08-06    |
| 2021-08-04 00:00:00 | 002688 | 金河生物 |              1.0              | 2021-08-04 00:00:00 |     2021-08-05    |
| 2021-08-04 00:00:00 | 300441 | 鲍斯股份 |              2.0              | 2021-08-04 00:00:00 |     2021-08-05    |
| 2021-08-04 00:00:00 | 002345 |  潮宏基  |              2.0              | 2021-08-04 00:00:00 |     2021-08-05    |
| 2021-08-04 00:00:00 | 002266 | 浙富控股 | 