## This is the notebook of DataProcessing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
# Ticker codes of the 50 candidate stocks. They stay strings because they are
# interpolated verbatim into the dataset file names.
Stock_list = """
    600000 600009 600016 600028 600030 600031
    600036 600048 600050 600104 600196 600276
    600309 600438 600519 600547 600570 600585
    600588 600690 600703 600745 600809 600837
    600887 600893 600918 601012 601066 601088
    601138 601166 601211 601288 601318 601336
    601398 601601 601628 601668 601688 601818
    601857 601888 601899 601995 603259 603288
    603501 603986
""".split()

First, we need the time interval covered by each stock's data, since the last 300 news items span a different period for each stock. The interval shared by all the stocks (i.e. where their periods overlap) is the one we use to build the dataset.

In [4]:
def interval_getter(stock):
    """Return the ``(start_day, end_day)`` pair for one stock's news file.

    Reads ``dataset/{stock}NLPChinese.csv`` and takes the ``Time`` value of
    the last row as the start and of the first row as the end. Nothing is
    sorted here, so this presumably relies on the file being ordered
    newest-first -- verify against the data.
    """
    times = pd.read_csv(f"dataset/{stock}NLPChinese.csv")["Time"]
    return times.iloc[-1], times.iloc[0]

From here, we can see that the starting dates of the stocks' time intervals vary widely. To make sure the data covers a long enough period, we need to drop the stocks whose time interval is too short.

In [5]:
# A stock whose earliest news item is later than this date covers too short a
# period and is dropped. The comparison below is a plain string comparison,
# which assumes the Time values are zero-padded ISO dates (YYYY-MM-DD).
LATEST_ALLOWED_START = "2020-01-01"

drop_list = []
for stock in tqdm(Stock_list):
    # Only the start day matters for this filter.
    begin_d = interval_getter(stock)[0]
    if begin_d > LATEST_ALLOWED_START:
        drop_list.append(stock)

# Report after the loop: printing inside it interleaves with the tqdm bar and
# splits the progress output into fragments.
for stock in drop_list:
    print(f"{stock} need to be deleted.")

 42%|████▏     | 21/50 [00:00<00:00, 103.90it/s]

600028 need to be deleted.
600030 need to be deleted.
600050 need to be deleted.
600104 need to be deleted.
600196 need to be deleted.
600276 need to be deleted.
600438 need to be deleted.
600519 need to be deleted.
600837 need to be deleted.


100%|██████████| 50/50 [00:00<00:00, 106.92it/s]

600918 need to be deleted.
601012 need to be deleted.
601066 need to be deleted.
601166 need to be deleted.
601211 need to be deleted.
601318 need to be deleted.
601628 need to be deleted.
601668 need to be deleted.
601899 need to be deleted.
601995 need to be deleted.





In [6]:
# Keep every stock that was not marked for removal, preserving the order of
# Stock_list. The set is only there for fast membership checks; no progress bar
# is shown because this step is instantaneous.
dropped = set(drop_list)
final_stock_list = [stock for stock in Stock_list if stock not in dropped]

100%|██████████| 50/50 [00:00<?, ?it/s]


In [7]:
# Number of stocks that remain after the short-interval ones were removed.
len(final_stock_list)

31

In [71]:
# Query each remaining stock once, then separate the start days from the end days.
intervals = [interval_getter(stock) for stock in tqdm(final_stock_list)]
begin_day_list = [first_day for first_day, _last_day in intervals]
end_day_list = [last_day for _first_day, last_day in intervals]

# The latest start is the first day from which every stock has data.
interval_start = max(begin_day_list)
# we still use the latest date as the end of the interval
interval_end = max(end_day_list)
print(interval_start, interval_end)

100%|██████████| 31/31 [00:00<00:00, 550.38it/s]

2019-12-18 2021-12-01





So, the interval is 2019-12-18 to 2021-12-01.

The training set is 2019-12-18 to 2021-02-28 (about 14 months).

The validation set is 2021-03-01 to 2021-05-31 (3 months).

The testing set is 2021-06-01 to 2021-12-01 (6 months).

In [94]:
def _slice_period(frame, first_day, last_day):
    """Return the rows of ``frame`` whose ``Time`` lies in [first_day, last_day].

    Both bounds are inclusive and compared as strings, so they must be
    zero-padded ISO dates (``YYYY-MM-DD``); an unpadded bound such as
    ``"2021-12-1"`` would also match 2021-12-02 .. 2021-12-09. The selected
    rows are returned in reversed order with a fresh 0..n-1 index --
    presumably to turn a newest-first source file into chronological order
    (confirm against the data).
    """
    in_period = frame["Time"].between(first_day, last_day)
    return frame[in_period].iloc[::-1].reset_index(drop=True)


# Split every remaining stock's news file into train / validation / test files.
for stock in tqdm(final_stock_list):
    data = pd.read_csv(f"dataset/{stock}NLPChinese.csv")
    data_train = _slice_period(data, "2019-12-18", "2021-02-28")
    data_valid = _slice_period(data, "2021-03-01", "2021-05-31")
    data_test = _slice_period(data, "2021-06-01", "2021-12-01")
    # utf_8_sig is kept from the original code: it writes a BOM at the start of each file.
    data_train.to_csv(f"data_cleaned/Train/{stock}NLP_Train_cleaned.csv", index=False, encoding="utf_8_sig")
    data_test.to_csv(f"data_cleaned/Test/{stock}NLP_Test_cleaned.csv", index=False, encoding="utf_8_sig")
    data_valid.to_csv(f"data_cleaned/Valid/{stock}NLP_Valid_cleaned.csv", index=False, encoding="utf_8_sig")

100%|██████████| 31/31 [00:00<00:00, 167.93it/s]


In [97]:
# Total number of rows across all per-stock training files.
train_num = sum(
    len(pd.read_csv(f"data_cleaned/Train/{stock}NLP_Train_cleaned.csv"))
    for stock in tqdm(final_stock_list)
)
train_num

100%|██████████| 31/31 [00:00<00:00, 765.07it/s]


4045

In [100]:
# Total number of rows across all per-stock validation files.
valid_num = sum(
    len(pd.read_csv(f"data_cleaned/Valid/{stock}NLP_Valid_cleaned.csv"))
    for stock in tqdm(final_stock_list)
)
valid_num

100%|██████████| 31/31 [00:00<00:00, 132.35it/s]


982

In [101]:
# Total number of rows across all per-stock test files.
test_num = sum(
    len(pd.read_csv(f"data_cleaned/Test/{stock}NLP_Test_cleaned.csv"))
    for stock in tqdm(final_stock_list)
)
test_num

100%|██████████| 31/31 [00:00<00:00, 151.89it/s]


1474

In [8]:
# Display the stocks that were kept and used to build the cleaned split files.
final_stock_list

['600000',
 '600009',
 '600016',
 '600031',
 '600036',
 '600048',
 '600309',
 '600547',
 '600570',
 '600585',
 '600588',
 '600690',
 '600703',
 '600745',
 '600809',
 '600887',
 '600893',
 '601088',
 '601138',
 '601288',
 '601336',
 '601398',
 '601601',
 '601688',
 '601818',
 '601857',
 '601888',
 '603259',
 '603288',
 '603501',
 '603986']