In [1]:
#%% import packages
import pandas as pd
import pandas_datareader as pdr
from datetime import datetime
import time
import copy
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [2]:
#%% var definitions
# Ten of the 0050 constituent stocks.
stock_list = [
    2330, 2454, 2317, 2303, 2308,
    1301, 2412, 1303, 2891, 3008,
]
# Ten of the 0050 constituent stocks; identical to stock_list except 2882 replaces 2412.
stock_list_2 = [
    2330, 2454, 2317, 2303, 2308,
    1301, 2882, 1303, 2891, 3008,
]
# First date requested when price data is fetched.
start = datetime(year=2009, month=1, day=1)
# Empty frame holding the combined price data of the 10 stocks.
df = pd.DataFrame(
    columns=["Date", "tic", "High", "Low", "Open", "Close", "Volume", "Adj Close"]
)

## Get VIX

In [50]:
# Download the '^VIX' series from the 'yahoo' source beginning at `start`, then
# reduce it to three columns: the date (moved out of the index into `datadate`),
# Open and Close, on a fresh 0..n-1 index.
vix = (
    pdr.DataReader('^VIX', 'yahoo', start)
    .drop(columns=['High', 'Low', 'Volume', 'Adj Close'])
    .assign(datadate=lambda frame: frame.index)
    .reset_index()
    .drop(columns=['Date'])
    .loc[:, ['datadate', 'Open', 'Close']]
)
# Disabled alternative: store the dates as YYYYMMDD integers instead.
# date_li=vix.datadate
# date_li=[int(ele.strftime("%Y%m%d")) for ele in date_li]
# vix['datadate']=date_li

In [51]:
# Preview the first rows of the reshaped VIX frame (datadate, Open, Close).
vix.head()

Unnamed: 0,datadate,Open,Close
0,2008-12-31,41.630001,40.0
1,2009-01-02,39.580002,39.189999
2,2009-01-05,39.240002,39.080002
3,2009-01-06,38.060001,38.560001
4,2009-01-07,40.290001,43.389999


In [55]:
# Save the VIX frame (without its index) and immediately load it back from disk.
# NOTE(review): read_csv is called without parse_dates, so `datadate` presumably
# comes back as plain strings; the later filters compare it against string bounds
# and appear to rely on that — confirm before removing this round trip.
vix.to_csv('./data/VIX.csv',index=False)
vix=pd.read_csv('./data/VIX.csv')

## Check that the VIX dates match the stock trading dates

In [135]:
# Load the processed stock data and derive, for both the stock data and the VIX
# frame, the unique dates inside the validation window and the training window.
from config import config
import pandas as pd

data = pd.read_csv('done_data.csv')

# Validation window: VALID_START < date <= VALID_END (bounds come from config).
unique_trade_date = data.loc[
    lambda frame: (frame.datadate > config.VALID_START) & (frame.datadate <= config.VALID_END),
    'datadate',
].unique()

# Training window: "2009-00-00" <= date < "2016-01-00". The bounds are not real
# calendar dates; they are only used as comparison limits.
train_date = data.loc[
    lambda frame: (frame.datadate >= "2009-00-00") & (frame.datadate < "2016-01-00"),
    'datadate',
].unique()

# Same two windows applied to the VIX frame.
vix_trade_date = vix.loc[
    lambda frame: (frame.datadate > config.VALID_START) & (frame.datadate <= config.VALID_END),
    'datadate',
].unique()
vix_train_date = vix.loc[
    lambda frame: (frame.datadate >= "2009-00-00") & (frame.datadate < "2016-01-00"),
    'datadate',
].unique()

In [134]:
# Display the unique training-window dates taken from the stock data.
train_date

array(['2010-01-04', '2010-01-05', '2010-01-06', ..., '2015-12-29',
       '2015-12-30', '2015-12-31'], dtype=object)

In [136]:
# Dates present in VIX but absent from the stock data: first those in the
# validation window, then those in the training window.
vix_drop_date = [day for day in vix_trade_date if day not in unique_trade_date]
vix_drop_date.extend(day for day in vix_train_date if day not in train_date)
# Show how many VIX dates will be dropped.
len(vix_drop_date)

407

In [137]:
# Dates present in the stock data but absent from VIX: first those in the
# validation window, then those in the training window.
vix_miss_date = [day for day in unique_trade_date if day not in vix_trade_date]
vix_miss_date += [day for day in train_date if day not in vix_train_date]

# One placeholder row per missing date, with NaN prices to be filled below.
nan = float("NAN")
recover_data = [[date, nan, nan] for date in vix_miss_date]
df_recover = pd.DataFrame(recover_data, columns=['datadate', 'Open', 'Close'])

# Merge the placeholders into the VIX series, order by date, and carry the
# previous row's values forward into the placeholder rows.
vix_new = pd.concat([vix, df_recover])
vix_new = vix_new.sort_values(by=['datadate'])
# .ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
vix_new = vix_new.ffill()

In [138]:
# Display the merged VIX frame after sorting by date and forward-filling.
vix_new

Unnamed: 0,datadate,Open,Close
0,2008-12-31,41.630001,40.000000
1,2009-01-02,39.580002,39.189999
2,2009-01-05,39.240002,39.080002
3,2009-01-06,38.060001,38.560001
4,2009-01-07,40.290001,43.389999
...,...,...,...
3016,2020-12-23,23.490000,23.309999
3017,2020-12-24,22.469999,21.530001
3018,2020-12-28,22.110001,21.700001
3019,2020-12-29,21.610001,23.080000


In [139]:
# Remove the VIX rows whose date is listed in vix_drop_date.
vix_new = vix_new[~vix_new['datadate'].isin(vix_drop_date)]

In [140]:
# Write the aligned VIX frame to ./data/VIX.csv without the index column.
# NOTE(review): this is the same path the unaligned VIX frame was saved to
# earlier in the notebook, so that file is overwritten.
vix_new.to_csv('./data/VIX.csv',index=False)

In [3]:
#%% function definition
def obs_missing_value(df):   # report row counts and null counts for every stock
    '''
        Print, for each group of the "tic" column, the number of rows followed by
        the number of null entries in each price/volume column.

        Input:
            df: pd.DataFrame, dataframe for 10 stocks in 0050
        Return:
            None
    '''
    inspected_columns = ("High", "Low", "Open", "Close", "Volume", "Adj Close")
    for ticker, rows in df.groupby("tic"):
        print("Stock: ", ticker)
        print("=====Stock Data Size=====")
        print(len(rows))
        # Null counts of all columns of this stock, computed once.
        null_counts = rows.isna().sum()
        for name in inspected_columns:
            print("=====Number of Missing Value in " + name + "=====")
            print(null_counts[name])

def extract_missing_date(df):   # extract the dates that are not shared by every stock
    '''
        Compare the set of dates of every pair of stocks and collect each date
        that one stock of a pair has and the other lacks. Every differing pair
        is reported on stdout together with the dates collected so far.

        Input:
            df: pd.DataFrame, dataframe for 10 stocks in 0050; needs "tic" and "Date" columns
        Return:
            missing_dates: list, the collected dates, ordered by their string form
    '''
    # Keep each ticker next to its own dates so the printed labels always
    # describe the data being compared, whatever tickers the frame contains.
    per_stock = [(ticker, set(rows["Date"])) for ticker, rows in df.groupby("tic")]
    missing_dates = set()
    for i in range(len(per_stock) - 1):
        first_ticker, first_dates = per_stock[i]
        for j in range(i + 1, len(per_stock)):
            second_ticker, second_dates = per_stock[j]
            diff = first_dates.symmetric_difference(second_dates)
            if not diff:
                continue
            missing_dates |= diff
            print(missing_dates)
            print(str(first_ticker) + ", " + str(second_ticker) + " have difference:", diff)
    # Sets have no defined order; sort so repeated runs return the same list.
    return sorted(missing_dates, key=str)

def handle_missing_date(df, missing_dates, method="drop"):   # deal with the missing data in dataframe (missing at particular date)
    '''
        Input:
            df: pd.DataFrame, dataframe for 10 stocks in 0050; needs a "Date" column
            missing_dates: list, storing missing dates for some specific stocks
            method: string, the method used to deal with missing data (default="drop").
                    Only "drop" is implemented; any other value (including "impute")
                    emits a warning and leaves the data unchanged.
        Return:
            df_proc: pd.DataFrame, a new frame; the input frame is never modified.
                     With "drop", rows whose "Date" is in missing_dates are removed
                     and the index is renumbered from 0.
    '''
    if method == "drop":
        # One vectorized mask instead of re-filtering the frame once per date.
        keep = ~df["Date"].isin(list(missing_dates))
        return df[keep].reset_index(drop=True)

    # Surface the unimplemented method instead of silently pretending to handle it.
    import warnings
    warnings.warn(
        "handle_missing_date: method %r is not implemented; returning an unmodified copy" % (method,),
        stacklevel=2,
    )
    return df.copy()

def plot_padjc_pc(df):   # draw Close against Adj Close, one figure per stock
    '''
        Show one 14x7 figure for each group of the "tic" column, with the
        "Close" and "Adj Close" series drawn on the same axes. Only the column
        values are passed to plot, so the x axis is the frame's row index.

        Input:
            df: pd.DataFrame, dataframe for 10 stocks in 0050
        Return:
            None
    '''
    for _, rows in df.groupby("tic"):
        _, ax = plt.subplots(figsize=(14, 7))
        ax.plot(rows["Close"])
        ax.plot(rows["Adj Close"])
        ax.set_title("Adj Close versus Close")
        ax.set_xlabel("Date")
        ax.set_ylabel("Price")
        ax.legend(["Close", "Adj Close"], loc="upper right")
        plt.show()

In [None]:
# Disabled download cell: the code is wrapped in a string literal so nothing is fetched.
# As written it would fetch "<ticker>.TW" from the 'yahoo' source for every entry of
# stock_list_2 starting at `start`, tag the rows with a "tic" column, append them to
# `df`, pause 3 seconds between requests, and save the result to tw0050_10_2.csv
# (the file the exploratory cell reads).
# NOTE(review): the bare `except:` hides the reason a download failed.
'''for stock_ticker in stock_list_2:
    try:
        df_tmp = pdr.DataReader(str(stock_ticker)+".TW", "yahoo", start)
    except:
        print(str(stock_ticker) + "has some problem!")
        continue
    print(str(stock_ticker) + "success!")
    df_tmp["tic"] = [str(stock_ticker) for _ in range(df_tmp.shape[0])]
    df_tmp.reset_index(drop=False, inplace=True)
    print(df_tmp.head())
    df = pd.concat([df, df_tmp], ignore_index=True, axis=0)
    time.sleep(3)
df.to_csv("tw0050_10_2.csv", index=False)'''

In [16]:
#%% exploratory data analysis
df = pd.read_csv("tw0050_10_2.csv")
#obs_missing(df)
missing_dates = extract_missing_date(df)
df = handle_missing_date(df, missing_dates)   # deal with the missing data in dataframe (missing at particular date)
plot_padjc_pc(df)

{'2009-08-07'}
1301, 2308 have difference: {'2009-08-07'}
{'2009-08-07'}
1301, 2317 have difference: {'2009-08-07'}
{'2009-08-07'}
1301, 2330 have difference: {'2009-08-07'}
{'2009-08-07'}
1303, 2308 have difference: {'2009-08-07'}
{'2009-08-07'}
1303, 2317 have difference: {'2009-08-07'}
{'2009-08-07'}
1303, 2330 have difference: {'2009-08-07'}
{'2009-08-07'}
2303, 2308 have difference: {'2009-08-07'}
{'2009-08-07'}
2303, 2317 have difference: {'2009-08-07'}
{'2009-08-07'}
2303, 2330 have difference: {'2009-08-07'}
{'2009-08-07'}
2308, 2454 have difference: {'2009-08-07'}
{'2009-08-07'}
2308, 2882 have difference: {'2009-08-07'}
{'2009-08-07'}
2308, 2891 have difference: {'2009-08-07'}
{'2009-08-07'}
2308, 3008 have difference: {'2009-08-07'}
{'2009-08-07'}
2317, 2454 have difference: {'2009-08-07'}
{'2009-08-07'}
2317, 2882 have difference: {'2009-08-07'}
{'2009-08-07'}
2317, 2891 have difference: {'2009-08-07'}
{'2009-08-07'}
2317, 3008 have difference: {'2009-08-07'}
{'2009-08-07'}