In [39]:
import backtrader as bt
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import vectorbt as vbt

#### 功能 setup

In [2]:
# 互動圖表
import plotly.graph_objs as go
import plotly.express as px

def interactive_plot(df:pd.DataFrame, title:str='自定圖表標題', recession:bool=False, height:int=370, width:int=1050) -> 'go.Figure':
    """Draw every column of ``df`` as a line on one interactive plotly figure.

    Rows containing any NaN are dropped before plotting. The figure is shown
    with ``fig.show()`` and also returned.

    Args:
        df: Frame whose index is used as the x axis; each column becomes one
            line named after the column. When ``recession`` is True the index
            entries must expose a ``.year`` attribute (e.g. a DatetimeIndex).
        title: Figure title, centred above the plot.
        recession: When True, shade the hard-coded periods listed below.
        height: Figure height in pixels.
        width: Figure width in pixels.

    Returns:
        The plotly ``Figure`` that was displayed.
    """
    #===================================
    # setup
    #===================================
    df = df.dropna()

    # Hard-coded ['YYYY-MM', 'YYYY-MM'] (start, end) pairs to shade.
    # NOTE(review): the source of these dates is not recorded in the notebook
    # - confirm which business-cycle chronology they come from.
    recession_periods = [
        ['1980-01','1983-02'],
        ['1984-05','1985-08'],
        ['1989-05','1990-08'],
        ['1995-02','1996-03'],
        ['1997-12','1998-12'],
        ['2000-09','2001-09'],
        ['2004-03','2005-02'],
        ['2008-03','2009-02'],
        ['2011-02','2012-01'],
        ['2014-10','2016-02'],
        ['2020-02','2020-04'],
    ]

    fig_layout = go.Layout(
            margin=dict(l=40, r=40, b=40, t=60, pad=0),
            legend=dict(
                orientation="h",
                yanchor="top",
                xanchor="left"),
            height=height,
            width=width
        )

    #===================================
    # initiate
    #===================================
    fig = go.Figure()
    fig.update_layout(fig_layout)
    # Other built-in templates: "plotly", "plotly_white", "plotly_dark",
    # "ggplot2", "simple_white", "none".
    fig.update_layout(template='seaborn')
    fig.update_layout(title_text=title, title_x=0.5)

    # The index is only inspected when shading is requested, and an empty
    # frame (everything dropped by dropna) is skipped instead of raising.
    if recession and len(df.index) > 0:
        first_year = df.index[0].year
        last_year = df.index[-1].year
        # The loop variables are deliberately not called `recession`, so the
        # boolean parameter is no longer overwritten.
        for period_start, period_end in recession_periods:
            # Year-granular test: a period is drawn only when its start year
            # and end year both fall inside the data's year range.
            if int(period_start[:4]) >= first_year and int(period_end[:4]) <= last_year:
                fig.add_vrect(
                        x0=period_start,
                        x1=period_end,
                        fillcolor='rgba(30,30,30,0.3)',
                        opacity=0.5,
                        line_width=0)

    #===================================
    # loop through data in df
    #===================================
    for col in df.columns:
        fig.add_trace(
            go.Scatter(x=df.index, y=df[col],
                        mode='lines',
                        name=col,
                        showlegend=True,
            )
        )

    fig.show()

    return fig

In [27]:
# helper function of combine_stock()
# 結合情緒分數和各股新聞
def _combine_sentiment(stock_name, sentiment_df, news_df, special_transform):
  """Build a daily sentiment series for one stock.

  Args:
      stock_name: Name looked up inside each ``news_df.Tag`` value.
      sentiment_df: Per-article scores, row-aligned with ``news_df`` by index.
          Needs a ``score`` column when ``special_transform`` is True,
          otherwise ``neg`` and ``pos`` columns.
      news_df: News table with ``Date`` and ``Tag`` columns.
      special_transform: True -> use the daily mean of ``score`` directly;
          False -> use ``log((pos + 1) / (neg + 1))`` of the daily means.

  Returns:
      DataFrame with a DatetimeIndex (one row per date) and a single column
      named ``stock_name``.
  """
  # NOTE(review): this is an `in` test on the raw Tag value; if Tag is the
  # string form of a list (as other cells suggest) a name that is a substring
  # of another name would also match - confirm this is acceptable.
  tagged_news = news_df[news_df.Tag.apply(lambda tag: stock_name in tag)]

  # Always select with a list so the groupby results stay DataFrames. The
  # previous code selected 'score' as a bare string, got a Series back, and
  # `Series.columns = [...]` silently failed to rename anything.
  score_cols = ['score'] if special_transform else ['neg', 'pos']

  # Inner join on the row index keeps only the articles tagged with the stock.
  scored = pd.concat([tagged_news[['Date']], sentiment_df], axis=1, join='inner')
  scored = scored.set_index('Date')
  scored.index = pd.to_datetime(scored.index)
  scored['Date_'] = scored.index.date

  cutoff = datetime.time(9, 0)
  # Articles before 09:00: per-date mean, then moved one row earlier
  # (shift(-1)), i.e. onto the previous date that has pre-09:00 articles.
  # NOTE(review): presumably so overnight news is attributed to the prior
  # day - confirm.
  before = scored[scored.index.time < cutoff].groupby('Date_')[score_cols].mean().shift(-1)
  # Articles at or after 09:00 stay on their own date.
  after = scored[scored.index.time >= cutoff].groupby('Date_')[score_cols].mean()

  # Average the two partial series wherever a date appears in both.
  sentiment_daily = pd.concat([before, after]).groupby(level=0).mean()

  if special_transform:
    sentiment_daily = sentiment_daily.rename(columns={'score': stock_name})
  else:
    log_ratio = np.log((sentiment_daily['pos'] + 1) / (sentiment_daily['neg'] + 1))
    sentiment_daily = log_ratio.to_frame(stock_name)

  sentiment_daily.index = pd.to_datetime(sentiment_daily.index)
  return sentiment_daily

# Combine sentiment scores with stock prices
def combine_stock(stock_list, sentiment_df, news_df, stock_df, special_transform=False):
  """Select the price columns and build smoothed sentiment for a stock list.

  Args:
      stock_list: Sequence of (name, id) pairs, e.g.
          [('鴻海',2317), ('台積電',2330),]. The name is matched against the
          news tags; ``str(id)`` must be a column of ``stock_df``.
      sentiment_df: Per-article sentiment scores (see ``special_transform``).
      news_df: News table with ``Date`` and ``Tag`` columns.
      stock_df: Price table with one column per stock id.
      special_transform (bool, optional): Set to True when ``sentiment_df``
          only has a ``score`` column and no ``pos`` / ``neg`` columns.
          Defaults to False.

  Returns:
      tuple: ``(stock_, sentiment_)`` where ``stock_`` is ``stock_df``
      restricted to the requested ids and ``sentiment_`` has one column per
      stock name, forward-filled and smoothed with a 10-row rolling mean.

  Raises:
      ValueError: If ``stock_list`` is empty.
  """
  if not stock_list:
    raise ValueError('stock_list must contain at least one (name, id) pair')

  names = [entry[0] for entry in stock_list]
  ids = [str(entry[1]) for entry in stock_list]
  stock_ = stock_df[ids]

  sentiment_list = [
      _combine_sentiment(name, sentiment_df, news_df, special_transform)
      for name in names
  ]
  sentiment_ = pd.concat(sentiment_list, axis=1).sort_index()
  sentiment_ = sentiment_.ffill().rolling(10).mean()

  return stock_, sentiment_

def time_range(df, start, gap):
    """Return the index labels of ``df`` at positions start, start+gap, ...

    Positions are generated with ``range(start, number_of_rows, gap)``, so the
    result is an empty list when ``start`` is not below the row count.
    """
    n_rows, _n_cols = df.shape
    picked = []
    for position in range(start, n_rows, gap):
        picked.append(df.index[position])
    return picked

#### 讀取 data

In [4]:
# Load data

# Sentiment scores (one CSV per scoring source)
vader_df = pd.read_csv('cynes_sector_en_vadersentiment.csv').drop(columns=['Unnamed: 0'])

# Only the two probability columns are kept, renamed to the neg / pos
# convention used by the other cells.
baidu_df = (
    pd.read_csv('baidu.csv')
    .rename(columns={'pos_prob':'pos','neg_prob':'neg'})
    .loc[:, ['neg','pos']]
)

google_df = pd.read_csv('google.csv').drop(columns=['Unnamed: 0'])

keymoji_df = (
    pd.read_csv('keymoji.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'total_score':'score'})
)

# News
news_df = pd.read_csv('cynes_sector.csv').drop(columns=['Unnamed: 0'])

# Stock prices, indexed by the parsed 'date' column
stock_df = pd.read_csv('stock_daily.csv',index_col='date',parse_dates=True)

### 分析

In [7]:
# EDA: distribution of the number of news items per stock

import ast
from collections import defaultdict

news = pd.read_csv('cynes_sector.csv').drop(columns=['Unnamed: 0'])
news['Date'] = pd.to_datetime(news['Date'])

# ======================================================
# Frequency: minute (disabled alternative, kept for reference)
# ======================================================
# newscount_dict = {}

# for ids in news['Tag']:

#     for id in eval(ids):
#         newscount_dict[id] = newscount_dict.get(id,0)
#         newscount_dict[id] += 1

# df1 = pd.DataFrame.from_dict(newscount_dict, orient='index',columns=['num_news']).sort_index()

# ======================================================
# Frequency: day
# ======================================================
news_ = news.copy()
news_['Date_'] = news_['Date'].dt.date
news_ = news_[['Date_','Tag']]

# newscount_dict[date][stock name] -> number of news items that day
newscount_dict = defaultdict(lambda: defaultdict(int))

# Iterate the two columns directly rather than indexing each row Series by
# position (positional `row[0]` on a labelled Series is deprecated in pandas).
for date, tags in zip(news_['Date_'], news_['Tag']):
    # Tag is stored as text; it is parsed with ast.literal_eval instead of
    # eval so that file content can never execute code.
    # NOTE(review): assumes every Tag is a plain Python list literal of
    # stock names - confirm against the CSV.
    for stock_name in ast.literal_eval(tags):
        newscount_dict[date][stock_name] += 1

# Rows: dates, columns: stock names, NaN where a stock had no news that day.
df2 = pd.DataFrame.from_dict(newscount_dict, orient='index').sort_index()

# For each threshold, list the stocks having news on at least that many days.
eda_dict = {}
for thresh in range(500,1600,100):
    ids = list(df2.dropna(axis=1,thresh=thresh).columns)
    eda_dict[f'stocks with at least {thresh} non nan values'] = {'number of stocks': len(ids),'ids':ids}

pd.DataFrame.from_dict(eda_dict, orient='index')

Unnamed: 0,number of stocks,ids
stocks with at least 500 non nan values,53,"[鴻海, 旺宏, 中華電, 台積電, 台泥, 國巨, 台達電, 聯發科, 兆豐金, 大立光,..."
stocks with at least 600 non nan values,38,"[鴻海, 旺宏, 中華電, 台積電, 國巨, 台達電, 聯發科, 兆豐金, 大立光, 開發金..."
stocks with at least 700 non nan values,27,"[鴻海, 旺宏, 中華電, 台積電, 國巨, 台達電, 聯發科, 兆豐金, 大立光, 開發金..."
stocks with at least 800 non nan values,17,"[鴻海, 中華電, 台積電, 聯發科, 兆豐金, 大立光, 開發金, 中信金, 欣興, 聯電..."
stocks with at least 900 non nan values,14,"[鴻海, 台積電, 聯發科, 大立光, 中信金, 欣興, 聯電, 新光金, 富邦金, 國泰金..."
stocks with at least 1000 non nan values,7,"[鴻海, 台積電, 聯發科, 大立光, 中信金, 群創, 友達]"
stocks with at least 1100 non nan values,4,"[鴻海, 台積電, 聯發科, 大立光]"
stocks with at least 1200 non nan values,2,"[鴻海, 台積電]"
stocks with at least 1300 non nan values,1,[台積電]
stocks with at least 1400 non nan values,0,[]


In [13]:
# Pick the stocks to hold: row 4 of the EDA table (its first two columns).
(
    pd.DataFrame
    .from_dict(eda_dict, orient='index')
    .iloc[4, :2]
    .values
)

array([14,
       list(['鴻海', '台積電', '聯發科', '大立光', '中信金', '欣興', '聯電', '新光金', '富邦金', '國泰金', '華邦電', '群創', '友達', '南亞科'])],
      dtype=object)

In [35]:
# 定期換股 (只作多)
def sentiment_portfolio(stock_df, sentiment_df, benchmark_df, end_date='2022-04-01',
                        rebalance_start=30, rebalance_gap=20,
                        commission_buy=0.001425, commission_sell=0.004425):
    """Back-test a long-only strategy that periodically re-picks its holding.

    On every rebalance row the stock(s) whose sentiment deviation is the
    lowest of that row are bought and held until the next rebalance row.

    Args:
        stock_df: Prices, one column per stock, date-indexed.
        sentiment_df: Sentiment scores, one column per stock, in the same
            column order as ``stock_df``, date-indexed.
        benchmark_df: Benchmark prices; only the last column is returned.
        end_date: Last date (inclusive label slice) kept in the back-test.
        rebalance_start: Row position of the first rebalance (non-negative).
        rebalance_gap: Number of rows between rebalances.
        commission_buy: Proportional cost applied on rows where a position is
            opened.
        commission_sell: Proportional cost applied on rows where a position
            is closed.
            NOTE(review): presumably fee plus transaction tax - confirm.

    Returns:
        tuple: ``(cum_ret, ret, benchmark_ret)`` - cumulative strategy return
        (Series), per-row strategy return (Series) and the benchmark's
        per-row return (single-column DataFrame), all on the aligned dates.
    """
    # Signal: deviation of each score from its own 10-row rolling mean.
    # The parameters are used here; the previous version read the notebook
    # globals `sentiment` / `stock` and ignored what the caller passed in.
    sentiment_signal = sentiment_df - sentiment_df.rolling(10).mean()

    # Gross return attached to row t is the price change from row t+1 to
    # row t+2 (pct_change shifted back by two rows).
    gross_ret = stock_df.pct_change().shift(-2) + 1
    benchmark_fwd = benchmark_df.pct_change().shift(-2)

    # Align the three tables on their common dates, then split them again.
    # The split offset is the number of sentiment columns (the old code used
    # the number of stock columns, which was only right when both matched).
    n_signals = sentiment_signal.shape[1]
    aligned = pd.concat([sentiment_signal, gross_ret, benchmark_fwd], axis=1, join='inner').loc[:end_date]
    sentiment_ = aligned.iloc[:, :n_signals]
    stock_ret = aligned.iloc[:, n_signals:-1]
    benchmark_ret = aligned.iloc[:, [-1]]

    # Selection flag per row: True for the column(s) holding the row minimum.
    # Rows that are entirely NaN select nothing.
    is_lowest = sentiment_.le(sentiment_.min(axis=1), axis=0)

    # Only rebalance rows carry a decision; everything in between is filled
    # forward from the latest decision. Rows before the first rebalance stay
    # flat (0). Building the mask by position keeps raw sentiment values out
    # of the position table.
    on_rebalance = np.zeros(len(sentiment_), dtype=bool)
    on_rebalance[rebalance_start::rebalance_gap] = True
    holdings = is_lowest.astype(float)
    holdings.loc[~on_rebalance] = np.nan
    positions = holdings.ffill().fillna(0).astype('int')

    # Trading cost factor: +1 in the position diff is a buy, -1 a sell,
    # 0 no trade. The first row's diff is NaN and is left as NaN.
    trades = positions.diff()
    cost_factor = trades.where(trades.isna(), 1.0)
    cost_factor = cost_factor.mask(trades == 1, 1 / (1 + commission_buy))
    cost_factor = cost_factor.mask(trades == -1, 1 - commission_sell)

    # Net simple return per stock after costs.
    net_ret = stock_ret * cost_factor.values - 1

    # Strategy return: sum over held stocks (NaN entries count as 0). When
    # several stocks tie for the minimum their returns are summed, not
    # averaged.
    ret = (net_ret * positions.values).sum(axis=1)
    cum_ret = (ret + 1).cumprod()

    return cum_ret, ret, benchmark_ret

In [36]:
# Back-test inputs: (stock name, stock id) pairs to trade
stock_list = [
    ('鴻海',2317), ('台積電',2330), ('聯發科',2454), ('大立光',3008),
    ('中信金',2891), ('欣興',3037), ('聯電',2303), ('新光金',2888),
    ('富邦金',2881), ('國泰金',2882), ('華邦電',2344), ('群創',3481),
    ('友達',2409), ('南亞科',2408),
]

# Benchmark price column and the sentiment source used for this run
benchmark_df = stock_df[['0050']]
sentiment_df = baidu_df

stock, sentiment = combine_stock(
    stock_list,
    sentiment_df,
    news_df,
    stock_df,
    special_transform=False,
)
# Keep prices from 2018 onwards
stock = stock['2018':]

cum_ret, ret, benchmark_ret = sentiment_portfolio(
    stock_df=stock,
    sentiment_df=sentiment,
    benchmark_df=benchmark_df,
)

In [40]:
# Keep this run's results under source-specific names
baidu_ret = ret
baidu_cum_ret = cum_ret.to_frame('baidu')

# Performance statistics via the vectorbt returns accessor (daily frequency),
# compared against the benchmark returns
ret_acc = baidu_ret.vbt.returns(freq='d')
ret_acc.stats(settings={'benchmark_rets': benchmark_ret}).to_frame('baidu')

Unnamed: 0,baidu
Start,2018-01-02 00:00:00
End,2022-04-01 00:00:00
Period,1035 days 00:00:00
Total Return [%],477.722082
Benchmark Return [%],60.527894
Annualized Return [%],85.620325
Annualized Volatility [%],41.005865
Max Drawdown [%],32.445161
Max Drawdown Duration,484 days 00:00:00
Sharpe Ratio,1.713149


In [41]:
# Plot the cumulative return of the strategy
fig = interactive_plot(
    baidu_cum_ret,
    title='擺渡分數建構策略之累積報酬率',
    height=400,
    width=900,
)