In [27]:
import os 
import pandas as pd

from sklearn.model_selection import train_test_split
import numpy as np

# Root directory of the sentiment datasets. Every input and output path in this
# notebook is built by joining a sub-path onto it; being a relative path, it is
# resolved against the kernel's current working directory.
dir_name = '../../database_real/sentiment_data/'

In [28]:
# Load the HKEX equity ticker list (one row per stock: Symbol, Description).
hkex_files = os.path.join(dir_name, 'stock_ticker_datasets/hkex_in.csv')
hkex = pd.read_csv(hkex_files)

# Tickers are kept as strings so they can later be zero-padded into file names.
hkex['Symbol'] = hkex['Symbol'].astype(str)
hkex_input = hkex['Symbol']

# Cut the ticker column into consecutive chunks of at most n rows each.
n = 400  # chunk row size
hkex_df = [
    hkex_input.iloc[start:start + n]
    for start in range(0, len(hkex_input), n)
]

# Index the lookup table by ticker and preview it.
hkex = hkex.set_index("Symbol")
print(hkex.head())

                            Description
Symbol                                 
1            Cheung Kong (Holdings) Ltd
2                      CLP Holdings Ltd
3       Hong Kong and China Gas Co. Ltd
4                  Wharf (Holdings) Ltd
5                     HSBC Holdings plc


In [29]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import twitter_samples 

# Single module-level VADER analyser; read_news_path() calls its
# polarity_scores() once per news row.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be
# downloaded beforehand — confirm on a fresh environment.
analyser = SentimentIntensityAnalyzer()

In [30]:
def read_news_path(df):
    """Attach a VADER compound score to every news row.

    Each value of ``df['news']`` is passed to the module-level ``analyser``
    and the 'compound' entry of the returned score dict is collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'news' column holding the text to score.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place, with a new
        'compound_vader_score' column aligned row-for-row with 'news'.
    """
    print('read in datasets')
    # one compound score per news row, in row order
    compound_scores = [
        analyser.polarity_scores(text)['compound'] for text in df['news']
    ]
    # store the scores next to the text they were computed from
    df['compound_vader_score'] = compound_scores
    return df


def find_news_pred_label(df, threshold):
    """Average the compound VADER score per date and map it to a sentiment label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'dates' column and a 'compound_vader_score' column.
        Any other columns are ignored.
    threshold : float
        Half-width of the neutral band around zero.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per distinct date and the columns
        'dates', 'compound_vader_score' (the daily mean) and 'vader_label':

        * 2 when the daily mean is greater than ``threshold``
        * 0 when the daily mean is less than ``-threshold``
        * 1 otherwise, i.e. inside the band ``[-threshold, threshold]``

        A daily mean that is NaN matches neither comparison and is therefore
        labelled 1 rather than breaking the column assignment.
    """
    print('find_pred_label')
    # collapse all news rows of one date into their mean score
    daily = df.groupby(['dates'])['compound_vader_score'].mean().reset_index()
    scores = daily['compound_vader_score']

    # Start from the neutral label and overwrite the two tails. The positive
    # mask is applied last so it takes precedence, matching an if/elif chain
    # that tests "> threshold" first.
    labels = pd.Series(1, index=daily.index)
    labels[scores < -threshold] = 0
    labels[scores > threshold] = 2

    daily['vader_label'] = labels
    return daily


def merge_actual_label(df, hsi_movement_df):
    """Inner-join the daily VADER labels with the Hang Seng Index movement data.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily VADER results containing a 'dates' column. The caller's frame
        is left untouched.
    hsi_movement_df : str, path-like or pandas.DataFrame
        Either the path of a CSV file with a 'dates' column, or an
        already-loaded frame with that column. Passing a frame lets a caller
        read the file once and reuse it for many tickers.

    Returns
    -------
    pandas.DataFrame
        Rows whose date appears in both inputs, with 'dates' restored as an
        ordinary column followed by the columns of both inputs.
    """
    print('merge_actual_label')
    # non-inplace set_index: do not strip the 'dates' column from the caller's frame
    vader_data = df.set_index(keys=["dates"])

    if isinstance(hsi_movement_df, pd.DataFrame):
        label_data = hsi_movement_df
    else:
        label_data = pd.read_csv(hsi_movement_df)
    label_data = label_data.set_index(keys=["dates"])

    # inner join the two datasets using the date index
    merge = pd.merge(vader_data, label_data, how='inner', left_index=True, right_index=True)
    merge = merge.reset_index()
    # Drop the redundant 'Unnamed: 0' column (presumably the index written
    # when the CSV was saved); tolerate inputs that do not have it.
    merge = merge.drop(columns=['Unnamed: 0'], errors='ignore')

    return merge

In [33]:
def starter_vader(ticker, threshold=0.01):
    """Run the VADER labelling pipeline for one ticker and save the result.

    Steps: read the ticker's aastock news CSV, score each news row
    (``read_news_path``), average and label per date
    (``find_news_pred_label``), inner-join with the Hang Seng Index movement
    file (``merge_actual_label``) and, when any rows remain, write them to
    the ticker's result CSV.

    Parameters
    ----------
    ticker : str or int
        Stock code; it is converted to a string and zero-padded to five
        characters to build the file names.
    threshold : float, optional
        Neutral-band half-width handed to ``find_news_pred_label``.
        Defaults to 0.01.

    Returns
    -------
    None
        The result is only written to disk. Nothing is written when the
        merged frame is empty.
    """
    # five-character zero-padded code shared by the input and output file names
    code = str(ticker).zfill(5)

    # get the full path of each ticker
    path = os.path.join(dir_name, 'data-news/data-aastock/' + 'data-' + code + '-aastock.csv')
    # the news file is read as header-less, two columns: date and news text
    df = pd.read_csv(path, names=['dates', 'news'])
    # append the compound vader score to the pandas dataframe
    df = read_news_path(df)
    # pass in the threshold to get the vader label
    df = find_news_pred_label(df, threshold)

    result_path = os.path.join(dir_name, 'data-results/vader-results/hkex-results/hkex-aastock/' + 'data-' + code + '-result.csv')

    # get the full path of the hang seng index movement csv file
    hsi_movement_path = os.path.join(dir_name, 'train-data/hkex/hsi_movement.csv')
    # merge the labelled frame with the hsi movement data
    df = merge_actual_label(df, hsi_movement_path)
    # store to the csv file if the dataset is not empty
    if not df.empty:
        df.to_csv(result_path, index=False)

# Run the VADER pipeline for every HKEX ticker, one chunk of tickers at a time;
# each ticker is echoed before processing so progress can be followed.
for ticker_chunk in hkex_df:
    for ticker in ticker_chunk:
        print(ticker)
        starter_vader(ticker)

1
read in datasets
find_pred_label
merge_actual_label
2
read in datasets
find_pred_label
merge_actual_label
3
read in datasets
find_pred_label
merge_actual_label
4
read in datasets
find_pred_label
merge_actual_label
5
read in datasets
find_pred_label
merge_actual_label
6
read in datasets
find_pred_label
merge_actual_label
7
read in datasets
find_pred_label
merge_actual_label
8
read in datasets
find_pred_label
merge_actual_label
9
read in datasets
find_pred_label
merge_actual_label
10
read in datasets
find_pred_label
merge_actual_label
11
read in datasets
find_pred_label
merge_actual_label
12
read in datasets
find_pred_label
merge_actual_label
14
read in datasets
find_pred_label
merge_actual_label
15
read in datasets
find_pred_label
merge_actual_label
16
read in datasets
find_pred_label
merge_actual_label
17
read in datasets
find_pred_label
merge_actual_label
18
read in datasets
find_pred_label
merge_actual_label
19
read in datasets
find_pred_label
merge_actual_label
20
read in datasets

read in datasets
find_pred_label
merge_actual_label
169
read in datasets
find_pred_label
merge_actual_label
171
read in datasets
find_pred_label
merge_actual_label
172
read in datasets
find_pred_label
merge_actual_label
173
read in datasets
find_pred_label
merge_actual_label
174
read in datasets
find_pred_label
merge_actual_label
175
read in datasets
find_pred_label
merge_actual_label
176
read in datasets
find_pred_label
merge_actual_label
177
read in datasets
find_pred_label
merge_actual_label
178
read in datasets
find_pred_label
merge_actual_label
179
read in datasets
find_pred_label
merge_actual_label
180
read in datasets
find_pred_label
merge_actual_label
181
read in datasets
find_pred_label
merge_actual_label
182
read in datasets
find_pred_label
merge_actual_label
183
read in datasets
find_pred_label
merge_actual_label
184
read in datasets
find_pred_label
merge_actual_label
185
read in datasets
find_pred_label
merge_actual_label
186
read in datasets
find_pred_label
merge_actual_la

335
read in datasets
find_pred_label
merge_actual_label
336
read in datasets
find_pred_label
merge_actual_label
337
read in datasets
find_pred_label
merge_actual_label
338
read in datasets
find_pred_label
merge_actual_label
339
read in datasets
find_pred_label
merge_actual_label
340
read in datasets
find_pred_label
merge_actual_label
341
read in datasets
find_pred_label
merge_actual_label
342
read in datasets
find_pred_label
merge_actual_label
343
read in datasets
find_pred_label
merge_actual_label
345
read in datasets
find_pred_label
merge_actual_label
346
read in datasets
find_pred_label
merge_actual_label
347
read in datasets
find_pred_label
merge_actual_label
348
read in datasets
find_pred_label
merge_actual_label
351
read in datasets
find_pred_label
merge_actual_label
352
read in datasets
find_pred_label
merge_actual_label
353
read in datasets
find_pred_label
merge_actual_label
354
read in datasets
find_pred_label
merge_actual_label
355
read in datasets
find_pred_label
merge_actua

merge_actual_label
525
read in datasets
find_pred_label
merge_actual_label
526
read in datasets
find_pred_label
merge_actual_label
527
read in datasets
find_pred_label
merge_actual_label
528
read in datasets
find_pred_label
merge_actual_label
529
read in datasets
find_pred_label
merge_actual_label
530
read in datasets
find_pred_label
merge_actual_label
531
read in datasets
find_pred_label
merge_actual_label
532
read in datasets
find_pred_label
merge_actual_label
533
read in datasets
find_pred_label
merge_actual_label
535
read in datasets
find_pred_label
merge_actual_label
536
read in datasets
find_pred_label
merge_actual_label
538
read in datasets
find_pred_label
merge_actual_label
539
read in datasets
find_pred_label
merge_actual_label
540
read in datasets
find_pred_label
merge_actual_label
542
read in datasets
find_pred_label
merge_actual_label
543
read in datasets
find_pred_label
merge_actual_label
544
read in datasets
find_pred_label
merge_actual_label
546
read in datasets
find_pre

705
read in datasets
find_pred_label
merge_actual_label
706
read in datasets
find_pred_label
merge_actual_label
707
read in datasets
find_pred_label
merge_actual_label
708
read in datasets
find_pred_label
merge_actual_label
709
read in datasets
find_pred_label
merge_actual_label
710
read in datasets
find_pred_label
merge_actual_label
711
read in datasets
find_pred_label
merge_actual_label
712
read in datasets
find_pred_label
merge_actual_label
713
read in datasets
find_pred_label
merge_actual_label
715
read in datasets
find_pred_label
merge_actual_label
716
read in datasets
find_pred_label
merge_actual_label
717
read in datasets
find_pred_label
merge_actual_label
718
read in datasets
find_pred_label
merge_actual_label
719
read in datasets
find_pred_label
merge_actual_label
720
read in datasets
find_pred_label
merge_actual_label
721
read in datasets
find_pred_label
merge_actual_label
722
read in datasets
find_pred_label
merge_actual_label
723
read in datasets
find_pred_label
merge_actua

903
read in datasets
find_pred_label
merge_actual_label
904
read in datasets
find_pred_label
merge_actual_label
905
read in datasets
find_pred_label
merge_actual_label
906
read in datasets
find_pred_label
merge_actual_label
907
read in datasets
find_pred_label
merge_actual_label
908
read in datasets
find_pred_label
merge_actual_label
909
read in datasets
find_pred_label
merge_actual_label
910
read in datasets
find_pred_label
merge_actual_label
911
read in datasets
find_pred_label
merge_actual_label
912
read in datasets
find_pred_label
merge_actual_label
913
read in datasets
find_pred_label
merge_actual_label
914
read in datasets
find_pred_label
merge_actual_label
915
read in datasets
find_pred_label
merge_actual_label
916
read in datasets
find_pred_label
merge_actual_label
918
read in datasets
find_pred_label
merge_actual_label
919
read in datasets
find_pred_label
merge_actual_label
921
read in datasets
find_pred_label
merge_actual_label
922
read in datasets
find_pred_label
merge_actua

1088
read in datasets
find_pred_label
merge_actual_label
1089
read in datasets
find_pred_label
merge_actual_label
1090
read in datasets
find_pred_label
merge_actual_label
1091
read in datasets
find_pred_label
merge_actual_label
1093
read in datasets
find_pred_label
merge_actual_label
1094
read in datasets
find_pred_label
merge_actual_label
1096
read in datasets
find_pred_label
merge_actual_label
1097
read in datasets
find_pred_label
merge_actual_label
1098
read in datasets
find_pred_label
merge_actual_label
1099
read in datasets
find_pred_label
merge_actual_label
1100
read in datasets
find_pred_label
merge_actual_label
1101
read in datasets
find_pred_label
merge_actual_label
1102
read in datasets
find_pred_label
merge_actual_label
1103
read in datasets
find_pred_label
merge_actual_label
1104
read in datasets
find_pred_label
merge_actual_label
1105
read in datasets
find_pred_label
merge_actual_label
1106
read in datasets
find_pred_label
merge_actual_label
1107
read in datasets
find_pred

1277
read in datasets
find_pred_label
merge_actual_label
1278
read in datasets
find_pred_label
merge_actual_label
1280
read in datasets
find_pred_label
merge_actual_label
1281
read in datasets
find_pred_label
merge_actual_label
1282
read in datasets
find_pred_label
merge_actual_label
1285
read in datasets
find_pred_label
merge_actual_label
1288
read in datasets
find_pred_label
merge_actual_label
1289
read in datasets
find_pred_label
merge_actual_label
1290
read in datasets
find_pred_label
merge_actual_label
1292
read in datasets
find_pred_label
merge_actual_label
1293
read in datasets
find_pred_label
merge_actual_label
1296
read in datasets
find_pred_label
merge_actual_label
1297
read in datasets
find_pred_label
merge_actual_label
1298
read in datasets
find_pred_label
merge_actual_label
1299
read in datasets
find_pred_label
merge_actual_label
1300
read in datasets
find_pred_label
merge_actual_label
1301
read in datasets
find_pred_label
merge_actual_label
1302
read in datasets
find_pred

1555
read in datasets
find_pred_label
merge_actual_label
1556
read in datasets
find_pred_label
merge_actual_label
1557
read in datasets
find_pred_label
merge_actual_label
1558
read in datasets
find_pred_label
merge_actual_label
1559
read in datasets
find_pred_label
merge_actual_label
1560
read in datasets
find_pred_label
merge_actual_label
1561
read in datasets
find_pred_label
merge_actual_label
1565
read in datasets
find_pred_label
merge_actual_label
1566
read in datasets
find_pred_label
merge_actual_label
1568
read in datasets
find_pred_label
merge_actual_label
1569
read in datasets
find_pred_label
merge_actual_label
1570
read in datasets
find_pred_label
merge_actual_label
1571
read in datasets
find_pred_label
merge_actual_label
1572
read in datasets
find_pred_label
merge_actual_label
1573
read in datasets
find_pred_label
merge_actual_label
1575
read in datasets
find_pred_label
merge_actual_label
1577
read in datasets
find_pred_label
merge_actual_label
1578
read in datasets
find_pred

read in datasets
find_pred_label
merge_actual_label
1908
read in datasets
find_pred_label
merge_actual_label
1910
read in datasets
find_pred_label
merge_actual_label
1913
read in datasets
find_pred_label
merge_actual_label
1918
read in datasets
find_pred_label
merge_actual_label
1919
read in datasets
find_pred_label
merge_actual_label
1928
read in datasets
find_pred_label
merge_actual_label
1929
read in datasets
find_pred_label
merge_actual_label
1932
read in datasets
find_pred_label
merge_actual_label
1938
read in datasets
find_pred_label
merge_actual_label
1958
read in datasets
find_pred_label
merge_actual_label
1962
read in datasets
find_pred_label
merge_actual_label
1963
read in datasets
find_pred_label
merge_actual_label
1966
read in datasets
find_pred_label
merge_actual_label
1970
read in datasets
find_pred_label
merge_actual_label
1972
read in datasets
find_pred_label
merge_actual_label
1975
read in datasets
find_pred_label
merge_actual_label
1979
read in datasets
find_pred_labe

2340
read in datasets
find_pred_label
merge_actual_label
2341
read in datasets
find_pred_label
merge_actual_label
2342
read in datasets
find_pred_label
merge_actual_label
2343
read in datasets
find_pred_label
merge_actual_label
2345
read in datasets
find_pred_label
merge_actual_label
2348
read in datasets
find_pred_label
merge_actual_label
2349
read in datasets
find_pred_label
merge_actual_label
2355
read in datasets
find_pred_label
merge_actual_label
2356
read in datasets
find_pred_label
merge_actual_label
2357
read in datasets
find_pred_label
merge_actual_label
2358
read in datasets
find_pred_label
merge_actual_label
2362
read in datasets
find_pred_label
merge_actual_label
2366
read in datasets
find_pred_label
merge_actual_label
2368
read in datasets
find_pred_label
merge_actual_label
2369
read in datasets
find_pred_label
merge_actual_label
2371
read in datasets
find_pred_label
merge_actual_label
2378
read in datasets
find_pred_label
merge_actual_label
2379
read in datasets
find_pred

merge_actual_label
3135
read in datasets
find_pred_label
merge_actual_label
3136
read in datasets
find_pred_label
merge_actual_label
3137
read in datasets
find_pred_label
merge_actual_label
3140
read in datasets
find_pred_label
merge_actual_label
3141
read in datasets
find_pred_label
merge_actual_label
3143
read in datasets
find_pred_label
merge_actual_label
3145
read in datasets
find_pred_label
merge_actual_label
3146
read in datasets
find_pred_label
merge_actual_label
3147
read in datasets
find_pred_label
merge_actual_label
3149
read in datasets
find_pred_label
merge_actual_label
3150
read in datasets
find_pred_label
merge_actual_label
3155
read in datasets
find_pred_label
merge_actual_label
3156
read in datasets
find_pred_label
merge_actual_label
3157
read in datasets
find_pred_label
merge_actual_label
3160
read in datasets
find_pred_label
merge_actual_label
3161
read in datasets
find_pred_label
merge_actual_label
3162
read in datasets
find_pred_label
merge_actual_label
3165
read in

6080
read in datasets
find_pred_label
merge_actual_label
6083
read in datasets
find_pred_label
merge_actual_label
6088
read in datasets
find_pred_label
merge_actual_label
6090
read in datasets
find_pred_label
merge_actual_label
6099
read in datasets
find_pred_label
merge_actual_label
6108
read in datasets
find_pred_label
merge_actual_label
6113
read in datasets
find_pred_label
merge_actual_label
6116
read in datasets
find_pred_label
merge_actual_label
6118
read in datasets
find_pred_label
merge_actual_label
6122
read in datasets
find_pred_label
merge_actual_label
6123
read in datasets
find_pred_label
merge_actual_label
6128
read in datasets
find_pred_label
merge_actual_label
6133
read in datasets
find_pred_label
merge_actual_label
6136
read in datasets
find_pred_label
merge_actual_label
6138
read in datasets
find_pred_label
merge_actual_label
6139
read in datasets
find_pred_label
merge_actual_label
6161
read in datasets
find_pred_label
merge_actual_label
6163
read in datasets
find_pred