In [27]:
import os 
import pandas as pd

from sklearn.model_selection import train_test_split
import numpy as np

# Root directory of the sentiment data tree; the ticker list, news, result and
# HSI-movement paths used below are all joined onto it. It is a relative path,
# so it resolves against the notebook's current working directory.
dir_name = '../../database_real/sentiment_data/'

In [28]:
# Load the HKEX equity ticker list (one row per listed symbol).
hkex_files = os.path.join(dir_name, 'stock_ticker_datasets/hkex_in.csv')
hkex = pd.read_csv(hkex_files)

# Symbols are handled as strings from here on (they are zero-padded with
# str.zfill when the per-ticker file names are built).
hkex['Symbol'] = hkex['Symbol'].astype(str)
hkex_input = hkex['Symbol']

# Cut the symbol column into consecutive chunks of at most `n` rows.
n = 400
hkex_df = [
    hkex_input[start:start + n]
    for start in range(0, len(hkex_input), n)
]

# Index the lookup table by symbol and show the first few rows.
hkex = hkex.set_index("Symbol")
print(hkex.head())

                            Description
Symbol                                 
1            Cheung Kong (Holdings) Ltd
2                      CLP Holdings Ltd
3       Hong Kong and China Gas Co. Ltd
4                  Wharf (Holdings) Ltd
5                     HSBC Holdings plc


In [29]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import twitter_samples 

# Single VADER analyser instance, shared by read_news_path for every headline.
# NOTE(review): VADER needs the NLTK `vader_lexicon` resource to be installed —
# confirm it has been downloaded in this environment.
analyser = SentimentIntensityAnalyzer()

In [30]:
def read_news_path(df):
    """Attach a VADER compound score to every news row.

    For each entry of ``df['news']`` the ``'compound'`` value returned by
    ``analyser.polarity_scores`` is computed, and the scores are stored in a
    new ``compound_vader_score`` column.

    The column is added to ``df`` itself (the input is modified in place) and
    the same DataFrame is returned.
    """
    print('read in datasets')
    # one score per row, in row order
    df['compound_vader_score'] = [
        analyser.polarity_scores(headline)['compound']
        for headline in df['news']
    ]
    return df


def find_news_pred_label(df, threshold):
    """Average the compound VADER score per date and turn it into a label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``dates`` column and a ``compound_vader_score`` column.
        Other columns are ignored (a ``news`` column is not required).
    threshold : float
        Width of the neutral band around zero.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per date and the columns ``dates``,
        ``compound_vader_score`` (the daily mean) and ``vader_label``:

        * ``2`` when the daily mean is strictly greater than ``threshold``
        * ``0`` when the daily mean is strictly less than ``-threshold``
        * ``1`` otherwise (inside the band, boundaries included)

        A daily mean that is NaN (no valid score for that date) also gets
        ``1``; previously it matched no branch and the label list came out
        shorter than the frame, raising ``ValueError`` on assignment.
    """
    print('find_pred_label')
    # group the data by dates
    daily = df.groupby(['dates'])['compound_vader_score'].mean().reset_index()
    scores = daily['compound_vader_score']

    # Start from the neutral label and overwrite the two tails. The "above"
    # mask is applied last so it wins if both masks match (negative
    # threshold), which mirrors the order of the original if/elif chain.
    labels = pd.Series(1, index=daily.index, dtype='int64')
    labels.loc[scores < -threshold] = 0
    labels.loc[scores > threshold] = 2

    daily['vader_label'] = labels
    return daily


def merge_actual_label(df, hsi_movement_df):
    """Inner-join the per-date VADER labels with the HSI movement file.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-date VADER results; must contain a ``dates`` column. The frame is
        not modified (the previous in-place ``set_index`` stripped the
        ``dates`` column from the caller's object).
    hsi_movement_df : str
        Despite the name, this is the *path* of a CSV file, not a DataFrame.
        The file must contain a ``dates`` column.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``dates`` value occurs in both inputs, with ``dates`` as
        the first column followed by the columns of ``df`` and then those of
        the CSV. A leftover ``Unnamed: 0`` column (a saved row index) is
        removed when present.
    """
    print('merge_actual_label')
    # set_index without inplace returns a new frame, leaving the caller's df intact
    vader_data = df.set_index(keys=["dates"])
    label_data = pd.read_csv(hsi_movement_df).set_index(keys=["dates"])
    # inner join the two datasets using the date index
    merged = pd.merge(vader_data, label_data, how='inner',
                      left_index=True, right_index=True)
    merged = merged.reset_index()
    # drop the redundant saved-index column; tolerate files written without it
    return merged.drop(columns=['Unnamed: 0'], errors='ignore')

In [1]:
def starter_vader(ticker):
    """Run the VADER labelling pipeline for one ticker and save the result.

    ``ticker`` is a string symbol; it is left-padded with zeros to five
    characters to build the news and result file names under ``dir_name``.
    The news file is scored with ``read_news_path``, labelled per date with
    ``find_news_pred_label`` (threshold 0.01), merged with the HSI movement
    file by ``merge_actual_label`` and, if any rows remain, written to the
    result CSV. Nothing is returned.
    """
    code = ticker.zfill(5)

    # input news, output result, and the shared HSI movement file
    news_path = os.path.join(
        dir_name, 'data-news/data-aastock/data-{}-aastock.csv'.format(code))
    result_path = os.path.join(
        dir_name,
        'data-results/vader-results/hkex-results/hkex-aastock/data-{}-result.csv'.format(code))
    hsi_movement_path = os.path.join(dir_name, 'train-data/hkex/hsi_movement.csv')

    # NOTE(review): passing names= without header= makes pandas treat the first
    # line of the file as data — assumes the news CSVs have no header row;
    # TODO confirm.
    news = pd.read_csv(news_path, names=['dates', 'news', 'ticker', 'newstype'])

    scored = read_news_path(news)
    labelled = find_news_pred_label(scored, 0.01)
    merged = merge_actual_label(labelled, hsi_movement_path)

    # only persist tickers that have at least one date in common with the HSI file
    if not merged.empty:
        merged.to_csv(result_path, index=False)

# Run the VADER pipeline for every HKEX ticker, chunk by chunk.
# `hkex_df` (the chunked symbol list) and `starter_vader` must already be
# defined in the kernel, i.e. the earlier cells have to be executed first.
for tickers in hkex_df:
    for ticker in tickers:
        print(ticker)
        starter_vader(ticker)

NameError: name 'hkex_df' is not defined