In [None]:
!pip install textblob

In [None]:
!pip install --user -U nltk

In [None]:
import os 
import pandas as pd

# Root folder for every dataset used in this notebook, relative to the
# directory the kernel was started in.
dir_name = os.getcwd() + '/sentiment-data/'
hkex_files = os.path.join(dir_name, 'stock_ticker_datasets/hkex.csv')

hkex = pd.read_csv(hkex_files)
# Keep only the equity listings. The explicit .copy() makes the filtered frame
# independent of the one read from disk, so the column assignment below does
# not raise pandas' SettingWithCopyWarning.
hkex = hkex.loc[hkex['Category'] == 'Equity'].copy()

# Tickers are handled as strings so they can later be zero-padded with zfill().
hkex['Ticker'] = hkex['Ticker'].astype(str)
hkex_input = hkex['Ticker']

# chunk row size
n = 500

print(hkex_input)
# Split the ticker series into consecutive chunks of at most n rows.
hkex_df = [hkex_input[i:i + n] for i in range(0, len(hkex_input), n)]

In [None]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import twitter_samples 

# Fetch NLTK's 'vader_lexicon' resource; presumably required by the
# SentimentIntensityAnalyzer created in the next cell — TODO confirm.
nltk.downloader.download('vader_lexicon')


In [None]:
# Single module-level VADER analyser; read_news_vader_path() reads this global.
analyser = SentimentIntensityAnalyzer()

In [None]:
#### Function for Vader Analysis 

# compute the VADER compound score for every news row
def read_news_vader_path(df):
    """Add a ``compound_vader_score`` column holding each headline's VADER score.

    The text of every row of ``df['news']`` is scored with the module-level
    ``analyser`` and the ``'compound'`` entry of the result is kept.

    The column is written onto ``df`` itself, and that same object is returned.
    """
    print('read in datasets')
    # Score every headline first, then attach the whole column in one step.
    compound_scores = [
        analyser.polarity_scores(text)['compound'] for text in df['news']
    ]
    df['compound_vader_score'] = compound_scores
    return df


# group by the mean compound vader score by dates
def find_news_vader_pred_label(df,threshold):
    """Average the VADER compound score per date and turn it into a label.

    Parameters
    ----------
    df : DataFrame with at least the columns ``dates`` and
        ``compound_vader_score``. It is not modified.
    threshold : width of the neutral band around zero.

    Returns
    -------
    A new DataFrame with one row per date and the columns ``dates``,
    ``compound_vader_score`` (the daily mean) and ``vader_label``:
    2 when the mean is above ``threshold``, 0 when it is below
    ``-threshold`` and 1 otherwise.

    A date whose mean is NaN is labelled 1 (neutral). The previous
    if/elif chain appended nothing for NaN, which made the column
    assignment fail with a length mismatch.
    """
    print('find_pred_label')
    # group the data by dates
    df = df.groupby(['dates'])['compound_vader_score'].mean().reset_index()
    print(df)

    scores = df['compound_vader_score']
    # Start from "neutral" and overwrite the rows outside the band. The
    # positive mask is applied last so it wins if both masks match, mirroring
    # the order of the original if/elif chain.
    labels = pd.Series(1, index=df.index, dtype='int64')
    labels[scores < -threshold] = 0
    labels[scores > threshold] = 2

    df['vader_label'] = labels
    return df


# merge the dataset with the hang seng index daily moving average
def merge_vader_actual_label (df,hsi_movement_df):
    """Inner-join the daily VADER labels with the labels stored in a CSV file.

    Parameters
    ----------
    df : DataFrame with a ``dates`` column. It is left untouched; the previous
        version re-indexed it in place, so the caller lost its ``dates``
        column and a second call failed.
    hsi_movement_df : path of a CSV file that has a ``dates`` column.

    Returns
    -------
    DataFrame holding the rows whose date appears in both inputs, with
    ``dates`` as a regular column. An ``Unnamed: 0`` column is removed if
    present; its absence is no longer an error.
    """
    print('merge_actual_label')
    # set_index without inplace returns a new frame, so df itself is unchanged.
    vader_data = df.set_index(keys=["dates"])
    label_data = pd.read_csv(hsi_movement_df).set_index(keys=["dates"])
    # inner join the two datasets using the date index
    merge = pd.merge(vader_data, label_data, how='inner', left_index=True, right_index=True)
    merge = merge.reset_index()
    # drop the redundant column, which only exists when the CSV was written
    # together with its index
    merge = merge.drop(columns=['Unnamed: 0'], errors='ignore')

    return merge

In [None]:
#### Function for Textblob Analysis ###


from textblob import TextBlob
# append the normalized TextBlob polarity score (-1 to 1) to the corresponding news
def read_news_textblob_path(df):
    """Add a ``compound_textblob_score`` column to ``df`` and return it.

    For every row the text in ``df['news']`` is wrapped in a ``TextBlob`` and
    the first element of its ``sentiment`` result is stored.

    The column is written onto ``df`` itself, and that same object is returned.
    """
    print('read in datasets')
    # Score every headline first, then attach the whole column in one step.
    polarity_scores = [TextBlob(text).sentiment[0] for text in df['news']]
    df['compound_textblob_score'] = polarity_scores
    return df


# group by the mean compound textblob score by dates
def find_news_textblob_pred_label(df,threshold):
    """Average the TextBlob score per date and turn it into a label.

    Parameters
    ----------
    df : DataFrame with at least the columns ``dates`` and
        ``compound_textblob_score``. It is not modified.
    threshold : width of the neutral band around zero.

    Returns
    -------
    A new DataFrame with one row per date and the columns ``dates``,
    ``compound_textblob_score`` (the daily mean) and ``textblob_label``:
    2 when the mean is above ``threshold``, 0 when it is below
    ``-threshold`` and 1 otherwise.

    A date whose mean is NaN is labelled 1 (neutral). The previous
    if/elif chain appended nothing for NaN, which made the column
    assignment fail with a length mismatch.
    """
    print('find_pred_label')
    # group the data by dates
    df = df.groupby(['dates'])['compound_textblob_score'].mean().reset_index()

    scores = df['compound_textblob_score']
    # Start from "neutral" and overwrite the rows outside the band. The
    # positive mask is applied last so it wins if both masks match, mirroring
    # the order of the original if/elif chain.
    labels = pd.Series(1, index=df.index, dtype='int64')
    labels[scores < -threshold] = 0
    labels[scores > threshold] = 2

    df['textblob_label'] = labels
    return df

# merge the dataset with the hang seng index daily moving average
def merge_textblob_actual_label (df,hsi_movement_df):
    """Inner-join the daily TextBlob labels with the labels stored in a CSV file.

    Parameters
    ----------
    df : DataFrame with a ``dates`` column. It is left untouched; the previous
        version re-indexed it in place, so the caller lost its ``dates``
        column and a second call failed.
    hsi_movement_df : path of a CSV file that has a ``dates`` column.

    Returns
    -------
    DataFrame holding the rows whose date appears in both inputs, with
    ``dates`` as a regular column. An ``Unnamed: 0`` column is removed if
    present; its absence is no longer an error.
    """
    print('merge_actual_label')
    # set_index without inplace returns a new frame, so df itself is unchanged.
    textblob_data = df.set_index(keys=["dates"])
    label_data = pd.read_csv(hsi_movement_df).set_index(keys=["dates"])
    # inner join the two datasets using the date index
    merge = pd.merge(textblob_data, label_data, how='inner', left_index=True, right_index=True)
    merge = merge.reset_index()
    # drop the redundant column, which only exists when the CSV was written
    # together with its index
    merge = merge.drop(columns=['Unnamed: 0'], errors='ignore')

    return merge





In [None]:
import pandas as pd
from nltk.tokenize import WordPunctTokenizer as wpt
import math


def build_dict(stopwords_source=None):
    """Load the positive, negative and stop-word lists for the Loughran scorer.

    Parameters
    ----------
    stopwords_source : optional path or URL of a one-word-per-line stop-word
        file. When omitted, the shared Google Drive file is downloaded.

    Returns
    -------
    (pos_words, neg_words, stop_list): three lists of strings. The sentiment
    words are lower-cased; the stop words are returned as read.
    """
    # The original code pointed pandas at the Drive ".../view?usp=sharing"
    # page, which is the browser preview (an HTML document), not the file.
    # The "uc?export=download" form of the same file id serves the content.
    # NOTE(review): confirm the file is still shared publicly.
    if stopwords_source is None:
        stopwords_source = "https://drive.google.com/uc?export=download&id=0B4niqV00F3msSktONVhfaElXeEk"

    # Load master dictionary
    path = os.path.join(dir_name,'train-data/hkex/LoughranMcDonald_MasterDictionary_2016.csv')
    master = pd.read_csv(path)

    def _lowered_words(flag_column):
        # Words whose flag column is positive. Entries that read_csv parsed
        # as missing are skipped because they have no .lower().
        flagged = master.loc[master[flag_column] > 0, 'Word'].dropna()
        return [str(word).lower() for word in flagged]

    pos_words = _lowered_words('Positive')
    neg_words = _lowered_words('Negative')

    stopwords = pd.read_csv(stopwords_source, names=['Word'])
    stop_list = stopwords['Word'].tolist()
    return pos_words,neg_words,stop_list


def process_news_Loughran(df,pos_words,neg_words,stop_lists,tokenizer=None):
    """Label every headline by word counts and average the labels per date.

    Each headline is tokenised and lower-cased, stop words are removed, and
    the remaining tokens are counted against the positive and negative lists:
    more positive words gives 2, more negative words gives 0, a tie gives 1.

    Parameters
    ----------
    df : DataFrame with the columns ``dates`` and ``news``. It is not modified.
    pos_words, neg_words : iterables of lower-case sentiment words.
    stop_lists : iterable of stop words; compared case-insensitively.
    tokenizer : optional object with a ``tokenize(text)`` method. Defaults to
        a new ``WordPunctTokenizer`` (``wpt``).

    Returns
    -------
    DataFrame with one row per date and the columns ``dates`` and
    ``loughran_label`` (the rounded mean of that date's headline labels).

    Fixes relative to the previous version
    --------------------------------------
    * Tokens were stored as generators, so the positive-word pass exhausted
      each one and the negative count was always 0; label 0 was never produced.
    * The stop-word filter compared generator objects with the stop list and
      its result was overwritten by the loop variable, so it had no effect.
    * The growing label list was printed once per headline.
    """
    print('read in datasets')
    if tokenizer is None:
        tokenizer = wpt()

    # Sets make the per-token membership tests constant time.
    positive = set(pos_words)
    negative = set(neg_words)
    stop_words = {str(word).lower() for word in stop_lists}

    Loughran_label = []
    for text in df['news']:
        # tokenize, lower-case, then remove the stopwords
        tokens = [token.lower() for token in tokenizer.tokenize(text)]
        tokens = [token for token in tokens if token not in stop_words]

        n_pos = sum(1 for token in tokens if token in positive)
        n_neg = sum(1 for token in tokens if token in negative)

        if n_pos > n_neg:
            Loughran_label.append(2)
        elif n_pos < n_neg:
            Loughran_label.append(0)
        else:
            Loughran_label.append(1)

    # assign() returns a new frame, so the caller's df keeps its columns.
    labelled = df.assign(loughran_label=Loughran_label)
    return labelled.groupby(['dates'])['loughran_label'].mean().round().reset_index()

# merge the dataset with the hang seng index daily moving average
def merge_textblob_actual_label (df,hsi_movement_df):
    """Inner-join the daily labels in ``df`` with the labels stored in a CSV file.

    NOTE(review): this definition re-binds a name that is already defined in
    the TextBlob cell earlier in the notebook. It sits in the Loughran cell,
    so it was presumably meant to be ``merge_loughran_actual_label``. The name
    is kept here so existing callers keep working.

    Parameters
    ----------
    df : DataFrame with a ``dates`` column. It is left untouched; the previous
        version re-indexed it in place, so the caller lost its ``dates``
        column and a second call failed.
    hsi_movement_df : path of a CSV file that has a ``dates`` column.

    Returns
    -------
    DataFrame holding the rows whose date appears in both inputs, with
    ``dates`` as a regular column. An ``Unnamed: 0`` column is removed if
    present; its absence is no longer an error.
    """
    print('merge_actual_label')

    # set_index without inplace returns a new frame, so df itself is unchanged.
    sentiment_data = df.set_index(keys=["dates"])
    label_data = pd.read_csv(hsi_movement_df).set_index(keys=["dates"])
    # inner join the two datasets using the date index
    merge = pd.merge(sentiment_data, label_data, how='inner', left_index=True, right_index=True)
    merge = merge.reset_index()
    # drop the redundant column, which only exists when the CSV was written
    # together with its index
    merge = merge.drop(columns=['Unnamed: 0'], errors='ignore')

    return merge


In [None]:
### Starter function for vader sentiment analysis ###

def starter_vader(path,result_path):
    """Score one news CSV with VADER and write its daily labels to ``result_path``.

    ``path`` is read as a header-less CSV with the columns dates, news, ticker
    and newstype. Nothing is written when the labelled frame is empty.
    """
    news = pd.read_csv(path, names=['dates','news','ticker','newstype'])
    # Per-headline compound score, then the per-date label using a 0.01 band.
    scored = read_news_vader_path(news)
    daily = find_news_vader_pred_label(scored, 0.01)
    if not daily.empty:
        daily.to_csv(result_path, index=False)


In [None]:
### Starter function for TextBlob sentiment analysis ###

def starter_textblob(path,result_path):
    """Add a ``textblob_label`` column to the CSV already stored at ``result_path``.

    ``path`` is read as a header-less CSV with the columns dates, news, ticker
    and newstype, scored with TextBlob and labelled per date using a 0.01 band.
    The existing result file is then read, given the new column and written
    back, unless it is empty.

    NOTE(review): the label column is attached by row index, not by matching
    ``dates``. This presumably relies on the result file having been produced
    from the same news file — confirm.
    """
    news = pd.read_csv(path, names=['dates','news','ticker','newstype'])
    scored = read_news_textblob_path(news)
    daily = find_news_textblob_pred_label(scored, 0.01)

    results = pd.read_csv(result_path)
    results['textblob_label'] = daily['textblob_label']

    # store to the csv file if the dataset is not empty
    if not results.empty:
        results.to_csv(result_path, index=False)
        


        


In [None]:
### Starter function for Loughran-McDonald dictionary sentiment analysis ###

def starter_Loughran(path,result_path):
    """Add a ``loughran_label`` column to the CSV already stored at ``result_path``.

    ``path`` is read as a header-less CSV with the columns dates, news, ticker
    and newstype. The word lists come from ``build_dict()`` and the per-date
    labels from ``process_news_Loughran()``. The existing result file is then
    read, given the new column and written back, unless it is empty.

    NOTE(review): the label column is attached by row index, not by matching
    ``dates``. This presumably relies on the result file having been produced
    from the same news file — confirm.
    """
    news = pd.read_csv(path, names=['dates','news','ticker','newstype'])

    word_lists = build_dict()
    daily = process_news_Loughran(news, *word_lists)

    results = pd.read_csv(result_path)
    results['loughran_label'] = daily['loughran_label']

    # store to the csv file if the dataset is not empty
    if not results.empty:
        results.to_csv(result_path, index=False)



In [None]:
# collect individual sentiment label for tickers in hkex    
def collect_individual_sentiment(ticker):
    """Run the three sentiment starters for one ticker's news file.

    The ticker is zero-padded to five characters to build the input and result
    file names under ``dir_name``. Any exception raised along the way is
    printed and swallowed, so the function always returns ``None``.
    """
    try:
        print(ticker)
        padded = ticker.zfill(5)
        path = os.path.join(dir_name, f'data-news/data-aastock-equities/data-{padded}-aastock.csv')
        result_path = os.path.join(dir_name, f'data-results/hkex-aastock/data-{padded}-result.csv')
        # starter_vader runs first and writes the result file (when it has
        # data); the other two read that file back and add their own column.
        for starter in (starter_vader, starter_textblob, starter_Loughran):
            starter(path, result_path)
    except Exception as e:
        print(e)
                
# for tickers in hkex_df:
#      for ticker in tickers:
#         collect_individual_sentiment(ticker)
            
            
# collect_individual_sentiment('669')

In [None]:
# get the market sentiment
# Aggregated news file and the CSV that will hold its daily labels; both
# names are relative to the current working directory.
path='hkex_agg_equity_news.csv'
result_path='hkex_agg_equity_news_label.csv'
print(path)
# Call order matters: starter_vader writes result_path (when it has data);
# the next two read it back, add their own label column and overwrite it.
starter_vader(path,result_path)  
starter_textblob(path,result_path)  
starter_Loughran(path,result_path)

In [None]:
#load data

def load_data (df,path,new_path):
    """Left-join the VADER/TextBlob labels stored at ``path`` onto ``df`` and save.

    ``path`` is read with five explicit column names; only ``dates`` (used as
    the index), ``vader_label`` and ``textblob_label`` are kept. The result of
    the index-on-index left merge has its index turned back into a column,
    an ``index`` column is renamed to ``dates``, and the frame is written to
    ``new_path`` without the index.

    Any exception is printed and swallowed, so the function returns ``None``
    whether or not the file was written.

    NOTE(review): the starter_* functions write their result files with
    ``to_csv(..., index=False)``, which emits a header row by default. This
    call passes ``names=`` without ``header=0``, so that header line is
    presumably read as a data row — confirm.
    NOTE(review): ``parse_dates`` is requested here, while the callers in this
    notebook build ``df`` with a string index from ``strftime``. Verify the
    two indexes are comparable when merged.
    """
    try:
        new_df =pd.read_csv(path,names=['dates','compound_vader_score','vader_label','textblob_label','loughran_label'],index_col='dates',usecols=['dates', 'vader_label','textblob_label'],parse_dates=['dates'], na_values=['nan'])
    
        print(new_df)
        # Keep every row of df; dates without a label get NaN.
        df=pd.merge(df,new_df, how='left', left_index=True, right_index=True)
        df=df.reset_index()
        df = df.rename(columns={'index': 'dates'})
        print(df)
        df.to_csv(new_path,index=False)
            
    except Exception as e:
        # Best effort: report the problem and carry on with the next file.
        print(e)
        pass
                
                
   
    


In [None]:
import csv
import datetime, time

# get the existent sentiment label for all ticker
def sentiment_label_data_range(ticker, start='2017-01-01', end='2021-03-03'):
    """Re-index one ticker's label file onto every business day of a period.

    Parameters
    ----------
    ticker : ticker as a string; zero-padded to five characters for the
        file name.
    start, end : first and last day of the period. The defaults are the range
        that used to be hard-coded, so existing calls behave as before.

    The result is written by ``load_data`` to the ``temp_result`` folder under
    ``dir_name``. Nothing is returned.
    """
    # One row per business day, keyed by 'YYYY-MM-DD' strings.
    dates = pd.date_range(start, end, freq='B').strftime('%Y-%m-%d')
    df = pd.DataFrame(index=dates)

    file_name = 'data-' + ticker.zfill(5) + '-result.csv'
    path = os.path.join(dir_name, 'data-results/hkex-aastock/' + file_name)
    new_path = os.path.join(dir_name, 'data-results/temp_result/' + file_name)
    load_data(df, path, new_path)
    
# sentiment_label_data_range('6618')
# for ticker in hkex_input:
#     sentiment_label_data_range(ticker)


In [None]:
# get the merge data with the specific time frame
# Business days of the study period, formatted as 'YYYY-MM-DD' strings; this
# is the same range that sentiment_label_data_range() uses for each ticker.
dates = pd.date_range('2017-01-01','2021-03-03',freq='B')
dates=dates.strftime('%Y-%m-%d')
# Empty frame whose index is the list of dates; load_data() left-joins the
# market labels onto it.
df = pd.DataFrame(index=dates)
path='hkex_agg_equity_news_label.csv'
new_path='hkex_market_equity_news_label.csv'
load_data(df,  path,new_path)

In [None]:
# df =pd.read_csv('hkex_agg_equity_news_label.csv',names=['dates','compound_vader_score','vader_label','compound_textblob_score','textblob_label','loughran_label'])
# df=df.drop(['compound_textblob_score'], axis=1)
# df.to_csv('hkex_agg_equity_news_label.csv',index=False)

In [None]:
# use the market sentiment label wherever a ticker's own label is missing
# usecols=['dates', 'vader_label','textblob_label','loughran_label']
import numpy as np 
def merge_market_individual_sentiment(ticker):
    """Fill a ticker's missing sentiment labels from the market-wide labels.

    The ticker's file in the ``temp_result`` folder and
    ``hkex_market_equity_news_label.csv`` are both read. For ``vader_label``
    and ``textblob_label``, every missing value takes the market value from
    the row with the same index, and 0 if that is missing too. The ticker file
    is then overwritten in place.

    The two row-by-row loops were replaced by an index-aligned ``fillna``.
    Besides being shorter, a market file with fewer rows than the ticker file
    now falls back to 0 for the extra rows instead of raising ``KeyError``.

    NOTE(review): rows are paired by position, not by ``dates``; both files are
    presumably produced by ``load_data`` over the same date range — confirm.
    NOTE(review): 0 is also the code the labelling functions use for
    "negative" (2/1/0). Confirm that negative, rather than neutral (1), is the
    intended default.
    """
    path = os.path.join(dir_name,'data-results/temp_result/'+'data-'+ticker.zfill(5)+'-result.csv')
    market_path='hkex_market_equity_news_label.csv'

    df = pd.read_csv(path, na_values=['nan'])
    df_market = pd.read_csv(market_path)

    filled = {}
    for column in ('vader_label', 'textblob_label'):
        # Own label first, then the market label of the same row, then 0.
        filled[column] = df[column].fillna(df_market[column]).fillna(0)

    # Assign only after both columns were computed, so a failure on the second
    # column leaves nothing half-updated.
    for column, values in filled.items():
        df[column] = values

    print(df)
    df.to_csv(path,index=False)

               
# merge_market_individual_sentiment('669')

In [None]:
# Fill the missing labels of every equity ticker. A failure for one ticker is
# printed and skipped so the remaining tickers are still processed.
for ticker in hkex_input:
    try:
        
        merge_market_individual_sentiment(ticker)
    
    except Exception as e:
        print(e)
        pass

In [None]:
# extract the data for the 36 testing dataset
# The assignment used to appear twice on one physical line
# ("...]test_df=[..."), which is a SyntaxError; it is written once here.
test_df = [
    '669', '175', '1211', '2319', '6186', '168', '2269', '6618', '01801',
    '1', '4', '19', '700', '3690', '9988', '823', '16', '1113',
    '1299', '5', '939', '2', '3', '2688', '941', '762', '6823',
    '968', '868', '2382', '2899', '1818', '2689', '883', '386', '857',
]

for ticker in test_df:
    # Same file name in both folders: the ticker zero-padded to five characters.
    path = os.path.join(dir_name,'data-results/temp_result/'+'data-'+ticker.zfill(5)+'-result.csv')
    new_path = os.path.join(dir_name,'data-results/testing/testing_df/'+'data-'+ticker.zfill(5)+'-result.csv')
    df = pd.read_csv(path)
    # The files in temp_result are written with index=False, so 'Unnamed: 0'
    # is normally absent; ignore that case instead of raising KeyError.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    df.to_csv(new_path, index=False)
    
    