In [1]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 5.7 MB/s eta 0:00:01
Collecting scipy>=1.0
  Downloading scipy-1.6.0-cp39-cp39-macosx_10_9_x86_64.whl (30.9 MB)
[K     |████████████████████████████████| 30.9 MB 10.6 MB/s eta 0:00:01
[?25hInstalling collected packages: scipy, seaborn
Successfully installed scipy-1.6.0 seaborn-0.11.1
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


In [13]:
import os 
import pandas as pd

from sklearn.model_selection import train_test_split
import numpy as np

# Root folder of the sentiment data, given relative to the notebook's working
# directory. Every input/output path below is built from it.
# Earlier variant that derived the folder from the current working directory:
#dir_name = os.getcwd()+'/data/'
dir_name = '../database/sentiment-data/'

In [14]:
import nltk

# Fetch the 'vader_lexicon' NLTK package; SentimentIntensityAnalyzer is created
# in the next cell, after this download has run.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/angelwoo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [15]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single shared VADER analyzer; read_tweets_us_path and read_news_us_path use it
# to compute the compound score of every text row.
analyzer = SentimentIntensityAnalyzer()

In [16]:
def read_tweets_us_path(path):
    """Load a tweets CSV and attach a VADER compound score to every row.

    Args:
        path: File location relative to ``<dir_name>/train-data/``.

    Returns:
        The loaded DataFrame with an extra ``compound_vader_score`` column
        holding the VADER ``compound`` value of each row's ``tweets`` text.
    """
    print('read_us_path')
    # Resolve the CSV location under the shared data directory.
    csv_path = os.path.join(dir_name, 'train-data/' + path)
    frame = pd.read_csv(csv_path)
    # Score each tweet in row order, keeping only VADER's compound value.
    frame['compound_vader_score'] = [
        analyzer.polarity_scores(text)['compound'] for text in frame['tweets']
    ]
    print(frame)
    return frame

def read_news_us_path(path):
    """Load a news CSV and attach a VADER compound score to every row.

    Args:
        path: File location relative to ``<dir_name>/train-data/``.

    Returns:
        The loaded DataFrame with an extra ``compound_vader_score`` column
        holding the VADER ``compound`` value of each row's ``news`` text.
    """
    print('read_us_path')
    # Resolve the CSV location under the shared data directory.
    csv_path = os.path.join(dir_name, 'train-data/' + path)
    frame = pd.read_csv(csv_path)
    # Score each news item in row order, keeping only VADER's compound value.
    # (Rows whose score is exactly 0 are kept; a filter removing them is disabled.)
    frame['compound_vader_score'] = [
        analyzer.polarity_scores(text)['compound'] for text in frame['news']
    ]
    print(frame)
    return frame

In [17]:
def find_tweets_pred_label(grouped_data,file_name,perc_change):
    """Average tweet scores per (date, ticker), label them and save to CSV.

    Args:
        grouped_data: DataFrame with one row per tweet and the columns
            ``dates``, ``ticker``, ``tweets`` and ``compound_vader_score``.
        file_name: Path of the CSV file that is written.
        perc_change: Half-width of the neutral band around zero. A mean score
            above ``perc_change`` is labelled 2 (positive), one below
            ``-perc_change`` is labelled 0 (negative), everything else
            1 (neutral).

    Returns:
        None. The written CSV has the columns ``dates``, ``ticker``,
        ``compound_vader_score`` (group mean), ``vader_label`` and ``tweets``
        (all tweets of the group joined with ``' | '``).
    """
    print('find_pred_label')
    by_day = grouped_data.groupby(['dates', 'ticker'])
    summary = by_day['compound_vader_score'].mean().reset_index()

    # Map the mean score to 0 (negative) / 1 (neutral) / 2 (positive).
    mean_score = summary['compound_vader_score']
    summary['vader_label'] = np.select(
        [mean_score > perc_change, mean_score < -perc_change],
        [2, 0],
        default=1,
    )

    # Collect the texts from the same groupby so each row carries the tweets of
    # its own (date, ticker). Assigning the raw column back would align on the
    # row index and pair every group with an unrelated tweet.
    joined = by_day['tweets'].agg(
        lambda texts: ' | '.join(texts.dropna().astype(str))
    )
    summary['tweets'] = joined.to_numpy()
    summary.to_csv(file_name)

def find_news_pred_label(grouped_data,file_name,perc_change):
    """Average news scores per (date, ticker), label them and save to CSV.

    Args:
        grouped_data: DataFrame with one row per news item and the columns
            ``dates``, ``ticker``, ``news`` and ``compound_vader_score``.
        file_name: Path of the CSV file that is written.
        perc_change: Half-width of the neutral band around zero. A mean score
            above ``perc_change`` is labelled 2 (positive), one below
            ``-perc_change`` is labelled 0 (negative), everything else
            1 (neutral).

    Returns:
        None. The written CSV has the columns ``dates``, ``ticker``,
        ``compound_vader_score`` (group mean), ``vader_label`` and ``news``
        (all news items of the group joined with ``' | '``).
    """
    print('find_pred_label')
    by_day = grouped_data.groupby(['dates', 'ticker'])
    summary = by_day['compound_vader_score'].mean().reset_index()

    # Map the mean score to 0 (negative) / 1 (neutral) / 2 (positive).
    mean_score = summary['compound_vader_score']
    summary['vader_label'] = np.select(
        [mean_score > perc_change, mean_score < -perc_change],
        [2, 0],
        default=1,
    )

    # Collect the texts from the same groupby so each row carries the news of
    # its own (date, ticker). Assigning the raw column back would align on the
    # row index and pair every group with an unrelated news item.
    joined = by_day['news'].agg(
        lambda texts: ' | '.join(texts.dropna().astype(str))
    )
    summary['news'] = joined.to_numpy()
    summary.to_csv(file_name)

In [18]:
def merge_actual_label(join_path, label_data):
    """Join the VADER predictions with the actual-label data.

    Args:
        join_path: CSV produced by the VADER prediction step; must contain
            ``dates`` and ``ticker`` columns.
        label_data: CSV holding the actual price-movement label of each ticker
            on each date; must contain ``dates`` and ``ticker`` columns.

    Returns:
        DataFrame indexed by (``dates``, ``ticker``) containing only the keys
        present in both files (inner join). The redundant ``Unnamed: 0_y``
        column, if the join produced one, is removed.
    """
    print('merge_actual_label')
    keys = ["dates", "ticker"]
    vader_data = pd.read_csv(join_path).set_index(keys)
    actual_data = pd.read_csv(label_data).set_index(keys)

    # Inner join between the predicted dataset and the dataset with actual labels.
    merge = pd.merge(vader_data, actual_data, how='inner',
                     left_index=True, right_index=True)
    # DataFrame.drop returns a new frame, so the result must be kept.
    # errors='ignore' covers label files saved without an index column, where
    # no 'Unnamed: 0_y' column exists.
    merge = merge.drop(columns=['Unnamed: 0_y'], errors='ignore')

    print(merge)
    return merge


In [19]:
import seaborn as sns
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        cf: Confusion matrix as a 2-D numpy array.
        group_names: Optional per-cell names; used only when their number
            equals ``cf.size``.
        categories: Tick labels passed to seaborn for both axes.
        count: Show the raw cell value in each box.
        percent: Show each cell's share of the matrix total in each box.
        cbar: Forwarded to seaborn to toggle the colour bar.
        xyticks: When False, tick labels are switched off.
        xyplotlabels: Add the 'True label' / 'Predicted label' axis titles.
        sum_stats: Append accuracy (plus precision, recall and F1 for a 2x2
            matrix) below the x-axis label.
        figsize: Figure size; defaults to matplotlib's ``figure.figsize``.
        cmap: Colormap forwarded to seaborn.
        title: Optional plot title.
    """
    n_cells = cf.size
    empty = ['' for _ in range(n_cells)]

    # Build the three text fragments shown in every box: name, count, share.
    if group_names and len(group_names) == n_cells:
        name_part = ["{}\n".format(name) for name in group_names]
    else:
        name_part = empty
    count_part = ["{0:0.0f}\n".format(cell) for cell in cf.flatten()] if count else empty
    share_part = (
        ["{0:.2%}".format(share) for share in cf.flatten()/np.sum(cf)]
        if percent else empty
    )

    annotations = np.asarray(
        [(name + cnt + share).strip()
         for name, cnt, share in zip(name_part, count_part, share_part)]
    ).reshape(cf.shape[0], cf.shape[1])

    # Summary statistics appended to the x-axis label.
    if not sum_stats:
        stats_text = ""
    else:
        # Accuracy is the diagonal sum over the total of all cells.
        accuracy = np.trace(cf) / float(np.sum(cf))
        if len(cf) != 2:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
        else:
            # Binary case: row/column 1 is treated as the positive class.
            precision = cf[1,1] / sum(cf[:,1])
            recall = cf[1,1] / sum(cf[1,:])
            f1_score = 2*precision*recall / (precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score)

    if figsize==None:
        # Fall back to the globally configured default size.
        figsize = plt.rcParams.get('figure.figsize')

    if xyticks==False:
        categories = False

    # Render the heatmap with the pre-formatted annotations (fmt="" keeps them verbatim).
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=annotations, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
    x_caption = 'Predicted label' + stats_text if xyplotlabels else stats_text
    plt.xlabel(x_caption)

    if title:
        plt.title(title)

In [20]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

def validation(df,name,actual_col='label',pred_col='vader_label'):
    """Plot the confusion matrix of actual vs. VADER labels and save ``df``.

    Args:
        df: Merged DataFrame holding both the actual and the predicted label.
        name: Path of the CSV file ``df`` is written to.
        actual_col: Column with the actual label. NOTE(review): the default
            ``'label'`` is an assumption, the labelled-data schema is not
            visible here; confirm it or pass the real column name.
        pred_col: Column with the VADER label (``vader_label`` is the column
            written by the find_*_pred_label functions).

    Raises:
        KeyError: If ``actual_col`` or ``pred_col`` is not a column of ``df``.
    """
    print(df)
    missing = [col for col in (actual_col, pred_col) if col not in df.columns]
    if missing:
        raise KeyError(
            "validation: column(s) {} not found; available columns: {}".format(
                missing, list(df.columns)))

    categories = ['Negative','Neutral', 'Positive']
    # Fix the label order to 0/1/2 so the matrix is always 3x3 and lines up with
    # `categories`. Assumes the actual labels use the same 0/1/2 encoding as
    # vader_label (TODO confirm).
    cm = confusion_matrix(df[actual_col], df[pred_col], labels=[0, 1, 2])
    # The binary group names (True Neg, False Pos, ...) do not apply to a
    # 3-class matrix, so none are passed.
    make_confusion_matrix(cm, categories=categories)
    df.to_csv(name)

In [21]:
def us_tweets_starter(path,file_name,label_data,percent_change):
    """Run the tweet pipeline: score, label per (date, ticker), join with labels.

    Args:
        path: Tweets CSV, relative to ``<dir_name>/train-data/``.
        file_name: Path the predicted-label CSV is written to; it is read back
            for the join.
        label_data: CSV with the corresponding price movement of each ticker
            on each date.
        percent_change: Neutral-band threshold forwarded to
            ``find_tweets_pred_label``.

    Returns:
        The DataFrame returned by ``merge_actual_label``.
    """
    # Step 1: load the tweets and score every row.
    scored_tweets = read_tweets_us_path(path)
    # Step 2: aggregate per date/ticker, label, and persist to file_name.
    find_tweets_pred_label(scored_tweets, file_name, percent_change)
    # Step 3: join the persisted predictions with the actual labels.
    return merge_actual_label(file_name, label_data)


def us_news_starter(path,file_name,label_data,percent_change):
    """Run the news pipeline: score, label per (date, ticker), join with labels.

    Args:
        path: News CSV, relative to ``<dir_name>/train-data/``.
        file_name: Path the predicted-label CSV is written to; it is read back
            for the join.
        label_data: CSV with the corresponding price movement of each ticker
            on each date.
        percent_change: Neutral-band threshold forwarded to
            ``find_news_pred_label``.

    Returns:
        The DataFrame returned by ``merge_actual_label``.
    """
    # Step 1: load the news and score every row.
    scored_news = read_news_us_path(path)
    # Step 2: aggregate per date/ticker, label, and persist to file_name.
    find_news_pred_label(scored_news, file_name, percent_change)
    # Step 3: join the persisted predictions with the actual labels.
    return merge_actual_label(file_name, label_data)

In [22]:
# NASDAQ tweets: score with VADER, label each (date, ticker) with a neutral band
# of +/-0.01 around zero, and join the predictions with the labelled data.
#
# Disabled experiment, kept for reference: sweep the threshold to compare accuracies.
# threshold=np.linspace(0.0,0.02, num=5)
# nasdaq_tweets_accuracies=[]
# for val in threshold:
#     print (val)


# Input CSV, relative to <dir_name>/train-data/ (resolved inside read_tweets_us_path).
nasdaq_tweets_path = 'nasdaq/nasdaq_no_labelled_tweets.csv'
# Where the per-(date, ticker) VADER predictions are written.
nasdaq_vader_tweets_path = os.path.join(dir_name,'train-data/nasdaq/nasdaq_vader_tweets.csv')
# Labelled data the predictions are joined with (passed as label_data).
join_path = os.path.join(dir_name,'train-data/nasdaq/nasdaq_labelled_data.csv')
nasdaq_tweets_df = us_tweets_starter(nasdaq_tweets_path,nasdaq_vader_tweets_path,join_path,0.01)

read_us_path


FileNotFoundError: [Errno 2] No such file or directory: '../database/sentiment-data/train-data/nasdaq/nasdaq_no_labelled_tweets.csv'

In [12]:
# Visualize the confusion matrix (with accuracy) for the NASDAQ tweet predictions
# and save the merged frame to nasdaq_vader_tweets_label.csv in the working directory.
validation(nasdaq_tweets_df,'nasdaq_vader_tweets_label.csv')

NameError: name 'nasdaq_tweets_df' is not defined

In [None]:
# NASDAQ news: score with VADER, label each (date, ticker) with a neutral band
# of +/-0.01 around zero, and join the predictions with the labelled data.
#
# Disabled experiment, kept for reference: sweep the threshold to compare accuracies.
# threshold=np.linspace(0.0, 0.02, num=5)
# nasdaq_news_accuracies=[]
# for val in threshold:
#     print(val)

# Input CSV, relative to <dir_name>/train-data/ (resolved inside read_news_us_path).
nasdaq_news_path='nasdaq/nasdaq_no_labelled_news.csv'

# Where the per-(date, ticker) VADER predictions are written.
nasdaq_vader_news_path=os.path.join(dir_name,'train-data/nasdaq/nasdaq_vader_news.csv')
# Labelled data the predictions are joined with (passed as label_data).
join_path=os.path.join(dir_name,'train-data/nasdaq/nasdaq_labelled_data.csv')

nasdaq_news_df=us_news_starter(nasdaq_news_path,nasdaq_vader_news_path,join_path,0.01)
# Part of the disabled sweep; find_accuracy is not defined in this notebook.
# nasdaq_news_accuracies.append(find_accuracy(nasdaq_news_df))


In [None]:
# Visualize the confusion matrix (with accuracy) for the NASDAQ news predictions
# and save the merged frame to nasdaq_vader_news_label.csv in the working directory.
validation(nasdaq_news_df,'nasdaq_vader_news_label.csv')

In [None]:
# NYSE news: score with VADER, label each (date, ticker) with a neutral band
# of +/-0.01 around zero, and join the predictions with the labelled data.
#
# Disabled experiment, kept for reference: sweep the threshold to compare accuracies.
# threshold=np.linspace(0.0, 0.02, num=5)
# nyse_newss_accuracies=[]
# for val in threshold:
#     print(val)
# Input CSV, relative to <dir_name>/train-data/ (resolved inside read_news_us_path).
nyse_news_path='nyse/nyse_no_labelled_news.csv'
# Where the per-(date, ticker) VADER predictions are written.
nyse_vader_news_path=os.path.join(dir_name,'train-data/nyse/nyse_vader_news.csv')
# Labelled data the predictions are joined with (passed as label_data).
join_path=os.path.join(dir_name,'train-data/nyse/nyse_labelled_data.csv')

nyse_news_df=us_news_starter(nyse_news_path,nyse_vader_news_path,join_path,0.01)
# Part of the disabled sweep; find_accuracy is not defined in this notebook.
#     nyse_newss_accuracies.append(find_accuracy(nyse_news_df))

In [None]:
# Visualize the confusion matrix (with accuracy) for the NYSE news predictions
# and save the merged frame to nyse_vader_news_label.csv in the working directory.
validation(nyse_news_df,'nyse_vader_news_label.csv')

In [None]:

# NYSE tweets: score with VADER, label each (date, ticker) with a neutral band
# of +/-0.01 around zero, and join the predictions with the labelled data.
#
# Disabled experiment, kept for reference: sweep the threshold to compare accuracies.

# threshold=np.linspace(0.0, 0.02, num=5)
# nyse_tweets_accuracies=[]
# for val in threshold:
#     print(val)

# Input CSV, relative to <dir_name>/train-data/ (resolved inside read_tweets_us_path).
nyse_tweets_path='nyse/nyse_no_labelled_tweets.csv'
# Where the per-(date, ticker) VADER predictions are written.
nyse_vader_tweets_path=os.path.join(dir_name,'train-data/nyse/nyse_vader_tweets.csv')
# Labelled data the predictions are joined with (passed as label_data).
join_path=os.path.join(dir_name,'train-data/nyse/nyse_labelled_data.csv')
nyse_tweets_df=us_tweets_starter(nyse_tweets_path,nyse_vader_tweets_path,join_path,0.01)
# Part of the disabled sweep; find_accuracy is not defined in this notebook.
#     nyse_tweets_accuracies.append(find_accuracy(nyse_tweets_df))


In [None]:
# Visualize the confusion matrix (with accuracy) for the NYSE tweet predictions
# and save the merged frame to nyse_vader_tweets.csv in the working directory.
# NOTE(review): unlike the NASDAQ cells and the NYSE news cell, this output name
# has no `_label` suffix; confirm that is intended.
validation(nyse_tweets_df,'nyse_vader_tweets.csv')

In [None]:



# HKEX news: score with VADER, label each (date, ticker) with a neutral band
# of +/-0.01 around zero, and join the predictions with hkex_merge_data.csv.
# The same us_news_starter pipeline is reused here for the HKEX data.
#
# Disabled experiment, kept for reference: sweep the threshold to compare accuracies.

# threshold=np.linspace(0.0, 0.02, num=5)
# hk_accuracies=[]
# for val in threshold:
#     print (val)

# Input CSV, relative to <dir_name>/train-data/ (resolved inside read_news_us_path).
hkex_news_path='hkex/hkex_no_labelled_news.csv'
# Where the per-(date, ticker) VADER predictions are written.
hkex_vader_news_path=os.path.join(dir_name,'train-data/hkex/hkex_vader_news.csv')
# Data the predictions are joined with (passed as label_data).
join_path=os.path.join(dir_name,'train-data/hkex/hkex_merge_data.csv')
hkex_news_df=us_news_starter(hkex_news_path,hkex_vader_news_path,join_path,0.01)
# Part of the disabled sweep; find_accuracy is not defined in this notebook.
#     hk_accuracies.append(find_accuracy(hkex_news_df))


In [None]:
# Visualize the confusion matrix (with accuracy) for the HKEX news predictions
# and save the merged frame to hkex_vader_news.csv in the working directory.
# NOTE(review): unlike the NASDAQ cells and the NYSE news cell, this output name
# has no `_label` suffix; confirm that is intended.
validation(hkex_news_df,'hkex_vader_news.csv')
