# 1. Get 300 Financial News Headlines


In [58]:
import pandas as pd

# Display options: truncate long cells (URLs) and cap the number of columns shown.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 10)

# Read the headlines and remove the two 'Unnamed' columns in a single chained
# expression (no in-place mutation). The original called reset_index() but
# threw the result away; assigning reset_index(drop=True) here guarantees the
# 0..n-1 index that later cells use for row alignment and Redis keys.
headlines_df = (
    pd.read_csv('data/300_stock_headlines.csv')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
    .reset_index(drop=True)
)
headlines_df.head(5)


Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5…… Million of Senior Notes,http://www.gurufocus.com/news/1153187/agilent-technologies-announces-pricing-of-500-million-of-s...,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's in the Cards?,http://www.zacks.com/stock/news/931205/agilent-a-gears-up-for-q2-earnings-whats-in-the-cards?cid...,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquidation of Six Exchange-Traded Funds,http://www.gurufocus.com/news/1138923/jp-morgan-asset-management-announces-liquidation-of-six-ex...,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys Agilent Technologies Inc, The Howard Hughes Corp, ...",http://www.gurufocus.com/news/1138704/pershing-square-capital-management-lp-buys-agilent-technol...,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden Ticket at LabCentral,http://www.gurufocus.com/news/1134012/agilent-awards-trilogy-sciences-with-a-golden-ticket-at-la...,GuruFocus,2020-05-12 00:00:00,A


# 2. Calculate Financial Sentiment for each headline
Using a pre-trained model fine-tuned on financial news/report data

In [59]:
#Calculate financial sentiment for each headline

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Tokenizer and 3-label classifier both come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)

# Wrap model + tokenizer in a sentiment-analysis pipeline.
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Plain Python list of headline strings to feed the pipeline.
headlines = headlines_df["headline"].tolist()

#get financial sentiment for all headlines
results = nlp(headlines)

#show results for first 2 headlines
print(results[:2])

[{'label': 'Neutral', 'score': 0.9999771118164062}, {'label': 'Neutral', 'score': 0.999295711517334}]


# 3. Join Financial Sentiment and Headline into a single Dataframe

In [60]:
#Put headlines and financial sentiment in 1 dataframe
sentiment_df = pd.DataFrame(results)

# pd.concat(axis=1) aligns rows by index label, not by position. The original
# called reset_index() without keeping the result, so nothing was reset; make
# the positional alignment explicit and fail loudly if the row counts differ
# (otherwise concat would silently pad with NaN).
assert len(sentiment_df) == len(headlines_df), "one sentiment result expected per headline"
result_df = pd.concat(
    [headlines_df.reset_index(drop=True), sentiment_df.reset_index(drop=True)],
    axis=1,
)
#show the first 5 rows
result_df.head(5)

Unnamed: 0,headline,url,publisher,date,stock,label,score
0,Agilent Technologies Announces Pricing of $5…… Million of Senior Notes,http://www.gurufocus.com/news/1153187/agilent-technologies-announces-pricing-of-500-million-of-s...,GuruFocus,2020-06-01 00:00:00,A,Neutral,0.999977
1,Agilent (A) Gears Up for Q2 Earnings: What's in the Cards?,http://www.zacks.com/stock/news/931205/agilent-a-gears-up-for-q2-earnings-whats-in-the-cards?cid...,Zacks,2020-05-18 00:00:00,A,Neutral,0.999296
2,J.P. Morgan Asset Management Announces Liquidation of Six Exchange-Traded Funds,http://www.gurufocus.com/news/1138923/jp-morgan-asset-management-announces-liquidation-of-six-ex...,GuruFocus,2020-05-15 00:00:00,A,Neutral,0.999535
3,"Pershing Square Capital Management, L.P. Buys Agilent Technologies Inc, The Howard Hughes Corp, ...",http://www.gurufocus.com/news/1138704/pershing-square-capital-management-lp-buys-agilent-technol...,GuruFocus,2020-05-15 00:00:00,A,Neutral,0.999965
4,Agilent Awards Trilogy Sciences with a Golden Ticket at LabCentral,http://www.gurufocus.com/news/1134012/agilent-awards-trilogy-sciences-with-a-golden-ticket-at-la...,GuruFocus,2020-05-12 00:00:00,A,Positive,0.971711


# 4. Generate Embeddings for each headline
Using a HuggingFace Sentence Embedder

In [61]:
#Generate embeddings (vectors) for each headline
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')
# Pass the whole list to encode() in one call so the library can batch the
# sentences, instead of invoking the model once per headline. list(...) keeps
# headline_vectors a list of 1-D vectors, indexable by row number as before.
headline_vectors = list(model.encode(result_df['headline'].tolist()))
#check how many dimensions in a single vector 
headline_vectors[0].shape

(768,)

# 5. Connect to Redis

In [62]:
import redis

#connect to redis
host = 'localhost'
port = 6379
redis_conn = redis.Redis(host = host, port = port)
# redis.Redis() does not open a connection until the first command is sent,
# so ping the server before claiming success; this raises if it is unreachable.
redis_conn.ping()
print ('Connected to redis')

Connected to redis


# 6. Utility functions to load data into Redis
We'll be loading into a "hash" structure (a table-like structure)

In [63]:
#load articles into redis hash
import numpy as np
def load_vectors(client: "redis.Redis", headlines_df, vector_data, vector_field_name):
    """Write every row of ``headlines_df`` to Redis as a hash, in one pipeline.

    Each row becomes the hash ``article:<index label>`` holding the headline,
    url, publisher, date, label and score columns plus the row's embedding.

    Args:
        client: Redis client used to create the (non-transactional) pipeline.
        headlines_df: DataFrame with ``headline``, ``url``, ``publisher``,
            ``date``, ``label`` and ``score`` columns.
        vector_data: Embeddings, looked up with the DataFrame's index label
            (``vector_data[index]``); each entry must support
            ``.astype(np.float32)``.
        vector_field_name: Hash field under which the embedding bytes are stored.
    """
    #pipeline the articles in one go
    p = client.pipeline(transaction=False)
    for index, row in headlines_df.iterrows():
        #hash key
        key = 'article:' + str(index)
        # Serialise the embedding as raw float32 bytes.
        headline_vector = vector_data[index].astype(np.float32).tobytes()
        #hash fields ('date' was previously read from the row but never stored)
        headline_data_mapping = {
            'headline': row['headline'],
            'url': row['url'],
            'publisher': row['publisher'],
            'date': row['date'],
            'label': row['label'],
            'score': row['score'],
            vector_field_name: headline_vector,
        }

        p.hset(key, mapping=headline_data_mapping)
    p.execute()
    

# 7. Utility functions to define vector indexes

In [64]:
#Utility Functions to Create Indexes on Vector field

def create_flat_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='COSINE'):
    """Issue FT.CREATE for index ``idx`` with a FLAT vector field.

    The schema has ``publisher`` (TAG), ``headline`` (TEXT), ``label`` (TAG),
    ``score`` (NUMERIC) and one FLOAT32 vector field.

    Args:
        redis_conn: Object exposing ``execute_command(*args)``.
        vector_field_name: Name of the vector field in the schema
            (previously ignored in favour of a hard-coded "headline_vector").
        number_of_vectors: Value sent as INITIAL_CAP
            (previously ignored in favour of a hard-coded 300).
        vector_dimensions: Value sent as DIM.
        distance_metric: Value sent as DISTANCE_METRIC.
    """
    vector_attributes = [
        "TYPE", "FLOAT32",
        "DIM", str(vector_dimensions),
        "DISTANCE_METRIC", str(distance_metric),
        "INITIAL_CAP", str(number_of_vectors),
    ]
    create_command =  ["FT.CREATE", "idx", "SCHEMA","publisher","TAG","headline","TEXT","label","TAG","score","NUMERIC"]
    # The argument count after the algorithm name is derived from the list so
    # it cannot drift out of sync with the attributes that follow it.
    create_command += [vector_field_name, "VECTOR", "FLAT", str(len(vector_attributes))]
    create_command += vector_attributes
    redis_conn.execute_command(*create_command)

def create_hnsw_index (redis_conn,vector_field_name,number_of_vectors, vector_dimensions, distance_metric='COSINE',M=40,EF=200):
    """Issue FT.CREATE for index ``idx`` with an HNSW vector field.

    The schema has ``publisher`` (TAG), ``headline`` (TEXT), ``label`` (TAG),
    ``score`` (NUMERIC) and one FLOAT32 vector field.

    Args:
        redis_conn: Object exposing ``execute_command(*args)``.
        vector_field_name: Name of the vector field in the schema
            (previously ignored in favour of a hard-coded "headline_vector").
        number_of_vectors: Value sent as INITIAL_CAP
            (previously ignored in favour of a hard-coded 300).
        vector_dimensions: Value sent as DIM.
        distance_metric: Value sent as DISTANCE_METRIC.
        M: Value sent as the HNSW ``M`` attribute.
        EF: Value sent as the HNSW ``EF_CONSTRUCTION`` attribute.
    """
    vector_attributes = [
        "TYPE", "FLOAT32",
        "DIM", str(vector_dimensions),
        "DISTANCE_METRIC", str(distance_metric),
        "INITIAL_CAP", str(number_of_vectors),
        "M", str(M),
        "EF_CONSTRUCTION", str(EF),
    ]
    create_command =  ["FT.CREATE", "idx", "SCHEMA","publisher","TAG","headline","TEXT","label","TAG","score","NUMERIC"]
    # The argument count after the algorithm name is derived from the list so
    # it cannot drift out of sync with the attributes that follow it.
    create_command += [vector_field_name, "VECTOR", "HNSW", str(len(vector_attributes))]
    create_command += vector_attributes

    redis_conn.execute_command(*create_command)
    

# 8. Load and Index data (HNSW Vector Index)

In [65]:
NUMBER_ARTICLES = 300
VECTOR_FIELD_NAME = 'headline_vector'
DISTANCE_METRIC = 'COSINE'
DIMENSIONS = 768

# WARNING: FLUSHALL deletes every key in every database on this Redis server.
# Only run this against a throwaway/demo instance.
redis_conn.flushall()
# Create the index first, then write the hashes.
create_hnsw_index(redis_conn,VECTOR_FIELD_NAME,NUMBER_ARTICLES,DIMENSIONS,DISTANCE_METRIC)
load_vectors(redis_conn,result_df,headline_vectors,VECTOR_FIELD_NAME)
# Report the number of rows actually written rather than a hard-coded count.
print (f'{len(result_df)} News Articles loaded and indexed')

300 News Articles loaded and indexed


# 9. A simple FT.SEARCH (without vector similarity)
## Get 5 articles published by 'GuruFocus' 

FT.SEARCH QUERY = @publisher:{GuruFocus}

In [66]:

from redis.commands.search import Search
from redis.commands.search.query import Query


# Tag filter on publisher; return four fields and only the first 5 hits.
q = (
    Query(f'@publisher:{{GuruFocus}}')
    .return_fields('headline','publisher','label','score')
    .paging(0,5)
)
docs = redis_conn.ft().search(q).docs

# One banner line with the document id, then headline and publisher.
for doc in docs:
    print(
        "********DOCUMENT: " + str(doc.id) + ' ********',
        doc.headline,
        doc.publisher,
        sep='\n',
    )


********DOCUMENT: article:0 ********
Agilent Technologies Announces Pricing of $5…… Million of Senior Notes
GuruFocus
********DOCUMENT: article:2 ********
J.P. Morgan Asset Management Announces Liquidation of Six Exchange-Traded Funds
GuruFocus
********DOCUMENT: article:3 ********
Pershing Square Capital Management, L.P. Buys Agilent Technologies Inc, The Howard Hughes Corp, ...
GuruFocus
********DOCUMENT: article:4 ********
Agilent Awards Trilogy Sciences with a Golden Ticket at LabCentral
GuruFocus
********DOCUMENT: article:5 ********
Agilent Technologies Inc (A) CEO and President Michael R. Mcmullen Sold $–.4 million of Shares
GuruFocus


# 10. A simple FT.SEARCH (only vector similarity)
## Get top 4 articles with headlines semantically similar to "downturn in european markets"

FT.SEARCH QUERY = *=>[KNN 4 @headline_vector $QUERY_BLOB]

In [67]:
#query for similarity

user_query='downturn in european markets'
# Single source of truth for K: used for both the KNN clause and the page size.
top_k = 4
# The index stores FLOAT32 vectors, so pin the query vector to the same dtype
# before serialising it; a different dtype would yield a blob of the wrong size.
e = model.encode(user_query).astype('float32')

q = (
    Query(f'*=>[KNN $K @headline_vector $BLOB]')
    .return_fields('headline','publisher','label','score')
    .sort_by('__headline_vector_score')
    .paging(0,top_k)
    .dialect(2)
)

#parameters to be passed into search
params_dict = {"K": top_k, "BLOB": e.tobytes()}
docs = redis_conn.ft().search(q,params_dict).docs

for doc in docs:
    print ("********DOCUMENT: " + str(doc.id) + ' ********')
    print(doc.headline)
    print(doc.publisher)


********DOCUMENT: article:29 ********
Agilent Technologies to Adjourn Annual Meeting Until April –7, —…—…
GuruFocus
********DOCUMENT: article:30 ********
Agilent Introduces CrossLab Connect Services for Lab-wide Asset Monitoring
GuruFocus
********DOCUMENT: article:31 ********
6 Guru Stocks Trading With Low Price-Sales Ratios
GuruFocus
********DOCUMENT: article:32 ********
Stock Upgrades: Agilent Technologies Shows Rising Relative Strength
Investor's Business Daily


# 11. A Hybrid Query FT.SEARCH (vector and non-vector search criteria)
## Get top 5 articles with 
- headlines semantically similar to "downturn in european markets"  AND
- negative sentiment (label=negative)

FT.SEARCH QUERY = (@label:{negative})=>[KNN 5 @headline_vector $QUERY_BLOB]

In [None]:
#query 
user_query='downturn in european markets'
# Single source of truth for K: used for both the KNN clause and the page size.
top_k = 5
# The index stores FLOAT32 vectors, so pin the query vector to the same dtype
# before serialising it; a different dtype would yield a blob of the wrong size.
e = model.encode(user_query).astype('float32')

#build query: tag pre-filter on label, then KNN over the remaining documents
q = (
    Query(f' (@label:{{negative}})=>[KNN $K @headline_vector $BLOB]')
    .return_fields('headline','publisher','label','score')
    .sort_by('__headline_vector_score')
    .paging(0,top_k)
    .dialect(2)
)

#parameters to be injected into query
params_dict = {"K": top_k, "BLOB": e.tobytes()}

#FT.SEARCH 
docs = redis_conn.ft().search(q,params_dict).docs

for doc in docs:
    print ("********DOCUMENT: " + str(doc.id) + ' ********')
    print(doc.headline)
    print(doc.publisher)
    print(doc.label)
    


# 12. Another Hybrid Query FT.SEARCH (vector and non-vector search criteria)
## Get top 5 articles with 
- headlines semantically similar to "downturn in european markets"  AND
- negative sentiment (label=negative) AND
- containing the word **'Agilent'** in **any text field** of the index



In [69]:
#query for similarity

user_query='downturn in european markets'
# Single source of truth for K: used for both the KNN clause and the page size.
top_k = 5
# The index stores FLOAT32 vectors, so pin the query vector to the same dtype
# before serialising it; a different dtype would yield a blob of the wrong size.
e = model.encode(user_query).astype('float32')

# Pre-filter: full-text term 'Agilent' AND label tag, then KNN on what remains.
q = (
    Query(f'(Agilent @label:{{negative}})=>[KNN $K @headline_vector $BLOB]')
    .return_fields('headline','publisher','label','score')
    .sort_by('__headline_vector_score')
    .paging(0,top_k)
    .dialect(2)
)

#parameters to be passed into search
params_dict = {"K": top_k, "BLOB": e.tobytes()}
docs = redis_conn.ft().search(q,params_dict).docs

for doc in docs:
    print ("********DOCUMENT: " + str(doc.id) + ' ********')
    print(doc.headline)
    print(doc.publisher)
    print(doc.label)
