# Scrape news and Analyse sentiments
This notebook shows an example of scraping news articles linked to specific traded companies and utilizing our predeployed sentiment analysis model server to predict the sentiment of the author towards said companies.

In [23]:
# if the nuclio-jupyter package is not installed run !pip install nuclio-jupyter
import nuclio

## Environment

In [None]:
# Pass the V3IO access key, username and API address through to the function's
# environment. NOTE(review): `-c` presumably limits the setting to the function
# configuration (not the local kernel) — confirm against the nuclio-jupyter docs.
%nuclio env -c V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}
%nuclio env -c V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env -c V3IO_API=${V3IO_API}

In [None]:
%%nuclio cmd -c
pip install beautifulsoup4
pip install pandas
pip install v3io_frames

In [None]:
%%nuclio config 
kind = "nuclio"
spec.build.baseImage = "mlrun/mlrun:0.6.5"

## Function

In [None]:
# nuclio: start-code

In [1]:
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen
import requests
import pandas as pd
import v3io_frames as v3f
from unicodedata import normalize
from datetime import datetime
import re
import os
import mlrun.feature_store as fs
import mlrun
from mlrun.datastore.targets import ParquetTarget
import json

In [45]:
def get_stock_news_page(stock_string):
    """Download and parse the investing.com news page of a single stock.

    :param stock_string: URL slug of the stock on investing.com, e.g. ``'intel-corp'``;
        it is inserted between ``/equities/`` and ``-news``.
    :returns: ``BeautifulSoup`` tree of the downloaded page (``html.parser`` backend).
    """
    # An explicit User-Agent header is sent with the request.
    request = Request('https://www.investing.com/equities/' + stock_string + '-news', headers={"User-Agent": "Mozilla/5.0"})
    # Close the HTTP response as soon as the body is read instead of leaving the
    # socket open until the object is garbage collected.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def get_internal_article_links(page, sym):
    """Collect the absolute URLs of the articles listed on a stock news page.

    :param page: parsed news page exposing ``find_all`` (a ``BeautifulSoup`` tree).
    :param sym: ticker symbol; ``"INTC"`` uses a different container lookup.
    :returns: list of article links, each prefixed with ``https://www.investing.com``.
    """
    # in webpage of INTEL, the html tags are arranged differently
    if sym == "INTC":
        css_class, position = 'js-search-ga-items articles mediumTitle1', 0
    else:
        css_class, position = 'mediumTitle1', 1
    container = page.find_all('div', attrs={'class': css_class})[position]

    links = []
    for item in container.find_all('article', attrs={'class': 'js-article-item articleItem'}):
        # The first anchor inside each article element carries the relative link.
        links.append('https://www.investing.com' + item.find('a').attrs['href'])
    return links

def get_article_page(article_link):
    """Download and parse a single article page.

    :param article_link: absolute URL of the article.
    :returns: ``BeautifulSoup`` tree of the downloaded page (``html.parser`` backend).
    """
    request = Request(article_link, headers={"User-Agent": "Mozilla/5.0"})
    # Close the HTTP response as soon as the body is read instead of leaving the
    # socket open until the object is garbage collected.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def clean_paragraph(paragraph):
    """Strip link/ticker annotations and normalise whitespace in one paragraph.

    The substitutions are applied in order:
    1. drop everything from ``(http`` up to the next whitespace,
    2. drop ``(EXCHANGE:TICKER)`` style annotations (upper-case letters only),
    3. turn every whitespace character and every apostrophe into a single space.

    :param paragraph: raw paragraph text.
    :returns: the cleaned text in Unicode NFKD normal form.
    """
    substitutions = (
        (r'\(http\S+', ''),
        (r'\([A-Z]+:[A-Z]+\)', ''),
        (r'[\n\t\s\']', ' '),
    )
    cleaned = paragraph
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return normalize('NFKD', cleaned)

def extract_text(article_page):
    """Extract the cleaned body text of an article page.

    :param article_page: parsed article page (a ``BeautifulSoup`` tree).
    :returns: the cleaned paragraphs joined with ``'\\n'``, one paragraph per line.
    """
    body = article_page.find('div', attrs={'class': 'WYSIWYG articlePage'})
    cleaned = []
    # The last <p> of the article body is deliberately left out.
    # NOTE(review): presumably a footer/disclaimer paragraph — confirm.
    for tag in body.find_all('p')[:-1]:
        cleaned.append(clean_paragraph(tag.get_text()))
    return '\n'.join(cleaned)

def get_publish_time(article):
    """Read the ``dateModified`` timestamp from an article's JSON-LD metadata.

    :param article: parsed article page; its first ``application/ld+json`` script
        tag must contain a JSON object with a ``dateModified`` entry.
    :returns: the timestamp re-formatted as ``'%Y-%m-%d %H:%M:%S'``.
    :raises ValueError: if the timestamp does not match that format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    ld_json = article.find('script',{"type" : "application/ld+json"}).contents[0]
    metadata = json.loads(str(ld_json))
    # Parsing and re-formatting validates the value and zero-pads its fields.
    modified = datetime.strptime(metadata["dateModified"], timestamp_format)
    return modified.strftime(timestamp_format)

def get_score(paragraph_scores):
    """Average the per-paragraph predictions of one article into a single score.

    Each prediction is shifted by ``-1`` before averaging.
    NOTE(review): presumably this maps class labels 0/1/2 onto -1/0/+1 — confirm
    against the serving model.

    :param paragraph_scores: decoded model response; its ``'outputs'`` entry holds
        one numeric prediction per paragraph.
    :returns: mean of ``prediction - 1`` over all predictions, or ``0.0`` when
        ``'outputs'`` is empty.
    """
    outputs = paragraph_scores['outputs']
    if not outputs:
        return 0.0
    # Divide by the number of predictions. len(paragraph_scores) would be the
    # number of keys in the response dict, which is unrelated to the article.
    return sum(score - 1 for score in outputs) / len(outputs)

def get_article_scores(context, articles, endpoint):
    """Score every article with the sentiment model served at ``endpoint``.

    :param context: object whose ``logger.info`` is used for progress messages.
    :param articles: article texts; each is split on ``'\\n'`` into model inputs.
    :param endpoint: base URL of the model server; ``/v2/models/model1/predict``
        is appended to it.
    :returns: list with one ``get_score`` result per article, in input order.
    """
    predict_url = endpoint + "/v2/models/model1/predict"
    scores = []
    for i, article in enumerate(articles):
        context.logger.info(f'getting score for article {i + 1}\\{len(articles)}')
        payload = json.dumps({'inputs': article.split('\n')})
        # NOTE(review): `payload` is already a JSON string and `json=` serialises
        # it again, so the request body is a JSON-encoded string. Presumably the
        # serving function expects that — confirm before changing.
        response = requests.put(predict_url, json=payload)
        scores.append(get_score(json.loads(response.text)))
    return scores

def construct_dataframe(sentiments, items, times):
    """Build a per-symbol sentiment table.

    :param sentiments: sentiment values, one per item.
    :param items: sequence of indexable items whose first element is the ticker symbol.
    :param times: time values, one per item.
    :returns: ``DataFrame`` with columns ``symbol``, ``sentiment`` and ``last_reaction``.
    """
    columns = {
        "symbol": [item[0] for item in items],
        "sentiment": sentiments,
        "last_reaction": times,
    }
    return pd.DataFrame(columns)

In [46]:
def init_context(context):
    """Attach to ``context`` everything ``handler`` needs.

    Sets the MLRun project, declares the ``news`` feature set, creates the V3IO
    frames client together with the stream and TSDB tables, and stores the
    symbol-to-URL mapping, the KV table path and the sentiment model endpoint.

    Environment variables read (with their fallbacks): ``PROJECT_NAME``,
    ``V3IO_USERNAME``, ``V3IO_FRAMESD``, ``TOKEN``, ``V3IO_CONTAINER``,
    ``STOCKS_STREAM``, ``STOCKS_TSDB_TABLE``, ``SENTIMENT_MODEL_ENDPOINT`` and
    ``STOCKS_KV``. ``V3IO_USERNAME`` must be set because the fallbacks are built
    from it.

    :param context: function context; it is mutated in place.
    """
    context.logger.info("init news reader context")
    username = os.getenv('V3IO_USERNAME')

    context.PROJECT_NAME = os.getenv('PROJECT_NAME', 'stocks-' + username)
    mlrun.set_environment(project=context.PROJECT_NAME)

    # Declaring feature set
    news_feature_set = fs.FeatureSet("news", entities=[fs.Entity("symbol")], timestamp_key="Datetime")
    parquet_target = ParquetTarget(name="news", partitioned=True, time_partitioning_granularity="day")
    news_feature_set.set_targets(targets=[parquet_target], with_defaults=False)
    context.stock_feature_set = news_feature_set

    # saving timestamps to know what to ingest
    context.last_trade_times = {}

    context.v3c = v3f.Client(
        os.getenv('V3IO_FRAMESD', 'framesd:8081'),
        container=os.getenv('V3IO_CONTAINER', 'users'),
        token=os.getenv('TOKEN', ''),
    )

    # Create V3IO Tables and add reference to context
    # NOTE(review): if_exists=1 presumably means "ignore if the table already
    # exists" — confirm against the v3io_frames API.
    context.stocks_stream = os.getenv('STOCKS_STREAM', username + '/stocks/stocks_stream')
    context.v3c.create(backend='stream', table=context.stocks_stream, if_exists=1)

    context.stocks_tsdb = os.getenv('STOCKS_TSDB_TABLE', username + '/stocks/stocks_tsdb')
    context.v3c.create(backend='tsdb', table=context.stocks_tsdb, rate='1/s', if_exists=1)

    # in the '' should be the sentiment-analysis model endpoint
    context.sentiment_model_endpoint = os.getenv('SENTIMENT_MODEL_ENDPOINT', '')
    context.logger.info(f"set sentiment_model_endpoint {context.sentiment_model_endpoint}")

    # Ticker symbol -> investing.com URL slug of the stock.
    context.sym_to_url = {
        'GOOGL': 'google-inc',
        'MSFT': 'microsoft-corp',
        'AMZN': 'amazon-com-inc',
        'AAPL': 'apple-computer-inc',
        'INTC': 'intel-corp',
    }
    context.stocks_kv = os.getenv('STOCKS_KV', username + '/stocks/stocks_kv')
    context.logger.info('end init context')

In [57]:
def handler(context, event):
    '''
    Scrape investing.com news for every tracked symbol, score it and publish the results.

    For each symbol in ``context.sym_to_url`` the handler:
      * downloads the news page and the linked article pages,
      * scores each article with the sentiment model endpoint,
      * puts one record per article on the stocks stream,
      * updates the symbol's row in the KV table with the sentiment and
        publish time of the last article in the list.

    Articles published after the per-symbol watermark kept in
    ``context.last_trade_times`` are ingested into the feature set, and all
    scraped sentiments are written to the TSDB table.

    :param context: function context prepared by ``init_context``.
    :param event: triggering event; not used.
    :returns: the string ``"Done"``.
    '''
    context.logger.info(f'Getting news about {context.sym_to_url}')
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    syms = []
    times = []
    sentiments = []
    new_records = []
    for sym, url_string in list(context.sym_to_url.items()):
        context.logger.info(f'Getting news about {sym}')
        news_page = get_stock_news_page(url_string)
        article_links = get_internal_article_links(news_page,sym)
        article_pages = [get_article_page(link) for link in article_links]
        articles = [extract_text(article_page) for article_page in article_pages]
        curr_sentiments = get_article_scores(context, articles, context.sentiment_model_endpoint)
        curr_times = [get_publish_time(article_page) for article_page in article_pages]
        if not curr_sentiments:
            # Nothing was scraped for this symbol: there is nothing to publish,
            # and the KV row must not be overwritten with another symbol's values.
            context.logger.info(f'No articles found for {sym}')
            continue
        sentiments += curr_sentiments
        times += curr_times

        # Watermark: only articles newer than this are ingested to the feature set.
        last = context.last_trade_times.get(sym) or datetime(1990,1,1)
        newest = last
        for article, link, sentiment, published in zip(articles, article_links, curr_sentiments, curr_times):
            record = {
                'content': article,
                'time': published,
                'symbol': sym,
                'link': link,
                'sentiment': sentiment
            }
            context.v3c.execute('stream', context.stocks_stream, 'put', args={'data': json.dumps(record)})
            published_at = datetime.strptime(published, timestamp_format)
            if published_at > last:
                # Same keys in the same order as `record`, with a parsed timestamp.
                new_records.append({**record, 'time': published_at})
                newest = max(newest, published_at)
            syms.append(sym)

        # Advance the watermark to the newest article seen, regardless of the
        # order in which the articles were listed, so that already ingested
        # articles are not ingested again on the next invocation.
        if newest > last:
            context.last_trade_times[sym] = newest

        context.v3c.execute('kv', context.stocks_kv, command='update', args={'key': sym,
                                                                         'expression': f"SET sentiment='{curr_sentiments[-1]}';last_reaction='{curr_times[-1]}'"})
    if new_records:
        all_records = pd.DataFrame(new_records).rename(
            columns={'content': 'Content', 'time': 'Datetime', 'link': 'Link', 'sentiment': 'Sentiment'})
        # Localizing the datetime
        all_records["Datetime"] = all_records["Datetime"].dt.tz_localize('UTC')
        # writing to featureset
        context.logger.info(f"Writing new dataframe with shape {all_records.shape} to feature store")
        fs.ingest(context.stock_feature_set, all_records, infer_options=fs.InferOptions.default(),
                 overwrite=False)
    else:
        context.logger.info("No new data to ingest")

    if len(sentiments) > 0:
        df = pd.DataFrame.from_dict({'sentiment': sentiments,
                                     'time': times,
                                     'symbol': syms})
        df = df.set_index(['time', 'symbol'])
        # Convert the time level of the index from strings to datetimes.
        df.index = df.index.set_levels([pd.to_datetime(df.index.levels[0]), df.index.levels[1]])
        df = df.sort_index(level=0, axis=0)
        context.v3c.write(backend='tsdb', table=context.stocks_tsdb, dfs=df)
    return "Done"

In [58]:
# nuclio: end-code

## Test locally

In [None]:
# Populate `context` with the clients, table paths and settings that `handler` reads.
# NOTE(review): `context` is not defined in this notebook; presumably it is
# provided by nuclio-jupyter after `import nuclio` — confirm.
init_context(context)

In [60]:
# Build an empty nuclio event to pass to `handler` in the local test.
from nuclio import Event
event = Event()

In [None]:
# Run one scrape-and-score pass locally; `s` receives the handler's return value ("Done").
s = handler(context, event)

## Deploy to cluster

Please run the cells below in order to get the `SENTIMENT_MODEL_ENDPOINT` endpoint.

In [None]:
from mlrun import code_to_function,auto_mount
import os
import nuclio

# Per-user values that the deployment settings below are built from.
username = os.getenv('V3IO_USERNAME')

# Export bare function
fn = code_to_function('news-reader', handler='handler')
fn.export('02-read-news.yaml')

# Set parameters for current deployment
fn.add_trigger('cron', nuclio.triggers.CronTrigger('10s'))
deployment_env = {
    'V3IO_CONTAINER': 'users',
    'STOCKS_STREAM': username + '/stocks/stocks_stream',
    'STOCKS_TSDB_TABLE': username + '/stocks/stocks_tsdb',
    # make sure you insert the right endpoint when running test
    'SENTIMENT_MODEL_ENDPOINT': '',
    'PROJECT_NAME': "stocks-" + username,
}
fn.set_envs(deployment_env)
fn.spec.max_replicas = 1

In [None]:
# Deploy the function into the per-user "stocks-<V3IO_USERNAME>" project.
# NOTE(review): `addr` is presumably the deployed function's HTTP address — confirm.
addr = fn.deploy(project= "stocks-" + os.getenv('V3IO_USERNAME'))

In [None]:
# Send one HTTP request to the address returned by the deployment.
!curl {addr}

### If you run this notebook without running the entire project (i.e. the sentiment analysis serving function isn't deployed), run the cells below to deploy the sentiment analysis serving function, then copy the URL returned by the deployment into `SENTIMENT_MODEL_ENDPOINT` in the `set_envs` call above

In [None]:
# Deploying sentiment analysis serving function
import mlrun
fn = mlrun.import_function(url = "hub://sentiment_analysis_serving")
fn.apply(mlrun.auto_mount())
# make sure you have the model - if not - download it from the project notebook
fn.add_model(model_path="/User/test/demos/stock-analysis/models/model.pt",key="model1",class_name="SentimentClassifierServing")
addr = fn.deploy(project="stocks-" + os.getenv('V3IO_USERNAME'))

In [126]:
# Send one HTTP request to the address currently stored in `addr`.
!curl {addr}