## Importing the necessary packages

In [557]:
import spacy
nlp2 = spacy.load("en_core_web_lg")
import requests
from bs4 import BeautifulSoup
import itertools
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import nltk
nltk.download('vader_lexicon')

# MAIN

# Text Analysis

## Objective

Using Python programming language and any publicly available libraries analyze articles in Yahoo!Finance area (https://finance.yahoo.com/). The code should be able to process any article from available current set

#### Assumption:
 - For simplicity extracting only the main page list of articles, with no scrolling down the page 

In [None]:
url = 'https://finance.yahoo.com'

# Extracting articles from yahoo finance by using the function 'get_article_content(url)'
new_content = get_article_content(url)

# 1. Named Entity Extraction

Extract all available entities (person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc). Build Key/Value pairs for numeric values (quantities, monetary values etc) - for example: revenue =  $123,456. Build weight of each "key" based on frequency of appearance in currently analyzed articles.

### 1.1. Displaying extracted articles with founded entities

In [213]:
display_articles_and_entities(new_content)

TITLE: Amazon Tells Echelon Fitness to Stop Selling $500 Prime Bike
-------------------------------------------------------------------------------------------------------

CONTENT:
-------------------------------------------------------------------------------------------------------





TITLE: 3 “Strong Buy” Stocks That Are Flirting With a Bottom
-------------------------------------------------------------------------------------------------------

CONTENT:
-------------------------------------------------------------------------------------------------------





TITLE: Tesla’s Battery Day Letdown Risks $320 Billion Stock Gain
-------------------------------------------------------------------------------------------------------

CONTENT:
-------------------------------------------------------------------------------------------------------





TITLE: Tesla rival? Bill Gates-backed electric vehicle battery startup comes into limelight
-------------------------------------------------------------------------------------------------------

CONTENT:
-------------------------------------------------------------------------------------------------------







### 1.2. Displaying some entity statistics:

In [212]:
# Extracting all named entiites and their statistics by using the functions 'entities_and_statistics(contents)''
all_entities, entity_type_frequency, entity_frequency, entity_occurence_percent = entities_and_statistics(new_content)

# Displaying the results with the function 'display_entity_statistics(p1, p2, p3, p4)'
display_entity_statistics(all_entities, entity_type_frequency, entity_frequency, entity_occurence_percent)

ENTITY STATISTICS:


Total entities: 402
-------------------------------------------------------------------------------------------------------
Total unique entites: 246
-------------------------------------------------------------------------------------------------------


Top 10 entity occurrences:
-------------------------------------------------------------------------------------------------------
Tesla : 27
Musk : 14
QURE : 11
Amazon : 10
TCMD : 9
Progenity : 8
Echelon : 6
Mah : 5
Furlong : 5
Schwartz : 5


Top 10 entity occurrences in percentage:
-------------------------------------------------------------------------------------------------------
Tesla : 7.000000000000001%
Musk : 3.0%
QURE : 3.0%
Amazon : 2.0%
TCMD : 2.0%
Progenity : 2.0%
Echelon : 1.0%
Mah : 1.0%
Furlong : 1.0%
Schwartz : 1.0%


Entity type frequency:
-------------------------------------------------------------------------------------------------------
ORG (Companies, agencies, institutions, etc.) : 159
DA

### 1.3. Displaying w Key/Value pairs statistics:

In [389]:
key_value_pairs, key_weights_percent, key_weights, total_keys = generic_overview_num_values(new_content)
display_key_value_statistics(key_value_pairs, key_weights_percent, key_weights, total_keys)

NUMERIC KEY/VALUE STATISTICS:


Total keys/values: 113
-------------------------------------------------------------------------------------------------------
Total unique keys: 77
-------------------------------------------------------------------------------------------------------


Top 10 key occurrences:
-------------------------------------------------------------------------------------------------------
year : 11
million : 8
billion : 5
target : 4
kWh : 3
bike : 2
© : 2
Q1 : 2
volumes : 2
Buys : 2


Top 10 key occurrences in percentage:
-------------------------------------------------------------------------------------------------------
year : 10.0%
million : 7.000000000000001%
billion : 4.0%
target : 4.0%
kWh : 3.0%
bike : 2.0%
© : 2.0%
Q1 : 2.0%
volumes : 2.0%
Buys : 2.0%




### 1.4. Displaying Sentence Dependency for choosen key word == 'target'

In [413]:
# Choosed key word from top 10 key occurrences
key_word = 'target'

sentences = key_word_sents(new_content, key_word)
display_sentence_dependency(sentences)

Sentence:
-------------------------------------------------------------------------------------------------------
To this end, Mah rates PROG an Overweight (i.e. Buy) along with a $17 price target.

Sentence Dependency:
-------------------------------------------------------------------------------------------------------





Sentence:
-------------------------------------------------------------------------------------------------------
Given the $13.33 average price target, shares could climb 60% higher in the next year.

Sentence Dependency:
-------------------------------------------------------------------------------------------------------





Sentence:
-------------------------------------------------------------------------------------------------------
All of this prompted Furlong to keep a Buy rating and $62 price target on the stock.

Sentence Dependency:
-------------------------------------------------------------------------------------------------------





Sentence:
-------------------------------------------------------------------------------------------------------
The $62.33 average price target brings the upside potential to 91%.

Sentence Dependency:
-------------------------------------------------------------------------------------------------------





Sentence:
-------------------------------------------------------------------------------------------------------
Along with the call, he attached a $67 price target, suggesting 68% upside potential from current levels.

Sentence Dependency:
-------------------------------------------------------------------------------------------------------





Sentence:
-------------------------------------------------------------------------------------------------------
In addition, the $69.89 average price target indicates 75% upside potential.

Sentence Dependency:
-------------------------------------------------------------------------------------------------------







# 2. Relation Extraction

Identify relation between entities (Affiliation, Location, Part of, Social). Identify if entity is object or subject (Subject does the action. Object is the center of action.)

### 2.1. Displaying relation extraction

In [505]:
relation_entities(new_content)

Entity:      Bloomberg
Relation:    Companies, agencies, institutions, etc.
Dependency:  None
POS:         proper noun
-------------------------------------------------------------------------------------------------------

Entity:      Amazon.com Inc.
Relation:    Companies, agencies, institutions, etc.
Dependency:  compound
POS:         proper noun
-------------------------------------------------------------------------------------------------------

Entity:      Echelon Fitness
Relation:    Companies, agencies, institutions, etc.
Dependency:  compound
POS:         proper noun
-------------------------------------------------------------------------------------------------------

Entity:      Echelon
Relation:    Companies, agencies, institutions, etc.
Dependency:  nominal subject
POS:         proper noun
-------------------------------------------------------------------------------------------------------

Entity:      EX
Relation:    Companies, agencies, institutions, etc.
Depend

Entity:      Tesla Inc.’s
Relation:    Companies, agencies, institutions, etc.
Dependency:  compound
POS:         proper noun
-------------------------------------------------------------------------------------------------------

Entity:      Battery Day
Relation:    Titles of books, songs, etc.
Dependency:  compound
POS:         proper noun
-------------------------------------------------------------------------------------------------------

Entity:      $320 billion
Relation:    Monetary values, including unit
Dependency:  modifier of quantifier
POS:         symbol
-------------------------------------------------------------------------------------------------------

Entity:      this year
Relation:    Absolute or relative dates or periods
Dependency:  determiner
POS:         determiner
-------------------------------------------------------------------------------------------------------

Entity:      Elon Musk
Relation:    People, including fictional
Dependency:  compound
POS: 

# 3. Events Identification
Based on the article content identify if it cover specific event or not.

#### Assumption :
    - Assuming event to be, in the given article any of it's sentences there are DATE and EVENT entities!

### 3.1 Displaying even identification.

In [584]:
event_identification(new_content)

Event Title:

Tesla rival? Bill Gates-backed electric vehicle battery startup comes into limelight
-------------------------------------------------------------------------------------------------------


Event sentence:


-------------------------------------------------------------------------------------------------------


Event content:



# 4. Sentiment Analysis
Analyze each paragraph of the article and provide sentiment score for it based on Financial context of it.  Use "key" weight from "Named Entity Extraction" to provide the final article score.

#### Assumption :
    - Using NLTK's 'VADER' sentiment analyser. Vader is trained on news data using lexicon-based approach and rates them as positive or negative.

### 4.1. Displaying sentiment analysis:

In [619]:
sentiment_analysis(new_content)

Title:
Amazon Tells Echelon Fitness to Stop Selling $500 Prime Bike
-------------------------------------------------------------------------------------------------------

Sentence:
---------
Amazon.com Inc. told Echelon Fitness to stop selling an exercise bike that was promoted and branded as a product developed in partnership with the e-commerce giant.


VADER Sentiment Scores:
-----------------------
neg: 0.076
neu: 0.756
pos: 0.168
compound: 0.4019



Sentence:
---------
Echelon announced the EX-Prime Smart Connect Bike earlier this week and said it was developed “in collaboration with Amazon.”

VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.87
pos: 0.13
compound: 0.4019



Sentence:
---------
The machine was listed on Amazon’s website for $500, a steep discount to machines offered by Peloton Interactive Inc.


VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 1.0
pos: 0.0
compound: 0.0



Sentence:
---------
“This bike is not an Amazon product or rela

Sentence:
---------
To sum it all up, Mah said, “We believe Progenity shares are undervalued given the robust recovery in the core testing business and multiple upcoming growth catalysts.”


VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.769
pos: 0.231
compound: 0.7351



Sentence:
---------
To this end, Mah rates PROG an Overweight (i.e. Buy) along with a $17 price target.

VADER Sentiment Scores:
-----------------------
neg: 0.152
neu: 0.848
pos: 0.0
compound: -0.3612



Sentence:
---------
Should his thesis play out, a twelve-month gain of 105% could potentially be in the cards.

VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.691
pos: 0.309
compound: 0.7003



Sentence:
---------
(To watch Mah’s track record, click here)


VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 1.0
pos: 0.0
compound: 0.0



Sentence:
---------
Are other analysts in agreement?

VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.556
pos: 0.444
compo

Sentence:
---------
Best Stocks to Buy, a newly launched tool that unites all of TipRanks’ equity insights.


VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.678
pos: 0.322
compound: 0.6908



Sentence:
---------
The opinions expressed in this article are solely those of the featured analysts.

VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 1.0
pos: 0.0
compound: 0.0



Sentence:
---------
The content is intended to be used for informational purposes only.

VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 1.0
pos: 0.0
compound: 0.0



Sentence:
---------
It is very important to do your own analysis before making any investment.

VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.851
pos: 0.149
compound: 0.2716



Title:
Tesla’s Battery Day Letdown Risks $320 Billion Stock Gain
-------------------------------------------------------------------------------------------------------

Sentence:
---------
Tesla Inc.’s highly anticipate

Sentence:
---------
“I think this is going to be a bit harder than what they think, and I don’t think we’ll see a lot of volume out of that for quite some time.”


VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 1.0
pos: 0.0
compound: 0.0



Sentence:
---------
most important and long-standing partner on batteries is Osaka-based Panasonic, but it also has smaller-scale agreements with Contemporary Amperex Technology Co., or CATL, in China’s Fujian province and South Korea’s LG Chem.


VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.877
pos: 0.123
compound: 0.4933



Sentence:
---------
Read more: LG Chem, Panasonic Slide as Tesla Looks to Lower Battery Costs


VADER Sentiment Scores:
-----------------------
neg: 0.155
neu: 0.845
pos: 0.0
compound: -0.296



Sentence:
---------
The highly technical Battery Day presentation included several nuggets of news that were overshadowed by the talk of cathodes and electrolytes.

VADER Sentiment Scores:
-------------

Sentence:
---------
Ferrari's Most Exclusive Car Up for Auction

VADER Sentiment Scores:
-----------------------
neg: 0.0
neu: 0.77
pos: 0.23
compound: 0.2006





## Functions used in the Text Analysis

In [583]:
def get_article_content(url):
    
    """
    
    The Functions gets the Finance Yahoo url as a parameter and returns articles from the list of
    first page articles!
    
    Note: For simplicity getting only on the first page with no scrolling down the page!
    
    """

    returned_html = requests.get(url).text

    soup = BeautifulSoup(returned_html, 'html.parser')

    article_content = {}

    # After analyzing yahoo website, founded that articles are in <h3> tag with class = 'Mb(5px)'
    for each in soup.findAll("h3", {"class": "Mb(5px)"}):

        # Title of the article
        title = each.text

        # Link of the article
        href = 'https://finance.yahoo.com'+each.find('a')['href']

        # Requesting the article page
        returned_href = requests.get(href).text

        # Parsing the page
        soup_content_parser = BeautifulSoup(returned_href, 'html.parser')

        # After analyzing the page, founded that article content is in div and class 'caas-body' in paragraphs
        soup_content = soup_content_parser.find("div", {"class": "caas-body"})

        # Creating empty list to append each paragraph
        list_of_paras = []

        # Looping over each paragraph to extract the text
        for paragraph in soup_content.findAll("p"):
            list_of_paras.append(paragraph.text)

        # Concatinating the paragraphs
        whole_article = "\n".join(list_of_paras)

        # Appending to the dictionary
        article_content[title] = whole_article
   
    return article_content



def display_articles_and_entities(contents):
    for title, content in contents.items():
        print(f"TITLE: {title}")
        print("-------------------------------------------------------------------------------------------------------")
        print()
        print(f"CONTENT:")
        print("-------------------------------------------------------------------------------------------------------")
        spacy.displacy.render(nlp(content), jupyter=True, style='ent')
        print()
        print()
        print()
    




def entities_and_statistics(new_content):
    """
    The function gets contens in a dictionary type and makes entity statistics
    
    """
    
    list_entities = []
    count_entity_types = {}
    count_entities = {}
    entity_occurence_percent = {}

    for title, content in new_content.items():

        #  Tokenizing the string content
        doc = nlp(content)

        # Assigning all the founded entities in the current content
        all_entites = doc.ents

        # Iterating over each entity to extract make some statistics
        for entity in all_entites:
            list_entities.append(entity.text)

            if entity.label_+" ("+spacy.explain(entity.label_)+")" in count_entity_types:
                count_entity_types[entity.label_+" ("+spacy.explain(entity.label_)+")"] += 1
            else:
                count_entity_types[entity.label_+" ("+spacy.explain(entity.label_)+")"] = 1

            if entity.text in count_entities:
                count_entities[entity.text] += 1
            else:
                count_entities[entity.text] = 1

    # Sorting into reverse order 
    count_entity_types = {k: v for k, v in sorted(count_entity_types.items(), key=lambda item: item[1], reverse = True)}
    count_entities = {k: v for k, v in sorted(count_entities.items(), key=lambda item: item[1], reverse = True)}
    
    # Weight of each entity based on frequency of appearance 
    entity_occurence_percent = {k: f"{round(v/len(list_entities), 2) * 100}%" for k,v in count_entities.items()}
    
    return list_entities, count_entity_types, count_entities, entity_occurence_percent




def display_entity_statistics(all_entities, entity_type_frequency, entity_frequency, entity_occurence_percent):
    print("ENTITY STATISTICS:")
    print()
    print()
    print(f"Total entities: {len(list_entities)}")
    print("-------------------------------------------------------------------------------------------------------")
    print(f"Total unique entites: {len(set(list_entities))}")
    print("-------------------------------------------------------------------------------------------------------")
    print()
    print()
    print(f"Top 10 entity occurrences:")
    print("-------------------------------------------------------------------------------------------------------")
    for key, value in dict(itertools.islice(entity_frequency.items(), 10)).items():
        print(key, ":", value)
    print()
    print()
    print(f"Top 10 entity occurrences in percentage:")
    print("-------------------------------------------------------------------------------------------------------")
    for key, value in dict(itertools.islice(entity_occurence_percent.items(), 10)).items():
        print(key, ":", value)
    print()
    print()
    print(f"Entity type frequency:")
    print("-------------------------------------------------------------------------------------------------------")
    for key, value in entity_type_frequency.items():
        print(key, ":", value)
        
        
def generic_overview_num_values(contents):
    
    """
    This function gets contents as parameter and returns a dictionary for numeric values.
    
    Explation of key/value findings:
    -----------------------------
    value - by using 'Spacy' library, first we tokenize the text then with 'token.like_num' boolean,
    finding the words which are numeric. Then getting its left subtree first value and right subtree first value and concatenating into one string
    
    key - after finding numeric token getting its head token by token.head argument
    
    Example:
    -------
    {
    'target': '$ 67',
    'market': '~2.3 billion',
    }
    
    
    """
    
    total_keys = 0
    
    key_weights = {}

    key_value_pairs = dict()
    

    for title, content in contents.items():
        
        nlp_content = nlp(content)

        for sent in nlp_content.sents:
            
            # Removing stop words and punctuations
            sent_edited = nlp(" ".join([token.text for token in sent if not token.is_stop and not token.is_punct]))
            
            
            for token in sent_edited:

                # Finding numeric value
                if token.like_num:
                    
                    # Finding its left and right tree values
                    if len(list(token.lefts))>0:
                        left_tree_value  = list(token.lefts)[0].text
                    else:
                        left_tree_value = ""
                    if len(list(token.rights)) > 0:
                        right_tree_value  = list(token.rights)[0].text
                    else:
                        right_tree_value = ""
                    
                    # Concatinating
                    value = left_tree_value + " " + token.text + " "+ right_tree_value

                    # Finding Numeric values head
                    head_token = token.head.text
                    
                    # Adding to the dictionary
                    key_value_pairs[head_token] = value
                    
                    # Increasing number of total_keys founded
                    total_keys += 1
                    
                    if head_token in key_weights:
                        key_weights[head_token] += 1
                    else:
                        key_weights[head_token] = 1
                        
                        
    # Sorting into reverse order 
    key_weights = {k: v for k, v in sorted(key_weights.items(), key=lambda item: item[1], reverse = True)}
    

    # Weight of each key based on frequency of appearance 
    key_weights_percent = {k: f"{round(v/total_keys, 2) * 100}%" for k,v in key_weights.items()}
    
    
    return key_value_pairs, key_weights_percent, key_weights, total_keys


def display_key_value_statistics(key_value_pairs, key_weights_percent, key_weights, total_keys):
    print("NUMERIC KEY/VALUE STATISTICS:")
    print()
    print()
    print(f"Total keys/values: {total_keys}")
    print("-------------------------------------------------------------------------------------------------------")
    print(f"Total unique keys: {len(key_weights_percent)}")
    print("-------------------------------------------------------------------------------------------------------")
    print()
    print()
    print(f"Top 10 key occurrences:")
    print("-------------------------------------------------------------------------------------------------------")
    for key, value in dict(itertools.islice(key_weights.items(), 10)).items():
        print(key, ":", value)
    print()
    print()
    print(f"Top 10 key occurrences in percentage:")
    print("-------------------------------------------------------------------------------------------------------")
    for key, value in dict(itertools.islice(key_weights_percent.items(), 10)).items():
        print(key, ":", value)
    print()
    print()
    
    
    
def key_word_sents(contents, key_word):
    
    """
    
    The Function gets the contents and a key word for numeric value
    and returns its sentences where this key word appears in the contents
    
    """
    
    key_word_sentences = []
    
    for title, content in contents.items():
        
        nlp_content = nlp(content)

        for sent in nlp_content.sents:
            
            for token in sent:
                
                # Finding numeric value
                if token.like_num:
                    
                    if token.head.text == key_word:
                        key_word_sentences.append(sent)
                        
                        
    return key_word_sentences

def display_sentence_dependency(sentences):
    
    for sent in sentences:
        print("Sentence:")
        print("-------------------------------------------------------------------------------------------------------")
        print(sent)
        print()
        print("Sentence Dependency:")
        print("-------------------------------------------------------------------------------------------------------")
        spacy.displacy.render(sent, style='dep', jupyter = True, options = {'distance': 100})
        print()
        print()
        print()
        
        
def relation_entities(contens):
    
    """
    This function gets contens and displays each unique entity's:
        - Relation(label)
        - Dependency(Syntatic dependency)
        - POS(Part-Of-Speech)
    
    """
    seen = set()
    
    # For each content
    for title, content in contens.items():
        
        # for each entity in the content
        for entity in nlp(content).ents:
            
            # Skipping the entity if alreade has been displayed
            if entity.text not in seen:
                print(f"{'Entity:':<12} {entity.text}")
                print(f"{'Relation:':<12} {spacy.explain(entity.label_)}")
                for token in entity:
                    print(f"{'Dependency:':<12} {spacy.explain(token.dep_)}")
                    print(f"{'POS:':<12} {spacy.explain(token.pos_)}")
                    break
                print("-------------------------------------------------------------------------------------------------------")
                print()
                seen.add(entity.text)
                
                
                

                

def event_identification(contents):

    """
    This function iterates over each article and checks,
    whether any of the article covers any specific event or not.
    If there are prints them their title, content, and the sentence where event was found.
    If not prints, that it could not find any event article
    
    """


    event_article = dict()

    # Iterating over each title and content
    for title, content in contents.items():

        nlp_content = nlp2(content)

        # Iterating over each sentence of the given article
        for sent in nlp_content.sents:

            # Assigning each entity label in a list, so we can check whether there are 'DATE' and 'EVENT' entityt
            each_sent_ents = [ent.label_ for ent in sent.ents]

            # Checking for event. If thera assigning its title content and the sentence where 'DATE' and 'EVENT' founded!
            if "DATE" in each_sent_ents and "EVENT" in each_sent_ents:
                event_article[title] = {'content' : nlp_content, 'event sentence': sent}
                break

    # If there are even articles printing them out, else printing no event article found!
    if len(event_article) > 0:
        for title, content in event_article.items():
            print("Event Title:")
            print()
            print(title)
            print("-------------------------------------------------------------------------------------------------------")
            print()
            print()
            print("Event sentence:")
            spacy.displacy.render(content['event sentence'], jupyter=True, style='ent')
            print("-------------------------------------------------------------------------------------------------------")
            print()
            print()
            print("Event content:")
            print()
            spacy.displacy.render(content['content'], jupyter=True, style='ent')

    else:
        print("No Event were found on the Articles!")
        
        
        
def sentiment_analysis(contents):
    
    """
    This function gets each articles paragraphs and prints 
    it's sentiment score using NLTK's VADER sentiment analyser

    """

    for title, content in contents.items():

        print("Title:")
        print(title)
        print("-------------------------------------------------------------------------------------------------------")
        print()

        nlp_content = nlp2(content)
        for sent in nlp_content.sents:
            if len(str(sent).split(" ")) > 2:
                scores = SIA().polarity_scores(str(sent))
                print("Sentence:")
                print("---------")
                print(sent)
                print()
                print("VADER Sentiment Scores:")
                print("-----------------------")
                for str_score, num_score in scores.items():
                    print(f"{str_score}: {num_score}")
                print()
                print()
                print()