# Dependencies

In [1]:
#from transformers import BertTokenizer, BertModel
import pandas as pd
pd.options.display.max_colwidth = 100
import numpy as np
#from scipy.spatial.distance import cosine
#import nltk
#import torch

import spacy
import spacy_transformers
from spacy import displacy
from collections import Counter
import en_core_web_trf
nlp = en_core_web_trf.load()


In [2]:
###Others
from ipynb.fs.full.SQL_wikidata import find_Qid
from ipynb.fs.full.News_analysys/News_Scraper import scrape_ticker_fv
from ipynb.fs.full.Knowledge_BERT_single_words import contextual_entity



Today is  2021-10-20


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Functions

In [3]:
def find_entities(s, verbose=0):
    """Run the spaCy pipeline on ``s`` and return the entities worth annotating.

    Parameters
    ----------
    s : str
        Text to analyse.
    verbose : int
        When equal to 1, the recognised entities are rendered inline with
        displaCy before filtering.

    Returns
    -------
    list of dict
        One dict per kept entity, in document order, with keys ``text``,
        ``lemma``, ``label``, ``s_char`` and ``e_char`` (character offsets of
        the entity inside ``s``).
    """
    doc = nlp(s)

    if verbose == 1:
        displacy.render(doc, jupyter=True, style='ent')

    ###Entity types that are never annotated
    skipped_labels = ('TIME', 'DATE', 'CARDINAL', 'ORDINAL', 'POSITIONAL', 'GPE', 'PERCENT', 'MONEY')

    kept = []
    for span in doc.ents:
        if span.label_ in skipped_labels:
            continue
        kept.append({
            'text': span.text,
            'lemma': span.lemma_,
            'label': span.label_,
            's_char': span.start_char,
            'e_char': span.end_char,
        })

    return kept

In [4]:
def sentence_preannotate(sentence, entities, company_info, verbose = 0):
    """Attach the fixed company description to the first company mention.

    The first entity whose text matches one of ``company_info['aliases']``
    (case-insensitively) gets ``" (<description>) "`` inserted right after it,
    and that entity is left out of the returned entity list.

    Parameters
    ----------
    sentence : str
        Text the entity offsets refer to.
    entities : list of dict
        Entities with at least the keys ``text`` and ``e_char``. The list is
        not modified.
    company_info : dict
        Must provide ``aliases`` (iterable of str) and ``description`` (str).
    verbose : int
        Accepted for signature compatibility; not used here.

    Returns
    -------
    tuple
        ``(sentence, entities, b_keep, company_des_len)`` where ``b_keep`` is
        the character offset at which the description was inserted (-1 when
        no alias matched) and ``company_des_len`` is the number of characters
        inserted (0 when no alias matched).
    """
    ###The alias set does not depend on the entity, so build it once.
    aliases = {alias.lower() for alias in company_info['aliases']}

    ###Use fixed description for company name
    for i, ent in enumerate(entities):
        if ent['text'].lower() in aliases:
            best_desc = " (" + company_info['description'] + ") "
            b_keep = ent['e_char']
            sentence = sentence[:b_keep] + best_desc + sentence[b_keep:]

            # Build a new list instead of popping from the caller's list.
            remaining = [other for j, other in enumerate(entities) if j != i]

            return sentence, remaining, b_keep, len(best_desc)

    return sentence, list(entities), -1, 0
    

In [5]:
def sentence_annotate(sentence, company_info, cutoff = 0.70, verbose = 0, kill_doubles = True):
    """Return ``sentence`` with parenthesised descriptions inserted after its entities.

    The entities are found with ``find_entities``; the first company mention is
    annotated with the fixed company description by ``sentence_preannotate``.
    Every other entity is looked up with ``contextual_entity`` and annotated
    with its best-scoring snippet when that score exceeds ``cutoff``.

    Parameters
    ----------
    sentence : str
        Text to annotate.
    company_info : dict
        Must provide ``aliases`` (iterable of str) and ``description`` (str).
    cutoff : float
        An annotation is inserted only when the best score is strictly
        greater than this value.
    verbose : int
        Forwarded to ``find_entities``, ``sentence_preannotate`` and
        ``contextual_entity``.
    kill_doubles : bool
        When true, an entity lemma that was already annotated in this
        sentence is not annotated again.

    Returns
    -------
    str
        The annotated sentence.

    Notes
    -----
    ``contextual_entity`` is expected to return a table exposing ``.iloc`` and
    the columns ``snippet`` and ``distance``, where a larger ``distance``
    means a better match. Any exception raised while handling one entity is
    printed and that entity is skipped, so one failed lookup does not lose
    the rest of the sentence.
    """
    entities = find_entities(sentence, verbose = verbose)
    sentence, entities, b_keep, company_des_len = sentence_preannotate(sentence, entities, company_info, verbose)

    ###Lower-case the aliases once: the entity is lower-cased before the comparison,
    ###so comparing against the raw (mixed-case) aliases would never match.
    company_aliases = {alias.lower() for alias in company_info['aliases']}

    completed_entities = []
    final_sentence = sentence
    added_len = 0
    for ent in entities:
        ###Further mentions of the company itself are never annotated.
        if ent['lemma'].lower() in company_aliases or ent['text'].lower() in company_aliases:
            continue

        ###Avoid double annotations
        if kill_doubles and ent['lemma'] in completed_entities:
            continue

        try:
            distances = contextual_entity(ent['lemma'], sentence, ent['text'], verbose)

            ###No candidate description at all: nothing to insert.
            if len(distances) == 0:
                continue

            scores = distances["distance"].values
            best_desc_ind = np.argmax(scores)
            best_dist = scores[best_desc_ind]

            ###Insert annotation only if confidence is high.
            if not best_dist > cutoff:
                continue

            best_desc = " (" + distances.iloc[best_desc_ind]["snippet"].replace(ent['text'] + " is a ", "") + ") "

            ###Entity offsets refer to the original text. Shift them by everything inserted so far,
            ###plus the company description when the entity comes after it.
            insert_at = ent['e_char'] + added_len
            if ent['e_char'] > b_keep:
                insert_at += company_des_len

            final_sentence = final_sentence[:insert_at] + best_desc + final_sentence[insert_at:]
            added_len += len(best_desc)
            completed_entities.append(ent['lemma'])

        except Exception as e:
            ###Best effort: report the problem and carry on with the next entity.
            print(e)

    return final_sentence
        


# Main

In [7]:
# Profile of the company under analysis, in the shape sentence_annotate expects:
#   'aliases'     - surface forms that count as a mention of the company itself
#   'description' - fixed text inserted after the first company mention; taken
#                   from the 'description' field of find_Qid's result.
company_info = {'aliases':['Apple', 'Apple inc.', 'Apple inc'], 'description':find_Qid('Apple Inc.')['description']}
# Bare expression so the notebook displays the resulting dict.
company_info

{'aliases': ['Apple', 'Apple inc.', 'Apple inc'],
 'description': 'American technology company based in Cupertino, California'}

In [8]:
###Download text
# Keep only the third field (index 2) of every item returned by the scraper.
# NOTE(review): presumably that field is the headline text - confirm against News_Scraper.
news = [t[2] for t in scrape_ticker_fv('aapl', verbose = 0, filtering = 'apple')]

In [10]:
# Single-item demo: annotate the news item at index 30.
# verbose = 1 makes find_entities render the entities with displaCy and is also
# forwarded to contextual_entity.
sentence = news[30]
print(sentence_annotate(sentence, company_info, verbose = 1))

AirPods is a wireless headphones designed and sold by Apple Inc. 0.9042751434773778
AirPods is a wireless earbuds designed and sold by Apple Inc. 0.8838199237549093
AirPods is a wireless earbuds designed and sold by Apple Inc. 0.8838199237549093
AirPods is a system on a chip (SoC) designed by Apple Inc. used in AirPods and a select variety of Beats headphones. 0.8801571095208066
AirPods is a 2020 audio track by Astrid S 0.8686082209810049

MacBook Pro is a laptop model by Apple 0.9002168012512097
MacBook Pro is a scientific article published on 01 October 2019 0.886362401527318
MacBook Pro is a Intel-based line of Macintosh notebook computers 0.8670542387190462
MacBook Pro is a Wikimedia template 0.8614781052905796

Apple Music is a mobile application developed by Apple 0.9012545328705068
Apple Music is a Internet online music service by Apple 0.8928938204907422
Apple Music is a music concert series held by Apple, Inc. 0.876347055365011
Apple Music is a radio station operated by Apple 

In [11]:

# Batch demo: annotate the news items at indices 30-34, printing each result
# followed by an empty line.
for c in news[30:35]:
    try:
        print(sentence_annotate(c, company_info, verbose = 1))
        print("")
    # A failure on one item is printed (followed by a literal "" marker) and the loop continues.
    except Exception as e: print(e, '""')
   

AirPods is a wireless headphones designed and sold by Apple Inc. 0.9042751434773778
AirPods is a wireless earbuds designed and sold by Apple Inc. 0.8838199237549093
AirPods is a wireless earbuds designed and sold by Apple Inc. 0.8838199237549093
AirPods is a system on a chip (SoC) designed by Apple Inc. used in AirPods and a select variety of Beats headphones. 0.8801571095208066
AirPods is a 2020 audio track by Astrid S 0.8686082209810049

MacBook Pro is a laptop model by Apple 0.9002168012512097
MacBook Pro is a scientific article published on 01 October 2019 0.886362401527318
MacBook Pro is a Intel-based line of Macintosh notebook computers 0.8670542387190462
MacBook Pro is a Wikimedia template 0.8614781052905796

Apple Music is a mobile application developed by Apple 0.9012545328705068
Apple Music is a Internet online music service by Apple 0.8928938204907422
Apple Music is a music concert series held by Apple, Inc. 0.876347055365011
Apple Music is a radio station operated by Apple 

AirPods is a wireless headphones designed and sold by Apple Inc. 0.884263706774613
AirPods is a 2020 audio track by Astrid S 0.8776724092904558
AirPods is a system on a chip (SoC) designed by Apple Inc. used in AirPods and a select variety of Beats headphones. 0.8608112783942227
AirPods is a wireless earbuds designed and sold by Apple Inc. 0.8603688965760294
AirPods is a wireless earbuds designed and sold by Apple Inc. 0.8603688965760294

Apple (American technology company based in Cupertino, California)  unveils new AirPods (wireless headphones designed and sold by Apple Inc.)  and high-end Mac computers as it expands its holiday lineup



Apple Music is a Internet online music service by Apple 0.8737458265202332
Apple Music is a mobile application developed by Apple 0.8722074644262275
Apple Music is a chart 0.8555329087380215
Apple Music is a radio station operated by Apple Inc. 0.8467023879626735
Apple Music is a music concert series held by Apple, Inc. 0.8383284622524054

Apple Music (Internet online music service by Apple)  at half the normal price? Yes, there is a catch



MacOS is a family of operating systems produced since 1984 by Apple for Macintosh computers 0.9030223659735157
MacOS is a operating system for Apple computers, launched in 2001 as Mac OS X 0.8905724000592091
MacOS is a macOS 0.8825327365927005
MacOS is a original operating system of Apple Mac (1984–2001) 0.8648926930345544
MacOS is a 15th major version of the macOS operating system 0.8485888308214622

MacOS (family of operating systems produced since 1984 by Apple for Macintosh computers)  Monterey release date: Apple (American technology company based in Cupertino, California)  to release new computer operating system next week



MacBook Pros is a Intel-based line of Macintosh notebook computers 0.8949870053361111
MacBook Pros is a scientific article published on 01 October 2019 0.8514319754309648
MacBook Pros is a Wikimedia template 0.8051535807874199
MacBook Pros is a Wikimedia template 0.8051535807874199

M1 Pro is a system on a chip (SoC) designed by Apple Inc. 0.9127741324313836
M1 Pro is a system on a chip (SoC) designed by Apple Inc. for the Macintosh computers and iPad Pro tablets 0.9115101058962158
M1 Pro is a 2018 Smartphone developed by Asus 0.8992021938902596
M1 Pro is a form of competition that is facilitated by electronic systems, particularly video games 0.8284504034215787
M1 Pro is a form of wrestling that combines athletics with theater 0.8274694545916563

M1 Max is a system on a chip (SoC) designed by Apple Inc. 0.9127873602600503
M1 Max is a German painter, draftsman, printmaker, sculptor and writer (1884-1950) 0.7851895728242126
M1 Max is a German painter, sculptor and graphic artist (1891-1