In [4]:
import pathlib
import codecs
import json
import pysentiment2 as ps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [5]:
def sentiment_analysis_signal(text):
    """Score ``text`` with pysentiment2's Loughran-McDonald (``ps.LM``) dictionary.

    Args:
        text: Raw text to tokenize and score.

    Returns:
        dict: The mapping produced by ``LM.get_score`` with every value
        converted to a built-in number: numpy integers become ``int`` and
        every other value is passed through ``float``.
    """
    analyzer = ps.LM()
    raw_scores = analyzer.get_score(analyzer.tokenize(text))

    plain_scores = {}
    for name, value in raw_scores.items():
        if isinstance(value, np.integer):
            plain_scores[name] = int(value)
        else:
            # Everything that is not a numpy integer (including plain
            # Python ints) is coerced to float.
            plain_scores[name] = float(value)
    return plain_scores


def language_similarity(prev_text, curr_text):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on exactly these two documents, so
    the weights depend only on this pair.

    Args:
        prev_text: The earlier document (the caller passes the previous
            year's Item 7 text).
        curr_text: The later document.

    Returns:
        The single similarity value taken from the 1x1 result of
        ``cosine_similarity``.
    """
    documents = [prev_text, curr_text]
    weights = TfidfVectorizer().fit_transform(documents)
    # Slice (rather than index) so each operand stays two-dimensional.
    prev_row, curr_row = weights[0:1], weights[1:2]
    return cosine_similarity(prev_row, curr_row)[0][0]


def advanced_nlp_analysis(text, classifier=None, chunk_size=512):
    """Return a confidence-weighted average sentiment for ``text``.

    The text is cut into consecutive slices of ``chunk_size`` *characters*
    (not tokens). Each slice is classified, the label is mapped to a sign
    (positive -> +1, negative -> -1, neutral -> 0), multiplied by the
    classifier's confidence, and the signed values are averaged.

    Args:
        text: Text to analyse. Empty or ``None`` yields ``0.0`` without
            loading a model.
        classifier: Optional callable taking a string and returning a
            sequence whose first element is a mapping with ``'label'`` and
            ``'score'`` keys. When omitted, a
            ``pipeline("sentiment-analysis", model="ProsusAI/finbert")`` is
            built once and reused on later calls.
        chunk_size: Number of characters per slice. Defaults to 512.

    Returns:
        float: Mean of the signed confidences over all slices.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if the classifier
            returns a label other than positive/negative/neutral
            (compared case-insensitively).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return 0.0

    if classifier is None:
        # Building the pipeline loads the model, so keep one instance on the
        # function object instead of rebuilding it for every filing.
        classifier = getattr(advanced_nlp_analysis, "_default_classifier", None)
        if classifier is None:
            classifier = pipeline("sentiment-analysis", model="ProsusAI/finbert")
            advanced_nlp_analysis._default_classifier = classifier

    label_sign = {'positive': 1, 'negative': -1, 'neutral': 0}
    weighted_scores = []
    for start in range(0, len(text), chunk_size):
        result = classifier(text[start:start + chunk_size])[0]
        label = str(result['label']).lower()
        if label not in label_sign:
            # Fail loudly: without this guard an unknown label would reuse
            # the sign computed for the previous chunk.
            raise ValueError(f"Unexpected sentiment label: {result['label']!r}")
        weighted_scores.append(label_sign[label] * result['score'])

    return sum(weighted_scores) / max(len(weighted_scores), 1)


In [6]:
def read_file(year, ticker):
    """Load the stored 10-K JSON document for ``ticker`` in ``year``.

    Reads ``api-data/<year>/<ticker>/10_K_info.txt`` (relative to the current
    working directory) as UTF-8, ignoring undecodable bytes, and parses it as
    JSON.

    Args:
        year: Filing year; interpolated into the path, so a str or an int works.
        ticker: Ticker symbol used as the sub-directory name.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    filing_dir = pathlib.Path(f'api-data/{year}/{ticker}')
    filing_path = f'{filing_dir}/10_K_info.txt'
    with codecs.open(filing_path, mode='r', encoding='utf-8', errors='ignore') as handle:
        parsed = json.load(handle)
    return parsed

def process_tickers(tickers, years):
    """Compute the text signals for every (year, ticker) filing and save them.

    For each combination the current filing is loaded with ``read_file``, the
    signals below are added to the loaded dict, and the dict is written as
    JSON to ``api-data-signal/<year>/<ticker>/10_K_info.txt``:

    * ``sentiment_score``  - result of ``sentiment_analysis_signal``
    * ``similarity_score`` - ``language_similarity`` against the previous
      year's ``item_7_text``; ``0`` if the computation fails
    * ``nlp_result``       - ``advanced_nlp_analysis``; ``0`` if it fails

    A filing that cannot be read is skipped with a warning instead of
    aborting the whole run. A missing or unreadable previous-year filing is
    treated as empty text. Signal failures are logged so that a fallback
    value of ``0`` can be told apart from a genuine score.

    Args:
        tickers: Iterable of ticker symbols.
        years: Iterable of years (str or int); each must be convertible with
            ``int()`` so the previous year can be derived.

    Returns:
        None. Results are written to disk.
    """
    import logging  # local import keeps this cell self-contained

    log = logging.getLogger(__name__)

    # Wrap the sequence itself so tqdm can infer the total and draw a real bar.
    for year in tqdm(years):
        for ticker in tickers:
            try:
                curr_data = read_file(year, ticker)
            except (OSError, ValueError) as exc:
                # Missing file or malformed JSON: skip this filing only.
                log.warning("Skipping %s/%s: cannot read filing (%s)", year, ticker, exc)
                continue
            if not curr_data:
                continue
            curr_text = curr_data.get("item_7_text", "")

            try:
                prev_text = read_file(int(year) - 1, ticker).get("item_7_text", "")
            except Exception:
                # No usable prior-year filing (always the case for the first
                # year on disk): compare against empty text.
                prev_text = ""

            curr_data["sentiment_score"] = sentiment_analysis_signal(curr_text)

            try:
                curr_data["similarity_score"] = language_similarity(prev_text, curr_text)
            except Exception as exc:
                log.warning("similarity_score failed for %s/%s: %s", year, ticker, exc)
                curr_data["similarity_score"] = 0

            try:
                curr_data["nlp_result"] = advanced_nlp_analysis(curr_text)
            except Exception as exc:
                log.warning("nlp_result failed for %s/%s: %s", year, ticker, exc)
                curr_data["nlp_result"] = 0

            output_dir = pathlib.Path(f'api-data-signal/{year}/{ticker}')
            output_dir.mkdir(parents=True, exist_ok=True)
            with codecs.open(f'{output_dir}/10_K_info.txt', 'w', 'utf-8', errors='ignore') as f:
                json.dump(curr_data, f)

In [7]:
# Ticker symbols to process, in processing order.
tickers = [
    "AMZN", "AXP", "AMGN", "AAPL", "BA", "CAT",
    "CSCO", "CVX", "GS", "HD", "HON", "IBM",
    "JNJ", "KO", "JPM", "MCD", "MMM", "MRK",
    "MSFT", "NKE", "PG", "SHW", "TRV", "UNH",
    "CRM", "NVDA", "VZ", "V", "WMT", "DIS",
]
# Filing years 2010 through 2023 inclusive, as strings.
years = [str(filing_year) for filing_year in range(2010, 2024)]
process_tickers(tickers, years)

14it [02:52, 12.30s/it]


In [None]:
# NOTE(review): `results` is not assigned in any cell visible in this
# notebook, and this cell has no execution count, so the saved output is
# presumably left over from an earlier session. Its keys ('sentiment',
# 'language_similarity', 'nlp_analysis') also differ from the ones
# process_tickers writes ('sentiment_score', 'similarity_score',
# 'nlp_result'). TODO confirm where `results` is meant to come from; on a
# fresh kernel this line would raise NameError unless it is defined elsewhere.
print(results)

{'2010': {'AXP': {'sentiment': {'Positive': 1, 'Negative': 0, 'Polarity': 0.9999990000010001, 'Subjectivity': 0.041666664930555625}, 'language_similarity': 0.0, 'nlp_analysis': 0.0}, 'MSFT': {'sentiment': {'Positive': 99, 'Negative': 120, 'Polarity': -0.09589041052104835, 'Subjectivity': 0.03631238599961658}, 'language_similarity': 0.9703686277651167, 'nlp_analysis': 0.05044493079185486}}, '2011': {'AXP': {'sentiment': {'Positive': 1, 'Negative': 0, 'Polarity': 0.9999990000010001, 'Subjectivity': 0.041666664930555625}, 'language_similarity': 0.8939305715734549, 'nlp_analysis': 0.0}, 'MSFT': {'sentiment': {'Positive': 96, 'Negative': 129, 'Polarity': -0.14666666601481482, 'Subjectivity': 0.03749999999375}, 'language_similarity': 0.9949837013659897, 'nlp_analysis': 0.05371086600887982}}}


In [None]:
# Demonstration: no 2009 filing is stored for AXP, so read_file raises
# FileNotFoundError. This is why the first processed year (2010) has no
# previous-year text to compare against. The exception is caught and printed
# so the notebook still runs top to bottom.
try:
    read_file("2009", "AXP")
except FileNotFoundError as exc:
    print(f"{type(exc).__name__}: {exc}")

FileNotFoundError: [Errno 2] No such file or directory: 'api-data/2009/AXP/10_K_info.txt'