In [1]:
! pip install pandas numpy dash dash-bootstrap-components plotly wordcloud matplotlib nltk textblob nrclex bertopic sentence_transformers

Collecting dash
  Downloading dash-2.18.1-py3-none-any.whl.metadata (10 kB)
Collecting dash-bootstrap-components
  Downloading dash_bootstrap_components-1.6.0-py3-none-any.whl.metadata (5.2 kB)
Collecting nrclex
  Downloading NRCLex-4.0-py3-none-any.whl.metadata (3.2 kB)
Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
INFO: pip is looking at multiple versions of nrclex to determine which version

In [2]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from transformers import pipeline
import spacy
from nrclex import NRCLex
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer


In [3]:
# Stop-word list used by remove_stopwords()
nltk.download('stopwords')

# Punkt sentence tokenizer data, required by sent_tokenize()
nltk.download('punkt')

# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# on older releases this download simply reports that the resource is unknown.
nltk.download('punkt_tab')

# Lexicon used by SentimentIntensityAnalyzer (VADER)
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
# Read the ten monthly article dumps (files 1_articles.json .. 10_articles.json)
# and stack them once. Concatenating inside the loop re-copies the accumulated
# frame on every iteration and starts from an empty DataFrame.
monthly_frames = [
    pd.read_json(f'/content/drive/MyDrive/microsoft_articles/{month}_articles.json')
    for month in range(1, 11)
]

# ignore_index=True gives a fresh 0..n-1 index, same as reset_index(drop=True)
df = pd.concat(monthly_frames, ignore_index=True)
df.head()

Unnamed: 0,Date,Title,Author,Text
0,2024-01-31 11:56:19+00:00,"Microsoft beats Q2 earnings on AI, cloud strength",Daniel Howley · Technology Editor,Microsoft (MSFT) announced its second quarter ...
1,2024-01-03 15:00:00+00:00,"If You Invested $10,000 in Microsoft When Saty...","Jeremy Bowman, The Motley Fool",Microsoft (NASDAQ: MSFT) is back on top of the...
2,2024-01-09 11:00:00+00:00,Why Microsoft Stock Rallied 57% in 2023,"Danny Vena, The Motley Fool",Shares of Microsoft (NASDAQ: MSFT) charged sha...
3,2024-01-30 21:35:57+00:00,Microsoft Corp (MSFT) Reports Robust Growth wi...,GuruFocus Research,"Revenue: $62.0 billion, an 18% increase year-o..."
4,2024-01-04 08:01:09+00:00,Microsoft is adding an AI button to PC keyboar...,Daniel Howley · Technology Editor,Microsoft (MSFT) is doubling down on its commi...


In [5]:
# Parse the 'Date' column as datetimes and order the articles chronologically
df = df.assign(Date=pd.to_datetime(df['Date'])).sort_values(by='Date')
print('shape before removing duplicates' +str(df.shape))

# Keep the first row for every (Title, Date) pair and renumber the index 0..n-1
df = df.drop_duplicates(subset=['Title', 'Date']).reset_index(drop=True)
print('shape after removing duplicates' +str(df.shape))

# Monthly period of each article, used for grouping
df['Month'] = df['Date'].dt.to_period('M')


df.head()

shape before removing duplicates(1491, 4)
shape after removing duplicates(1170, 4)


Unnamed: 0,Date,Title,Author,Text,Month
0,2024-01-01 12:00:27+00:00,Investors in Microsoft (NASDAQ:MSFT) have seen...,editorial-team@simplywallst.com (Simply Wall...,The most you can lose on any stock (assuming y...,2024-01
1,2024-01-02 15:09:36+00:00,Best AI Stock 2024: Alphabet Stock vs. Microso...,"Parkev Tatevosian, CFA, The Motley Fool",Fool.com contributor Parkev Tatevosian compare...,2024-01
2,2024-01-03 14:42:51+00:00,1 Artificial Intelligence (AI) Stock Poised to...,"Parkev Tatevosian, CFA, The Motley Fool",Fool.com contributor Parkev Tatevosian highlig...,2024-01
3,2024-01-03 15:00:00+00:00,"If You Invested $10,000 in Microsoft When Saty...","Jeremy Bowman, The Motley Fool",Microsoft (NASDAQ: MSFT) is back on top of the...,2024-01
4,2024-01-03 15:29:29+00:00,Microsoft Copilot is now available on iOS and ...,Aisha Malik,"Over the holiday season, Microsoft quietly lau...",2024-01


In [6]:
# Function to clean text
def clean_text(text):
    """Normalize raw article text to lower-case letters separated by single spaces.

    Steps, in order: drop URL-like tokens (starting with ``http`` or ``www``),
    drop ``@mentions`` and ``#`` characters, drop every character that is not
    an ASCII letter or whitespace, lower-case, collapse whitespace runs.

    Note: sentence-ending punctuation is removed as well, so a sentence
    tokenizer applied to the result has no punctuation boundaries to split on.

    Args:
        text: The raw text. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN from a missing value) is treated as empty.

    Returns:
        str: The cleaned text; ``''`` for missing or non-string input.
    """
    # Missing values arrive as None/NaN; re.sub would raise TypeError on them
    if not isinstance(text, str):
        return ''

    # 'http\S+' already covers https links, so no separate alternative is needed
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return re.sub(r'\s+', ' ', text.lower()).strip()

# Clean every article body; the result is stored alongside the raw text
df['clean_content_context'] = df['Text'].map(clean_text)

# Preview the frame with the new column
df.head()

Unnamed: 0,Date,Title,Author,Text,Month,clean_content_context
0,2024-01-01 12:00:27+00:00,Investors in Microsoft (NASDAQ:MSFT) have seen...,editorial-team@simplywallst.com (Simply Wall...,The most you can lose on any stock (assuming y...,2024-01,the most you can lose on any stock assuming yo...
1,2024-01-02 15:09:36+00:00,Best AI Stock 2024: Alphabet Stock vs. Microso...,"Parkev Tatevosian, CFA, The Motley Fool",Fool.com contributor Parkev Tatevosian compare...,2024-01,foolcom contributor parkev tatevosian compares...
2,2024-01-03 14:42:51+00:00,1 Artificial Intelligence (AI) Stock Poised to...,"Parkev Tatevosian, CFA, The Motley Fool",Fool.com contributor Parkev Tatevosian highlig...,2024-01,foolcom contributor parkev tatevosian highligh...
3,2024-01-03 15:00:00+00:00,"If You Invested $10,000 in Microsoft When Saty...","Jeremy Bowman, The Motley Fool",Microsoft (NASDAQ: MSFT) is back on top of the...,2024-01,microsoft nasdaq msft is back on top of the te...
4,2024-01-03 15:29:29+00:00,Microsoft Copilot is now available on iOS and ...,Aisha Malik,"Over the holiday season, Microsoft quietly lau...",2024-01,over the holiday season microsoft quietly laun...


In [7]:
# NLTK's English stop-word list, as a set for fast membership checks
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed.

    The remaining tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)

# Stop-word-free variant of the cleaned text
df['clean_content_no_stopwords'] = df['clean_content_context'].map(remove_stopwords)
df.head()

Unnamed: 0,Date,Title,Author,Text,Month,clean_content_context,clean_content_no_stopwords
0,2024-01-01 12:00:27+00:00,Investors in Microsoft (NASDAQ:MSFT) have seen...,editorial-team@simplywallst.com (Simply Wall...,The most you can lose on any stock (assuming y...,2024-01,the most you can lose on any stock assuming yo...,lose stock assuming dont use leverage money br...
1,2024-01-02 15:09:36+00:00,Best AI Stock 2024: Alphabet Stock vs. Microso...,"Parkev Tatevosian, CFA, The Motley Fool",Fool.com contributor Parkev Tatevosian compare...,2024-01,foolcom contributor parkev tatevosian compares...,foolcom contributor parkev tatevosian compares...
2,2024-01-03 14:42:51+00:00,1 Artificial Intelligence (AI) Stock Poised to...,"Parkev Tatevosian, CFA, The Motley Fool",Fool.com contributor Parkev Tatevosian highlig...,2024-01,foolcom contributor parkev tatevosian highligh...,foolcom contributor parkev tatevosian highligh...
3,2024-01-03 15:00:00+00:00,"If You Invested $10,000 in Microsoft When Saty...","Jeremy Bowman, The Motley Fool",Microsoft (NASDAQ: MSFT) is back on top of the...,2024-01,microsoft nasdaq msft is back on top of the te...,microsoft nasdaq msft back top tech world days...
4,2024-01-03 15:29:29+00:00,Microsoft Copilot is now available on iOS and ...,Aisha Malik,"Over the holiday season, Microsoft quietly lau...",2024-01,over the holiday season microsoft quietly laun...,holiday season microsoft quietly launched copi...


In [8]:
sia = SentimentIntensityAnalyzer()

def get_vader_sentiment_scores(text):
    """Score `text` with VADER, averaging the scores of its sentences.

    The text is split with `sent_tokenize`, each sentence is scored with
    `sia.polarity_scores`, and every score key is averaged over the sentences.
    Previously each sentence overwrote the one before it, so only the last
    sentence's scores were returned. For a single-sentence text the result
    equals that sentence's scores.

    Args:
        text (str): The text to analyze.

    Returns:
        dict: Mean value per score key, or an empty dict when the text
        contains no sentences.
    """
    per_sentence = [sia.polarity_scores(sentence) for sentence in sent_tokenize(text)]

    # No sentences -> nothing to average; keep the original empty-dict result
    if not per_sentence:
        return {}

    n_sentences = len(per_sentence)
    return {
        key: sum(scores[key] for scores in per_sentence) / n_sentences
        for key in per_sentence[0]
    }

# One score dict per article, expanded into one column per score key
vader_sentiments_list = df['clean_content_context'].apply(get_vader_sentiment_scores).tolist()
vader_sentiments_df = pd.DataFrame(vader_sentiments_list)
vader_sentiments_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.026,0.752,0.222,0.9994
1,0.082,0.834,0.084,0.3612
2,0.083,0.832,0.084,0.0772
3,0.062,0.789,0.149,0.9976
4,0.014,0.867,0.119,0.9869


In [9]:
vader_sentiments_df.shape

(1170, 4)

In [None]:
# Initialize FinBERT pipeline for sentiment analysis using a model fine-tuned for financial text
try:
    finbert_pipeline = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", tokenizer="yiyanghkust/finbert-tone")
except Exception as e:
    print(f"Error loading FinBERT model: {e}")
    # Keep the name defined (as load_model() does by returning None) so the
    # scoring function can detect the failure instead of hitting a NameError
    # on every sentence.
    finbert_pipeline = None

def get_finbert_sentiment_scores(text):
    """
    Function to compute FinBERT sentiment scores on financial text.
    It splits the text into sentences; sentences longer than 512 characters are
    cut into consecutive 512-character chunks and each chunk is scored separately.

    Args:
        text (str): The input financial text to analyze.

    Returns:
        list: The first result of `finbert_pipeline` for each sentence or chunk.
        Sentences/chunks that fail are reported and skipped. An empty list is
        returned when the pipeline failed to load or tokenization fails.
    """
    # The load failure was already reported above; don't repeat it per sentence
    if finbert_pipeline is None:
        return []

    max_chars = 512

    try:
        # Tokenize the text into individual sentences
        sentences = sent_tokenize(text)
        finbert_scores = []

        for sentence in sentences:
            is_long = len(sentence) > max_chars
            if is_long:
                chunks = [sentence[i:i + max_chars] for i in range(0, len(sentence), max_chars)]
            else:
                chunks = [sentence]
            # Only used to keep the two original error messages distinct
            unit = "chunk" if is_long else "sentence"

            for chunk in chunks:
                try:
                    result = finbert_pipeline(chunk)
                    finbert_scores.append(result[0])
                except Exception as e:
                    print(f"Error analyzing {unit}: {chunk[:30]}... -> {e}")

        return finbert_scores
    except Exception as e:
        print(f"Error during sentiment analysis: {e}")
        return []

# Example: Apply the function to a dataframe column containing financial text
# df['finbert_sentiments'] = df['clean_content_context'].apply(get_finbert_sentiment_scores)
finbert = df['clean_content_context'].apply(get_finbert_sentiment_scores)

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize

def load_model(model_name):
    """
    Build a "sentiment-analysis" pipeline for a HuggingFace model.

    The same name is used for both the model and its tokenizer.

    Args:
        model_name (str): The HuggingFace model name.

    Returns:
        pipeline: The loaded pipeline, or None if loading raised an exception
        (the error is printed).
    """
    try:
        return pipeline("sentiment-analysis", model=model_name, tokenizer=model_name)
    except Exception as e:
        print(f"Error loading {model_name} model: {e}")
    return None

def get_sentiment_scores(text, model_pipeline):
    """
    Function to compute sentiment scores on financial text.
    It splits the text into sentences; sentences longer than 512 characters are
    cut into consecutive 512-character chunks and each chunk is scored separately.

    Args:
        text (str): The input text to analyze. Non-string or empty input
            yields an empty list.
        model_pipeline (pipeline): The sentiment analysis model pipeline, or
            None (what load_model() returns when loading failed).

    Returns:
        list: The first pipeline result for each sentence or chunk. Items that
        fail are reported and skipped; an empty list is returned when there is
        nothing to score or tokenization fails.
    """
    # load_model() already reported the failure; avoid one error line per sentence
    if model_pipeline is None:
        return []

    # Missing values (None/NaN) and empty strings contain no sentences
    if not isinstance(text, str) or not text:
        return []

    max_chars = 512

    try:
        # Tokenize text into sentences
        sentences = sent_tokenize(text)
        sentiment_scores = []

        for sentence in sentences:
            is_long = len(sentence) > max_chars
            if is_long:
                chunks = [sentence[i:i + max_chars] for i in range(0, len(sentence), max_chars)]
            else:
                chunks = [sentence]
            # Only used to keep the two original error messages distinct
            unit = "chunk" if is_long else "sentence"

            for chunk in chunks:
                try:
                    result = model_pipeline(chunk)
                    sentiment_scores.append(result[0])
                except Exception as e:
                    print(f"Error analyzing {unit}: {chunk[:30]}... -> {e}")

        return sentiment_scores
    except Exception as e:
        print(f"Error during sentiment analysis: {e}")
        return []

In [None]:
# Model names
finbert_model_name = "yiyanghkust/finbert-tone"
prosus_finbert_model_name = "ProsusAI/finbert"
roberta_financial_model_name = "textattack/roberta-financial-news"
bart_financial_model_name = "nlpjunkie/fin-news-bart-sentiment"
factera_financial_model_name = "amishra-factera/financial-sentiment-analysis"
pszemraj_finbert_model_name = "pszemraj/finbert"

# Load every model; load_model() returns None for a model that fails to load
finbert_pipeline = load_model(finbert_model_name)
prosus_finbert_pipeline = load_model(prosus_finbert_model_name)
roberta_financial_pipeline = load_model(roberta_financial_model_name)
bart_financial_pipeline = load_model(bart_financial_model_name)
factera_financial_pipeline = load_model(factera_financial_model_name)
pszemraj_finbert_pipeline = load_model(pszemraj_finbert_model_name)

# Score the cleaned article text with each model (one list of results per article).
# Every model reads the same 'clean_content_context' column; the Prosus line
# previously used the misspelled 'clean_notent_context', which raises KeyError.
finbert_sentiments = df['clean_content_context'].apply(lambda text: get_sentiment_scores(text, finbert_pipeline))
prosus_finbert_sentiment = df['clean_content_context'].apply(lambda text: get_sentiment_scores(text, prosus_finbert_pipeline))
roberta_financial_sentiment = df['clean_content_context'].apply(lambda text: get_sentiment_scores(text, roberta_financial_pipeline))
bart_financial_sentiment = df['clean_content_context'].apply(lambda text: get_sentiment_scores(text, bart_financial_pipeline))
factera_financial_sentiment = df['clean_content_context'].apply(lambda text: get_sentiment_scores(text, factera_financial_pipeline))
pszemraj_finbert_sentiment = df['clean_content_context'].apply(lambda text: get_sentiment_scores(text, pszemraj_finbert_pipeline))

In [None]:
# Function to return a DataFrame with mean scores for each label
def get_mean_scores_df(label_list, labels=('Negative', 'Neutral', 'Positive')):
    """Average the 'score' of the results in `label_list`, grouped by 'label'.

    Label names are matched to `labels` case-insensitively, because checkpoints
    may emit differently-cased names (e.g. 'positive' vs 'Positive'). With the
    previous exact match such results were treated as missing and every column
    came back as 0.

    Args:
        label_list: List of dicts with 'label' and 'score' keys (one per
            scored sentence/chunk).
        labels: Output column names, in the desired order.

    Returns:
        pd.DataFrame: A single row (index 'score') with one column per entry
        of `labels`. A label that never occurs gets 0. If `label_list` is
        empty, the row is all NaN (nothing was scored).
    """
    labels = list(labels)

    # No results for this article: groupby('label') would raise KeyError on an
    # empty frame, so return an explicit all-NaN row to keep one row per article.
    if len(label_list) == 0:
        return pd.DataFrame([[float('nan')] * len(labels)], index=['score'], columns=labels)

    scores = pd.DataFrame(label_list)

    # Map each emitted label to the requested spelling, ignoring case;
    # labels that match nothing are left alone and dropped by the reindex below.
    canonical = {label.casefold(): label for label in labels}
    scores['label'] = scores['label'].map(
        lambda raw: canonical.get(str(raw).casefold(), raw)
    )

    row_data = scores.groupby('label').mean().T

    # Requested labels only, in the requested order; absent ones filled with 0
    return row_data.reindex(columns=labels, fill_value=0)

def _stack_mean_scores(per_article_results, *label_args):
    """Stack the one-row mean-score frames of all articles into one frame."""
    rows = [get_mean_scores_df(results, *label_args) for results in per_article_results]
    return pd.concat(rows, ignore_index=True)

# One mean-score frame per model (one row per article)
finbert_df = _stack_mean_scores(finbert_sentiments)
prosus_finbert_df = _stack_mean_scores(prosus_finbert_sentiment)
roberta_financial_df = _stack_mean_scores(roberta_financial_sentiment)
bart_financial_df = _stack_mean_scores(bart_financial_sentiment)
factera_financial_df = _stack_mean_scores(factera_financial_sentiment, ['Negative', 'Positive'])  # No 'Neutral'
pszemraj_finbert_df = _stack_mean_scores(pszemraj_finbert_sentiment)

# Place the per-model frames next to each other for comparison
combined_df = pd.concat(
    [
        finbert_df,
        prosus_finbert_df,
        roberta_financial_df,
        bart_financial_df,
        factera_financial_df,
        pszemraj_finbert_df,
    ],
    axis=1,
)

# Column names carrying the model each score came from
combined_df.columns = (
    ['FinBERT_Neg', 'FinBERT_Neu', 'FinBERT_Pos']
    + ['Prosus_Neg', 'Prosus_Neu', 'Prosus_Pos']
    + ['Roberta_Neg', 'Roberta_Neu', 'Roberta_Pos']
    + ['BART_Neg', 'BART_Neu', 'BART_Pos']
    + ['Factera_Neg', 'Factera_Pos']  # Factera lacks 'Neutral'
    + ['Pszemraj_Neg', 'Pszemraj_Neu', 'Pszemraj_Pos']
)

combined_df.head()


In [None]:
# import numpy as np
# import pandas as pd


# # Function to return a DataFrame with mean scores for each label
# def get_mean_scores_df(label_list):
#     labels = ['Negative',	'Neutral', 'Positive']
#     final_df = pd.DataFrame(columns=labels)
#     row_data = pd.DataFrame(label_list).groupby('label').mean().T

#     # Ensure all columns are present and fill missing ones with 0
#     for label in labels:
#         if label not in row_data.columns:
#             row_data[label] = 0

#     # Reorder the columns to match the desired order
#     row_data = row_data[labels]

#     # Append the row to the final DataFrame
#     final_df = pd.concat([df, row_data], ignore_index=True)

#     return final_df



# # Use apply with axis=1 to iterate row-wise
# finbert_df = pd.concat(finbert.apply(lambda row: get_mean_scores_df(row)).tolist(), ignore_index=True)
# finbert_df.head()

In [None]:
# def load_loughran_mcdonald_lexicon(path):
#     lexicon_df = pd.read_excel(path)
#     positive_words = set(lexicon_df[lexicon_df['Positive'] == 1]['Word'].str.lower())
#     negative_words = set(lexicon_df[lexicon_df['Negative'] == 1]['Word'].str.lower())
#     return positive_words, negative_words

# positive_words, negative_words = load_loughran_mcdonald_lexicon('/content/drive/MyDrive/microsoft_articles/Loughran-McDonald_MasterDictionary_1993-2023.xlsx')

# def get_domain_sentiment(text, positive_words, negative_words):
#     tokens = nltk.word_tokenize(text.lower())
#     pos_count = len([word for word in tokens if word in positive_words])
#     neg_count = len([word for word in tokens if word in negative_words])
#     return {'positive': pos_count, 'negative': neg_count}

# # df['domain_sentiments'] = df['clean_content_no_stopwords'].apply(lambda text: get_domain_sentiment(text, positive_words, negative_words))
# domain_sentiments = df['clean_content_no_stopwords'].apply(lambda text: get_domain_sentiment(text, positive_words, negative_words))

In [None]:
import spacy
from nrclex import NRCLex

# SpaCy pipeline used for sentence splitting and entity recognition
# (the 'en_core_web_sm' model must be installed)
nlp = spacy.load('en_core_web_sm')

def extract_entities_and_emotions(text):
    """Return, for each sentence spaCy finds in `text`, its entities and emotion scores.

    Each result is a dict with:
        'sentence': the sentence text,
        'entities': the text of every entity in the sentence,
        'emotions': `NRCLex(sentence).raw_emotion_scores`.
    """
    return [
        {
            'sentence': sent.text,
            'entities': [ent.text for ent in sent.ents],
            'emotions': NRCLex(sent.text).raw_emotion_scores,
        }
        for sent in nlp(text).sents
    ]

# Kept as a separate Series rather than a df column:
# df['entities_and_emotions'] = df['clean_content_context'].apply(extract_entities_and_emotions)
entities_and_emotions = df['clean_content_context'].map(extract_entities_and_emotions)

In [None]:
# Preview only the first rows; each element is a long list of per-sentence dicts
entities_and_emotions.head()

In [None]:
import numpy as np
import pandas as pd

def extract_emotions(row):
    """Average the per-sentence emotion scores of one article.

    Args:
        row: List of dicts, each holding an 'emotions' dict that maps an
            emotion name to its score.

    Returns:
        pd.DataFrame: One row with a column per emotion seen in `row`; each
        value is the emotion's total divided by the number of entries in `row`
        (entries lacking an emotion count as 0 for it).
    """
    n_entries = len(row)

    # Running total per emotion, in order of first appearance
    totals = {}
    for entry in row:
        for emotion, value in entry['emotions'].items():
            totals[emotion] = totals.get(emotion, 0) + value

    averaged = {emotion: total / n_entries for emotion, total in totals.items()}
    return pd.DataFrame([averaged])

# One averaged-emotion row per article, stacked into a single frame
emotion_rows = [extract_emotions(row) for row in entities_and_emotions]
emotions_df = pd.concat(emotion_rows, ignore_index=True)

# Emotions absent from an article come out of the concat as NaN; treat them as 0
emotions_df = emotions_df.fillna(0)
emotions_df.head()

In [None]:
# Prefix every frame's columns with its source so names stay unique after joining
(
    final_df,
    vader_sentiments_df_with_prefix,
    finbert_df_with_prefix,
    prosus_finbert_df_with_prefix,
    roberta_financial_df_with_prefix,
    bart_financial_df_with_prefix,
    factera_financial_df_with_prefix,
    pszemraj_finbert_df_with_prefix,
    emotions_df_with_prefix,
) = (
    frame.add_prefix(prefix)
    for frame, prefix in (
        (df, 'original_'),                            # the original article frame
        (vader_sentiments_df, 'vader_'),              # VADER sentiment
        (finbert_df, 'finbert_'),                     # FinBERT sentiment
        (prosus_finbert_df, 'prosus_finbert_'),       # Prosus FinBERT sentiment
        (roberta_financial_df, 'roberta_financial_'), # Roberta financial sentiment
        (bart_financial_df, 'bart_financial_'),       # BART financial sentiment
        (factera_financial_df, 'factera_'),           # Factera financial sentiment
        (pszemraj_finbert_df, 'pszemraj_finbert_'),   # Pszemraj FinBERT sentiment
        (emotions_df, 'emotions_'),                   # NRCLex emotions
    )
)

In [None]:
# Frames to join, in the column order of the final table
dfs = [
    final_df,
    vader_sentiments_df_with_prefix,
    finbert_df_with_prefix,
    prosus_finbert_df_with_prefix,
    roberta_financial_df_with_prefix,
    bart_financial_df_with_prefix,
    factera_financial_df_with_prefix,
    pszemraj_finbert_df_with_prefix,
    emotions_df_with_prefix,
]

# Join side by side; rows are matched on the index
final_df = pd.concat(dfs, axis='columns')

# Preview the joined frame
final_df.head()


In [None]:
# Discard every row that has a missing value in any column
final_df = final_df.dropna()
final_df.columns

In [None]:
# Persist the joined table (the index is written as the first, unnamed column)
final_df.to_csv('/content/drive/MyDrive/microsoft_articles/final_df.csv')

In [None]:
# final_df = pd.read_csv('/content/drive/MyDrive/microsoft_articles/final_df.csv')

In [None]:
# final_df['original_Date'].info