# Guidance text LSEG

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.2 MB/s[0m eta [36m0:00:0

In [None]:
# Load the Colab Drive helper and mount Google Drive at /content/drive,
# which is the root of the path the dataset is later read from.
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Load the raw guidance export from the mounted Drive into `df_guidance`.
# NOTE(review): the folder name in the path ends with a space ('Dissertation_Final /') - confirm
# that this matches the actual Drive folder before renaming anything.
df_guidance = pd.read_csv('/content/drive/MyDrive/Dissertation_Final /Data/Raw/lseg_text.csv')

In [None]:
# Sanity check: (rows, columns) of the raw frame right after loading.
df_guidance.shape

(3811, 7)

# Summarize

In [None]:
# Find the long texts that may need summarizing.
# Character length of every guidance text. Missing texts are treated as empty strings so
# they get length 0; converting with str(x) turned NaN into the literal 'nan' and
# reported a length of 3 for rows that have no text at all.
df_guidance['text_length'] = df_guidance['Guidance Text'].fillna('').astype(str).str.len()

# Reuse the stored lengths instead of measuring every text a second time.
long_texts = df_guidance[df_guidance['text_length'] > 512]
print(f"Number of texts longer than 512 characters: {len(long_texts)}")

Number of texts longer than 512 characters: 703


In [None]:
from transformers import AutoTokenizer

# Tokenizer used to measure how long each text is in model tokens
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Encode each text (coerced to str) and record the length of the resulting id sequence
df_guidance['num_tokens'] = [
    len(tokenizer.encode(str(entry))) for entry in df_guidance['Guidance Text']
]

# Rows whose token count is above 512
exceeds_limit = df_guidance['num_tokens'] > 512
long_token_texts = df_guidance[exceeds_limit]

print(f"Number of texts with more than 512 tokens: {len(long_token_texts)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors


Number of texts with more than 512 tokens: 67


In [None]:
from transformers import pipeline

# Create a summarization pipeline with the checkpoint pinned explicitly.
# Without `model`/`revision` the library picks a default and warns that this is not
# recommended; the values below are the ones that default resolved to in the recorded run,
# so results stay the same while becoming reproducible.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    revision="a4f8f3e",
)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
def summarize_text(text, max_tokens=512, min_summary_tokens=200):
    """Return `text` unchanged unless it is a string that is too long, then return a summary.

    Relies on the notebook-level `tokenizer` (used to measure length) and `summarizer`
    (the summarization pipeline) being defined before the first call.

    Args:
        text: Value to check. Anything that is not a `str` (e.g. NaN) is returned as is.
        max_tokens: Length threshold; also passed to the summarizer as `max_length`.
        min_summary_tokens: Passed to the summarizer as `min_length`.

    Returns:
        The original value, or the summarizer's `summary_text` when the encoded text is
        longer than `max_tokens`.
    """
    # Non-string values are passed through untouched
    if not isinstance(text, str):
        return text

    # Measure with encode() so the count includes the special tokens, matching how the
    # `num_tokens` column is computed; tokenize() leaves them out, so texts just under
    # the limit were not summarized even though their encoded length exceeded it.
    if len(tokenizer.encode(text)) <= max_tokens:
        return text

    summary = summarizer(
        text,
        max_length=max_tokens,
        min_length=min_summary_tokens,
        do_sample=False,
        # Let the pipeline cut inputs that exceed the summarization model's own input limit
        # instead of failing on them.
        truncation=True,
    )
    return summary[0]['summary_text']

In [None]:
# Overwrite 'Guidance Text' with summarize_text's result for every row (author's note: this takes
# about 45 minutes to run).
# NOTE(review): 'text_length' and 'num_tokens' were computed before this overwrite and are not
# recomputed in the visible cells, so they describe the original text, not the summaries.
df_guidance['Guidance Text'] = df_guidance['Guidance Text'].apply(summarize_text)

Your max_length is set to 512, but your input_length is only 500. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=250)
Your max_length is set to 512, but your input_length is only 503. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=251)
Your max_length is set to 512, but your input_length is only 503. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=251)
Your max_length is set to 512, but your input_length is only 503. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2

# Pre-process

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer data, stop-word lists and WordNet
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)


# Preprocessing function
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Lowercase, tokenize, remove English stop words and lemmatize `text`.

    Args:
        text: Raw text. Null values (as judged by `pd.isnull`) produce an empty string;
            other non-string values are converted with `str()` before processing.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""

    # The stop-word set and lemmatizer do not depend on the input, so they are built once
    # and reused instead of being recreated for every row.
    stop_words, lemmatizer = _preprocess_resources()

    # str() keeps a stray non-string value (e.g. a number) from failing on .lower()
    tokens = word_tokenize(str(text).lower())

    # Drop stop words, then lemmatize what is left
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(lemmas)

# Run every guidance text through preprocess_text and keep the result in its own column
df_guidance['preprocessed_text'] = df_guidance['Guidance Text'].map(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Preview the first three rows, including the derived length and preprocessed-text columns.
df_guidance.head(3)

Unnamed: 0,Instrument,Activation Date,Guidance Measure,Guidance High Value,Guidance Low Value,Guidance Text,The Doc Type,text_length,num_tokens,preprocessed_text
0,LSEG.L,,,,,,,3,3,
1,LSEG.L,,,,,,,3,3,
2,LSEG.L,,,,,,,3,3,


In [None]:
# Share of missing (NaN) entries per column, as a percentage
nan_percent = df_guidance.isna().mean().mul(100)
print(nan_percent)

Instrument              0.000000
Activation Date        31.723957
Guidance Measure       31.723957
Guidance High Value    90.396221
Guidance Low Value     90.396221
Guidance Text          31.986355
The Doc Type           31.723957
text_length             0.000000
num_tokens              0.000000
preprocessed_text       0.000000
dtype: float64


In [None]:
import numpy as np

# Treat empty strings as missing so they show up in the NaN statistics
df_guidance = df_guidance.replace("", np.nan)

# Per-column percentage of missing values after the replacement
missing_share = df_guidance.isna().mean()
nan_percent = missing_share * 100
print(nan_percent)

Instrument              0.000000
Activation Date        31.723957
Guidance Measure       31.723957
Guidance High Value    90.396221
Guidance Low Value     90.396221
Guidance Text          31.986355
The Doc Type           31.723957
text_length             0.000000
num_tokens              0.000000
preprocessed_text      32.038835
dtype: float64


In [None]:
# Remove the two guidance-value columns, then keep only rows without any missing value
df_guidance_cleaned = (
    df_guidance
    .drop(columns=['Guidance High Value', 'Guidance Low Value'])
    .dropna(how='any')
)

In [None]:
# Per-column percentage of missing values in the cleaned frame
nan_percent = 100 * df_guidance_cleaned.isna().mean()
print(nan_percent)

Instrument           0.0
Activation Date      0.0
Guidance Measure     0.0
Guidance Text        0.0
The Doc Type         0.0
text_length          0.0
num_tokens           0.0
preprocessed_text    0.0
dtype: float64


In [None]:
# (rows, columns) remaining after dropping the two value columns and all rows with missing data.
df_guidance_cleaned.shape

(2375, 8)

# FinBERT

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

#TONE
# Load the pre-trained FinBERT tone model (3 labels) and its tokenizer.
# Note: this rebinds the notebook-level `tokenizer`, which summarize_text also reads,
# so re-running the summarization cell after this one would measure with this tokenizer.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Create a sentiment-analysis pipeline
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
df_guidance_cleaned['preprocessed_text'] = df_guidance_cleaned['preprocessed_text'].fillna('').astype(str)

# Classify all texts in a single pipeline call (one result dict per text, in row order).
# truncation/max_length cap each input at 512 tokens so an over-long text is cut
# instead of raising an indexing error inside the model.
# NOTE(review): the model is fed the stop-word-stripped, lemmatized text, not the original
# sentences - confirm this is intended.
sentiment_results = nlp(
    df_guidance_cleaned['preprocessed_text'].tolist(),
    truncation=True,
    max_length=512,
)
df_guidance_cleaned['sentiment_results'] = sentiment_results  #result
df_guidance_cleaned['sentiment_label'] = [result['label'] for result in sentiment_results]  #label
df_guidance_cleaned['sentiment_score'] = [result['score'] for result in sentiment_results]  #score

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

#ESG
# Load the pre-trained FinBERT ESG model (4 labels) and its tokenizer
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')

# Create a text-classification pipeline
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

# Classify all texts in a single pipeline call (one result dict per text, in row order).
# truncation/max_length cap each input at 512 tokens so an over-long text is cut
# instead of raising an indexing error inside the model.
esg_results = nlp(
    df_guidance_cleaned['preprocessed_text'].tolist(),
    truncation=True,
    max_length=512,
)
df_guidance_cleaned['esg_classification'] = esg_results
df_guidance_cleaned['esg_classification_label'] = [result['label'] for result in esg_results]
df_guidance_cleaned['esg_classification_score'] = [result['score'] for result in esg_results]

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load the pre-trained FinBERT forward-looking-statement (FLS) model (3 labels) and its tokenizer
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')

# Create a text-classification pipeline
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

# Classify all texts in a single pipeline call (one result dict per text, in row order).
# truncation/max_length cap each input at 512 tokens so an over-long text is cut
# instead of raising an indexing error inside the model.
classification_results = nlp(
    df_guidance_cleaned['preprocessed_text'].tolist(),
    truncation=True,
    max_length=512,
)

# Split each result dict into its label and score
classification_labels = [result['label'] for result in classification_results]
classification_scores = [result['score'] for result in classification_results]

# Add results to DataFrame
df_guidance_cleaned['fls_classification'] = classification_results
df_guidance_cleaned['fls_classification_label'] = classification_labels
df_guidance_cleaned['fls_classification_score'] = classification_scores

In [None]:
# Column dtypes after the three classification passes have added their result columns.
df_guidance_cleaned.dtypes

Instrument                   object
Activation Date              object
Guidance Measure             object
Guidance Text                object
The Doc Type                 object
text_length                   int64
num_tokens                    int64
preprocessed_text            object
sentiment_results            object
sentiment_label              object
sentiment_score             float64
esg_classification           object
esg_classification_label     object
esg_classification_score    float64
fls_classification           object
fls_classification_label     object
fls_classification_score    float64
dtype: object

In [None]:
# Integer codes for the labels produced by the three classifiers

#SENTIMENT
label_mapping = {'Neutral': 0, 'Positive': 1, 'Negative': 2}

#ESG
# NOTE(review): these codes skip the value 1 (0, 2, 3, 4) - confirm this is intentional.
esg_mapping = {'Environmental': 0, 'Social': 2, 'Governance': 3, 'None': 4}

#FLS
fls_mapping = {'Non-specific FLS': 0,'Specific FLS': 1, 'Not FLS': 2}

# (label column, numeric column to create, mapping) - applied in this order
for label_col, numeric_col, codes in (
    ('sentiment_label', 'sentiment_numerical', label_mapping),
    ('esg_classification_label', 'esg_numerical', esg_mapping),
    ('fls_classification_label', 'fls_numerical', fls_mapping),
):
    df_guidance_cleaned[numeric_col] = df_guidance_cleaned[label_col].map(codes)

In [None]:
# Remove the character-length column and the three text label columns
columns_to_drop = [
    'text_length',
    'sentiment_label',
    'fls_classification_label',
    'esg_classification_label',
]
df_guidance_cleaned = df_guidance_cleaned.drop(columns_to_drop, axis=1)

In [None]:
#df_guidance = df_guidance.drop(['Activation Date','Guidance Text','num_tokens','sentiment_results','esg_classification','fls_classification'], axis=1)

In [None]:
# Preview the first three classified rows with their scores and numeric label codes.
df_guidance_cleaned.head(3)

Unnamed: 0,Instrument,Activation Date,Guidance Measure,Guidance Text,The Doc Type,num_tokens,preprocessed_text,sentiment_results,sentiment_score,esg_classification,esg_classification_score,fls_classification,fls_classification_score,sentiment_numerical,esg_numerical,fls_numerical
12,LSEG.L,2021-03-05T07:00:03Z,Revenue,5-7% Total Income (excluding recoveries) CAGR ...,Press Release,59,5-7 % total income ( excluding recovery ) cagr...,"{'label': 'Neutral', 'score': 0.9993962049484253}",0.999396,"{'label': 'None', 'score': 0.9837270975112915}",0.983727,"{'label': 'Not FLS', 'score': 0.8151842355728149}",0.815184,0,4,2
13,LSEG.L,2021-03-05T09:00:00Z,Revenue,How much should we think is netted off for the...,Transcript,29,"much think netted synergy 2021 , guess , full ...","{'label': 'Neutral', 'score': 0.9997803568840027}",0.99978,"{'label': 'None', 'score': 0.9469303488731384}",0.94693,"{'label': 'Not FLS', 'score': 0.8872433304786682}",0.887243,0,4,2
14,LSEG.L,2021-07-02T12:00:01Z,Revenue,CAGR) in the total market for Data & Analytics...,Press Release,158,"cagr ) total market data & analytics . event ,...","{'label': 'Positive', 'score': 0.9999123811721...",0.999912,"{'label': 'None', 'score': 0.8084553480148315}",0.808455,"{'label': 'Specific FLS', 'score': 0.462712168...",0.462712,1,4,1


In [None]:
# List the columns that remain after dropping the helper and label columns.
df_guidance_cleaned.columns

Index(['Instrument', 'Activation Date', 'Guidance Measure', 'Guidance Text',
       'The Doc Type', 'num_tokens', 'preprocessed_text', 'sentiment_results',
       'sentiment_score', 'esg_classification', 'esg_classification_score',
       'fls_classification', 'fls_classification_score', 'sentiment_numerical',
       'esg_numerical', 'fls_numerical'],
      dtype='object')

In [None]:
import pandas as pd

# Parse 'Activation Date' into datetimes once and store it back
activation_ts = pd.to_datetime(df_guidance_cleaned['Activation Date'])
df_guidance_cleaned['Activation Date'] = activation_ts

# Derive calendar year and month columns from the parsed timestamps
df_guidance_cleaned['Year'] = activation_ts.dt.year
df_guidance_cleaned['Month'] = activation_ts.dt.month

In [None]:
# For each year, collect the distinct months that have at least one record
months_per_year = df_guidance_cleaned.groupby('Year')['Month'].unique().to_dict()

# Calendar months 1 through 12
all_months = list(range(1, 13))

# Report, year by year, which calendar months have no records
for year in months_per_year:
    absent = set(all_months).difference(months_per_year[year])
    if not absent:
        print(f"For the year {year}: All twelve months are present.")
        continue
    print(f"For the year {year}: Months {sorted(absent)} are missing.")

For the year 2015: Months [1, 2, 3, 4, 5, 6, 8, 9, 11, 12] are missing.
For the year 2016: Months [1, 2, 3, 4, 5, 7, 8, 9, 10, 11] are missing.
For the year 2017: All twelve months are present.
For the year 2018: All twelve months are present.
For the year 2019: All twelve months are present.
For the year 2020: All twelve months are present.
For the year 2021: All twelve months are present.
For the year 2022: All twelve months are present.
For the year 2023: Months [8, 9, 10, 11, 12] are missing.


In [None]:
# (rows, columns) of the classified frame after adding 'Year' and 'Month'.
df_guidance_cleaned.shape

(2375, 18)

In [None]:
# Display the classified frame (the notebook renders a truncated preview of the first and last rows).
df_guidance_cleaned

Unnamed: 0,Instrument,Activation Date,Guidance Measure,Guidance Text,The Doc Type,num_tokens,preprocessed_text,sentiment_results,sentiment_score,esg_classification,esg_classification_score,fls_classification,fls_classification_score,sentiment_numerical,esg_numerical,fls_numerical,Year,Month
12,LSEG.L,2021-03-05 07:00:03+00:00,Revenue,5-7% Total Income (excluding recoveries) CAGR ...,Press Release,59,5-7 % total income ( excluding recovery ) cagr...,"{'label': 'Neutral', 'score': 0.9993962049484253}",0.999396,"{'label': 'None', 'score': 0.9837270975112915}",0.983727,"{'label': 'Not FLS', 'score': 0.8151842355728149}",0.815184,0,4,2,2021,3
13,LSEG.L,2021-03-05 09:00:00+00:00,Revenue,How much should we think is netted off for the...,Transcript,29,"much think netted synergy 2021 , guess , full ...","{'label': 'Neutral', 'score': 0.9997803568840027}",0.999780,"{'label': 'None', 'score': 0.9469303488731384}",0.946930,"{'label': 'Not FLS', 'score': 0.8872433304786682}",0.887243,0,4,2,2021,3
14,LSEG.L,2021-07-02 12:00:01+00:00,Revenue,CAGR) in the total market for Data & Analytics...,Press Release,158,"cagr ) total market data & analytics . event ,...","{'label': 'Positive', 'score': 0.9999123811721...",0.999912,"{'label': 'None', 'score': 0.8084553480148315}",0.808455,"{'label': 'Specific FLS', 'score': 0.462712168...",0.462712,1,4,1,2021,7
15,LSEG.L,2022-08-05 09:00:00+00:00,EBIT,And we are firmly on track to deliver our 5% t...,Transcript,26,"firmly track deliver 5 % 7 % revenue target , ...","{'label': 'Positive', 'score': 0.9999604225158...",0.999960,"{'label': 'None', 'score': 0.9684473872184753}",0.968447,"{'label': 'Not FLS', 'score': 0.7201486229896545}",0.720149,1,4,2,2022,8
16,LSEG.L,2021-03-05 07:00:03+00:00,EBITDA,"remain unchanged, except to reflect the divest...",Press Release,31,"remain unchanged , except reflect divestment b...","{'label': 'Neutral', 'score': 0.9999809265136719}",0.999981,"{'label': 'None', 'score': 0.976659893989563}",0.976660,"{'label': 'Not FLS', 'score': 0.8567500710487366}",0.856750,0,4,2,2021,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3806,MKTX.O,2018-06-12 10:30:00+00:00,Revenue,"And just to size it up, for example, on -- if ...",Transcript,156,"size , example , -- 're trading u.s. high-grad...","{'label': 'Positive', 'score': 1.0}",1.000000,"{'label': 'None', 'score': 0.9191204905509949}",0.919120,"{'label': 'Not FLS', 'score': 0.9480698108673096}",0.948070,1,4,2,2018,6
3807,MKTX.O,2018-07-25 10:00:00+00:00,Revenue,"But I'm guessing hiring kind of continues, how...",Transcript,102,"'m guessing hiring kind continues , think traj...","{'label': 'Negative', 'score': 0.9966436624526...",0.996644,"{'label': 'None', 'score': 0.9937105178833008}",0.993711,"{'label': 'Not FLS', 'score': 0.7224313020706177}",0.722431,2,4,2,2018,7
3808,MKTX.O,2017-10-25 10:00:00+00:00,Revenue,"And given the client adoption of Trax, the rep...",Transcript,105,"given client adoption trax , reporting mechani...","{'label': 'Positive', 'score': 1.0}",1.000000,"{'label': 'None', 'score': 0.9782870411872864}",0.978287,"{'label': 'Specific FLS', 'score': 0.807399690...",0.807400,1,4,1,2017,10
3809,MKTX.O,2018-06-12 10:30:00+00:00,Revenue,"And just to size it up, for example, on -- if ...",Transcript,156,"size , example , -- 're trading u.s. high-grad...","{'label': 'Positive', 'score': 1.0}",1.000000,"{'label': 'None', 'score': 0.9191204905509949}",0.919120,"{'label': 'Not FLS', 'score': 0.9480698108673096}",0.948070,1,4,2,2018,6


In [None]:
# Save the classified frame as 'lseg_text_classified.csv'. The path is relative, so the file
# is written to the kernel's current working directory; no `index` argument is passed, so
# pandas' default index handling applies.
df_guidance_cleaned.to_csv('lseg_text_classified.csv')