## Import packages

In [60]:
import pandas as pd
import numpy as np
import plotly.express as px
from glob import glob
import os 
import sys
import re
import plotly.graph_objs as go
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

## Data Pre-processing

In [None]:
# Load Financial News Dataset

path = r'C:\Users\hazourah\Downloads\20061020_20131126_bloomberg_news'
news_list = []
news_titles = []
news_dates = []
for sub_dir in os.listdir(path):
    for tmp_news in os.listdir(path+'/'+sub_dir):
        f = open(path + '/' + sub_dir + '/' + tmp_news, "r")
        news_list.append(f.readlines())
        news_titles.append(tmp_news)
        news_dates.append(sub_dir)

In [62]:
df = pd.DataFrame()

In [63]:
df['News'] = news_list
df['Titles'] = news_titles
df['Date'] = news_dates

In [64]:
df.head(5)

Unnamed: 0,News,Titles,Date
0,"[-- China Stocks Will Rise 30% in 2007, ABN Am...",china-stocks-will-rise-30-in-2007-abn-amro-pre...,2006-11-09
1,[-- FirstRand Says Law May Pare Fees by 788 Mi...,firstrand-says-law-may-pare-fees-by-788-millio...,2006-11-09
2,[-- Vale Third-Quarter Net Profit Rises 47% on...,vale-third-quarter-net-profit-rises-47-on-ore-...,2006-11-09
3,[-- Bank of Communications May Sell $7 Bln Sha...,bank-of-communications-may-sell-7-bln-shares-b...,2006-11-17
4,[-- Ameristar Casino Shares Rise on Buyout Spe...,ameristar-casino-shares-rise-on-buyout-specula...,2006-11-20


In [65]:
def clean(text):
    """
        Clean the text in input
    """
    result = " ".join(text[7:])
    result = re.sub('\n', ' ', result)
    result = re.sub('\t', ' ', result)
    result = re.sub('`', ' ', result)
    result = re.sub('"', ' ', result)
    result = re.sub(" \'s", "'s", result)
    result = re.sub('( ){2,}', ' ', result)
    result = result.split('To contact the reporter')[0]
    if result[0] == ' ':
        result = result[1:]
    if result[-1] == ' ':
        result = result[:-1]
    return result

# Clean all news
df['Cleaned_News'] = df['News'].apply(lambda x: clean(x))

In [66]:
df = df.sort_values(by = 'Date')

In [67]:
# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Expanding Contractions in the reviews
df['Cleaned_News'] = df['Cleaned_News'].apply(lambda x:expand_contractions(x))

## Exploratory Data Analysis - EDA

In [68]:
print('Number of News:', len(df))

Number of News: 151


In [69]:
tmp = df.groupby('Date').count().reset_index()
px.line(tmp, x="Date",y="News")

In [70]:
xwords = [len(x.split()) for x in df.Cleaned_News]
trace1 = go.Histogram(x=xwords, opacity=0.65, name="Word Count", marker=dict(color='rgba(171, 50, 96, 0.6)'))
data = [trace1]
layout = go.Layout(barmode='overlay',
                   title='Word Count of Headlines',
                   xaxis=dict(title='Word Count'),
                   yaxis=dict( title='Number of News'))
fig = go.Figure(data=data, layout=layout)
fig.show()

In [71]:
xchars = [len(x) for x in df.Cleaned_News]
trace1 = go.Histogram(x=xchars, opacity=0.65, name="Word Count", marker=dict(color='rgba(12, 50, 196, 0.6)'))
data = [trace1]
layout = go.Layout(barmode='overlay',
                   title='Characters Count of News',
                   xaxis=dict(title='Char Count'),
                   yaxis=dict( title='Number of News'))
fig = go.Figure(data=data, layout=layout)
fig.show()

In [72]:
df = df[['Date', 'Cleaned_News']]

### Bag of Words (BoW)

### Term frequencies

In [73]:
stop_words =  set(stopwords.words('english'))
cv = CountVectorizer(stop_words = stop_words)
counts = cv.fit_transform(df.Cleaned_News)

tmp_cv = pd.DataFrame(counts.sum(axis=0),columns=cv.get_feature_names()).T.sort_values(0,ascending=False).head(30)
fig = go.Figure()
fig.add_trace(go.Bar(
    x=tmp_cv.index,
    y=tmp_cv[0],
    name='Term frequencies',
    marker_color='indianred'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(xaxis_tickangle=-45, title = 'Most common words')
fig.show()

### TF-IDF

In [74]:
cv = TfidfVectorizer(stop_words = stop_words, max_df=0.8)
counts = cv.fit_transform(df.Cleaned_News)

tmp_cv = pd.DataFrame(counts.sum(axis=0),columns=cv.get_feature_names()).T.sort_values(0,ascending=False).head(30)
fig = go.Figure()
fig.add_trace(go.Bar(
    x=tmp_cv.index,
    y=tmp_cv[0],
    name='Tf-idf',
    marker_color='indianred'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(xaxis_tickangle=-45, title='Most important words')
fig.show()

## Fine-tune BERT language model on Financial News Corpus

In [75]:
df.rename(columns={"Cleaned_News":"text"}, inplace=True)
df = df[['text']]

In [76]:
df.to_csv(r'train.csv', index=False)

In [77]:
import transformers
from transformers import AutoTokenizer

print(transformers.__version__)

4.3.2


In [78]:
#!pip install datasets

In [79]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [80]:
model_checkpoint = "yiyanghkust/finbert-pretrain"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_len = 512, do_lower_case=False)

In [81]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at yiyanghkust/finbert-pretrain were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [109]:
def get_tokenized_dataset():
    tokenized_datasets = load_dataset('csv', data_files={'train':  r'./train.csv'})
  
    def tokenize_function(examples):
        # Remove empty lines
        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_special_tokens_mask=True,
        )
    return tokenized_datasets.with_transform(tokenize_function)

In [110]:
def get_data_collator():
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [111]:
model.to('cpu')

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [112]:
tokenized_datasets = get_tokenized_dataset()

Using custom data configuration default-faefd385ea8f6527
Reusing dataset csv (C:\Users\hazourah\.cache\huggingface\datasets\csv\default-faefd385ea8f6527\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [113]:
data_collator = get_data_collator()

In [115]:
args = TrainingArguments(output_dir="./content/TPUCheckpoints", do_train=True, per_device_train_batch_size=32, weight_decay=0.01, 
                    num_train_epochs=1, save_steps=5,
                    disable_tqdm=False, remove_unused_columns=False)

In [None]:
trainer = Trainer(
      model=model,
      args=args,
      train_dataset=tokenized_datasets["train"],
      tokenizer=tokenizer,
      data_collator=data_collator,
)
trainer.train()

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


In [98]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./content/TPUCheckpoints/checkpoint-5/",
    tokenizer="./content/TPUCheckpoints/checkpoint-5/"
)

Some weights of BertModel were not initialized from the model checkpoint at ./content/TPUCheckpoints/checkpoint-5/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [105]:
fill_mask.tokenizer.mask_token

'[MASK]'

In [108]:
fill_mask(f"The goal of life is {fill_mask.tokenizer.mask_token}.")

[{'sequence': 'goal of life is met.',
  'score': 0.19648458063602448,
  'token': 2375,
  'token_str': 'met'},
 {'sequence': 'goal of life is zero.',
  'score': 0.09745624661445618,
  'token': 4290,
  'token_str': 'zero'},
 {'sequence': 'goal of life is straightforward.',
  'score': 0.044013019651174545,
  'token': 25995,
  'token_str': 'straightforward'},
 {'sequence': 'goal of life is unchanged.',
  'score': 0.03409665450453758,
  'token': 3152,
  'token_str': 'unchanged'},
 {'sequence': 'goal of life is simple.',
  'score': 0.025178508833050728,
  'token': 4665,
  'token_str': 'simple'}]