<a href="https://colab.research.google.com/github/ThatCodeCodingGuy/Financial-Sentiment-Analysis-with-Machine-Learning-LSTM-and-BERT-Transformer/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing the "simpletransformers" Package**

In [None]:
!pip install simpletransformers



# **Importing Necessary Modules**

In [None]:
import re
import string

import nltk
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM.
# NOTE(review): the dataset cell reads '/content/finance.csv', which is not
# under this mount point - confirm whether the mount is still needed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# **Looking at the Dataset**

In [None]:
# Path of the raw CSV inside the Colab VM.
csv_path = '/content/finance.csv'

# Load the dataset and preview the first 7 rows.
df = pd.read_csv(csv_path)
df.head(7)

Unnamed: 0.1,Unnamed: 0,Sentence,Sentiment
0,0,The GeoSolutions technology will leverage Bene...,positive
1,1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,2,"For the last quarter of 2010 , Componenta 's n...",positive
3,3,According to the Finnish-Russian Chamber of Co...,neutral
4,4,The Swedish buyout firm has sold its remaining...,neutral
5,5,$SPY wouldn't be surprised to see a green close,positive
6,6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative


In [None]:
# Drop the unnecessary "Unnamed: 0" column (in the preview above it appears to
# just repeat the row number). `errors="ignore"` keeps the cell safe to re-run:
# the original in-place drop raised KeyError once the column was already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Shorter column names used by the rest of the notebook:
# the sentence text becomes "data", the sentiment label becomes "target".
column_names = {"Sentence": "data", "Sentiment": "target"}
df.rename(columns=column_names, inplace=True)

In [None]:
# Integer ids for the three sentiment classes.
label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}

# Values that are not keys of the mapping (e.g. ids produced by a previous run
# of this cell) pass through unchanged, so re-running the cell does not turn
# the whole column into NaN the way a plain `.map(dict)` would.
df['target'] = df['target'].map(lambda label: label_to_id.get(label, label))

# Fail loudly on anything that is not one of the three known classes
# (including missing values) instead of silently training on bad labels.
unexpected = set(df['target'].unique()) - set(label_to_id.values())
if unexpected:
  raise ValueError("Unexpected values in 'target': %r" % sorted(unexpected, key=str))

df['target'] = df['target'].astype(int)

# **Data Cleaning**

In [None]:
def clean_text(text):
  '''Normalise a raw sentence/tweet into lowercase plain words.

  Steps, in order: lowercase; drop text inside [...] and (...); drop every
  token that contains a digit; drop double quotes and the HTML entity
  `&amp;`; drop @usernames and #hashtags; drop the standalone retweet marker
  `rt`; strip all punctuation; drop the `httptco` residue that short links
  leave behind once punctuation is gone; collapse whitespace and trim.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string (possibly empty).
  '''
  # make text lowercase
  text = text.lower()
  # removing text within brackets
  text = re.sub(r'\[.*?\]', '', text)
  # removing text within parentheses
  text = re.sub(r'\(.*?\)', '', text)
  # removing any token that contains a digit
  text = re.sub(r'\w*\d\w*', '', text)
  # removing any quotes
  text = re.sub(r'"+', '', text)
  # removing &amp; (must happen before punctuation is stripped, otherwise
  # a stray "amp" would be left behind)
  text = re.sub(r'&amp;', '', text)
  # removing any usernames
  text = re.sub(r'@\S+', '', text)
  # removing any hashtags
  text = re.sub(r'#\S+', '', text)
  # remove `rt` for retweet - only as a whole word, so that words such as
  # "report" or "quarter" are left intact
  text = re.sub(r'\brt\b', '', text)
  # string.punctuation is a string of all punctuation marks
  # so this gets rid of all punctuation
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  # getting rid of `httptco`
  text = re.sub(r'httptco', '', text)
  # collapse whitespace (newlines included) last, so the gaps left by the
  # removals above do not survive, and trim both ends
  text = re.sub(r'\s+', ' ', text).strip()

  return text

# The former `round = lambda x: clean_text(x)` alias was removed: it shadowed
# the builtin `round` for every later cell. Use `clean_text` directly, e.g.
# `df['data'].apply(clean_text)`.
# NOTE(review): none of the visible cells applies `clean_text` to the data -
# confirm whether the cleaning step is meant to run before training.

In [None]:
# Label-encode the values of the "target" column.
# NOTE(review): "target" was already mapped to the integers 0/1/2 in an
# earlier cell, so this second encoding looks redundant - confirm it is needed.
encoder = LabelEncoder()
encoder.fit(df['target'])
df['target'] = encoder.transform(df['target'])

# **Train-Test Split**

In [None]:
# Hold out 20% of the rows for evaluation.
# A fixed `random_state` makes the split (and therefore the reported metrics)
# reproducible across runs; `stratify` keeps the class proportions of "target"
# the same in both parts.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['target'],
)

# **Preparing the Model**

In [None]:
# Training hyper-parameters for the simpletransformers classifier.
model_args = ClassificationArgs()
model_args.num_train_epochs = 4
model_args.max_seq_length = 128
model_args.train_batch_size = 32
model_args.learning_rate = 2e-5
model_args.warmup_ratio = 0.2
model_args.local_rank = -1
# Seed the run so that results can be reproduced.
model_args.manual_seed = 42
# Allow the training cell to be re-run: without this, simpletransformers
# refuses to train when its output directory already contains files.
model_args.overwrite_output_dir = True

In [None]:
# BERT (base, uncased) with a 3-way classification head, one output per
# sentiment class in "target".
model = ClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=3,
    args=model_args,
    # Use the GPU when one is present; a hard-coded True makes this cell fail
    # on a CPU-only runtime.
    use_cuda=torch.cuda.is_available()
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# simpletransformers looks for columns named "text" and "labels"; without them
# it warns and falls back to column positions. Pass a renamed copy so the right
# columns are selected by name, leaving train_df itself untouched.
model.train_model(train_df.rename(columns={"data": "text", "target": "labels"}))

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/4673 [00:00<?, ?it/s]



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/147 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/147 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/147 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/147 [00:00<?, ?it/s]

(588, 0.507760940765848)

# **Results**

In [None]:
# Evaluate on the held-out rows. The frame is passed with the "text"/"labels"
# headers simpletransformers looks for, so columns are selected by name rather
# than by position. `acc=accuracy_score` adds an "acc" entry to the result dict.
result, model_outputs, wrong_predictions = model.eval_model(
    test_df.rename(columns={"data": "text", "target": "labels"}),
    acc=accuracy_score,
)
result

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1169 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/147 [00:00<?, ?it/s]

{'mcc': 0.6153423575507686,
 'acc': 0.7741659538066724,
 'eval_loss': 0.485573775103303}