<a href="https://colab.research.google.com/github/ThatCodeCodingGuy/Financial-Sentiment-Analysis-with-Machine-Learning-LSTM-and-BERT-Transformer/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Necessary Modules**

In [3]:
import numpy as np
import pandas as pd
import re
import string


import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import GlobalMaxPool1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model

In [4]:
from google.colab import drive #connecting to Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Looking at the dataset**

In [5]:
df = pd.read_csv('/content/finance.csv')
df.head(7)

Unnamed: 0.1,Unnamed: 0,Sentence,Sentiment
0,0,The GeoSolutions technology will leverage Bene...,positive
1,1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,2,"For the last quarter of 2010 , Componenta 's n...",positive
3,3,According to the Finnish-Russian Chamber of Co...,neutral
4,4,The Swedish buyout firm has sold its remaining...,neutral
5,5,$SPY wouldn't be surprised to see a green close,positive
6,6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative


In [6]:
df.drop("Unnamed: 0", axis=1, inplace=True) #dropping the unnecessary column

In [7]:
df.rename(columns={"Sentiment": "target", "Sentence": "data"}, inplace=True) #changing the names of the columns

In [8]:
df['target'] = df['target'].map({'negative': 0, 'neutral': 1, 'positive': 2}) #changing the value of the "target" column to integers

# **Data Cleaning**

In [9]:
def clean_text(text):
  '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
  # make text lowercase
  text = text.lower()
  # removing text within brackets
  text = re.sub('\[.*?\]', '', text)
  # removing text within parentheses
  text = re.sub('\(.*?\)', '', text)
  # removing numbers
  text = re.sub('\w*\d\w*', '', text)
  # if there's more than 1 whitespace, then make it just 1
  text = re.sub('\s+', ' ', text)
  # if there's a new line, then make it a whitespace
  text = re.sub('\n', ' ', text)
  # removing any quotes
  text = re.sub('\"+', '', text)
  # removing &amp;
  text = re.sub('(\&amp\;)', '', text)
  # removing any usernames
  text = re.sub('(@[^\s]+)', '', text)
  # removing any hashtags
  text = re.sub('(#[^\s]+)', '', text)
  # remove `rt` for retweet
  text = re.sub('(rt)', '', text)
  # string.punctuation is a string of all punctuation marks
  # so this gets rid of all punctuation
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  # getting rid of `httptco`
  text = re.sub('(httptco)', '', text)

  return text

round = lambda x: clean_text(x)

In [10]:
df['data'] = df['data'].apply(clean_text)
df.head(7)

Unnamed: 0,data,target
0,the geosolutions technology will leverage bene...,2
1,esi on lows down to bk a real possibility,0
2,for the last quaer of componenta s net sales ...,2
3,according to the finnishrussian chamber of com...,1
4,the swedish buyout firm has sold its remaining...,1
5,spy wouldnt be surprised to see a green close,2
6,shells billion bg deal meets shareholder skep...,0


# **Train-Test Split**

In [11]:
X = df['data']
y = df['target']

# Split data into train , test
X_train , X_val , y_train , y_val = train_test_split(X,y,test_size=0.2,random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

In [12]:
print('Shape of X_train : ' , X_train.shape)
print('Shape of y_train : ', y_train.shape)
print('Shape of X_test : ', X_test.shape)
print('Shape of y_test : ',y_test.shape)
print('Shape of X_val : ' , X_train.shape)
print('Shape of y_val : ', y_train.shape)

Shape of X_train :  (3504,)
Shape of y_train :  (3504,)
Shape of X_test :  (1169,)
Shape of y_test :  (1169,)
Shape of X_val :  (3504,)
Shape of y_val :  (3504,)


In [13]:
# Convert y to categorical for our loss function in the model
y_train = to_categorical(y_train, 3)
y_test  = to_categorical(y_test, 3)
y_val = to_categorical(y_val, 3)

# **Tokenization & Padding**

In [14]:
# Tokenize Texts
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_val = tokenizer.texts_to_sequences(X_val)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)

In [15]:
# convert tokenized texts into same padding size
maxlen = 200
embed_size = 128 
X_train_final = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_val_final = pad_sequences(list_tokenized_test, maxlen=maxlen)
X_test_final = pad_sequences(list_tokenized_test, maxlen=maxlen)

# **Model Creating**

In [20]:
inp = Input(shape=(maxlen, )) 
x   =  Embedding(max_features, embed_size)(inp)
x   =  LSTM(60, return_sequences=True, name='lstm_layer')(x)
x   =  GlobalMaxPool1D()(x)
x   =  Dense(64, activation="relu")(x)
x   =  Dropout(0.2)(x)
x   =  Dense(3, activation="softmax")(x)

In [21]:
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [22]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 200, 128)          2560000   
                                                                 
 lstm_layer (LSTM)           (None, 200, 60)           45360     
                                                                 
 global_max_pooling1d_2 (Glo  (None, 60)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 64)                3904      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                           

In [23]:
model.fit(X_train_final,y_train, epochs=2,validation_data=(X_val_final,y_val))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3d6109ca10>

# **Results**

In [24]:
model.evaluate(X_test_final,y_test)



[0.7381033897399902, 0.6988879442214966]

In [25]:
lstm_pred = np.round(model.predict(X_test_final))

In [26]:
from sklearn.metrics import classification_report

In [27]:
# Precision , Recall , F1-score
cr = classification_report(y_test,lstm_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       168
           1       0.72      0.87      0.79       616
           2       0.82      0.51      0.63       385

   micro avg       0.75      0.63      0.68      1169
   macro avg       0.51      0.46      0.47      1169
weighted avg       0.65      0.63      0.62      1169
 samples avg       0.63      0.63      0.63      1169



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
