# **Stock market news feed semantic analysis** *(Combined deep learning models)*

One news headline per data sample (`ROWS = 1`).
Combines a pretrained LSTM model with a pretrained BERT model.

In [None]:
# Mount Google Drive so the dataset and the model checkpoints stored under
# MyDrive can be read and written by the cells below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the dataset from the mounted Drive into the working directory; the
# loading cell reads it by its bare file name
!cp "/content/drive/MyDrive/Combined_News_DJIA.csv" "Combined_News_DJIA.csv"

In [None]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |█████                           | 10kB 20.5MB/s eta 0:00:01[K     |██████████▏                     | 20kB 10.2MB/s eta 0:00:01[K     |███████████████▎                | 30kB 6.3MB/s eta 0:00:01[K     |████████████████████▍           | 40kB 3.1MB/s eta 0:00:01[K     |█████████████████████████▌      | 51kB 3.8MB/s eta 0:00:01[K     |██████████████████████████████▋ | 61kB 4.3MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.3MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 6.4MB/s 
Installing collected packages: sentencepiece, torchtext
  Foun

In [None]:
# Import the libraries 
import pandas as pd
import numpy as np
import pandas_datareader as web
import matplotlib.pyplot as plt
import string
import time
import torch.optim as optim
import torch.nn as nn
import torch
from torchtext import data
from sklearn.utils import shuffle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.tokenize import word_tokenize  
from wordcloud import WordCloud
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
import sklearn.metrics as metrics
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Number of consecutive headline columns merged into one string
ROWS = 1

# How many times the data frame is shuffled before it is split
SHUFFLE_CYCLE = 500

# Seed for Python's `random` module and for NumPy
NP_SEED = 1234

# Torch seed (CPU and, when available, CUDA)
TORCH_SEED = 1234

# Share of all samples used for training + validation; the rest is the test set
TRAIN_SPLIT = 0.8

# Share of the training + validation part that is held out for validation
VALIDATION_SPLIT = 0.1

# Position of the label column in the list of column names
LABEL_COLUMN = 0

In [None]:
# Seed every random number generator the notebook uses, for reproducibility
random.seed(NP_SEED)
np.random.seed(NP_SEED)
# Dedicated generator that is handed to sklearn's `shuffle` further down
rs = RandomState(MT19937(SeedSequence(NP_SEED)))
torch.manual_seed(TORCH_SEED)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(TORCH_SEED)

# Ask cuDNN for deterministic kernels and switch its auto-tuner off: the
# auto-tuner may select different algorithms from one run to the next
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Load the dataset; the "Date" column becomes the index
df_combined = pd.read_csv('Combined_News_DJIA.csv', index_col = "Date")

# Find the cells with NaN and after the rows for them (kept for inspection)
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

# Replace the missing headlines with a single blank so the string handling
# below never meets a NaN
df_combined = df_combined.replace(np.nan, " ")

# Check the process: no row may contain a NaN any more
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

# `==` rather than `is`: identity comparison with a literal only works by
# accident of interning and raises a SyntaxWarning on current Python versions
assert len(rows_with_NaN) == 0

# Get column names (the label column first, then the headline columns)
combined_column_names = list(df_combined.columns)

# 2D array creation for the news based on macros: one inner list per merged
# news column, one slot per day.
# NOTE: despite its name, COLUMNS holds the number of rows (days) of the frame.
COLUMNS = len(df_combined)
news_sum = [[0 for i in range(COLUMNS)] for j in range(int((len(combined_column_names) - 1) / ROWS))]

# Show the column names
print("Column names of the dataset:") 
print(combined_column_names)

# Merge the news: every group of ROWS neighbouring headline columns becomes one string
for row in range(len(df_combined)):
  for column in range(int((len(combined_column_names) - 1) / ROWS)):
    temp = ""
    for word in range(ROWS):
      # The frame is indexed by date, so the day is addressed by position with
      # .iloc; plain [] with an integer relies on a deprecated positional fallback
      news = df_combined[combined_column_names[(column * ROWS) + (word + 1)]].iloc[row]
      # Remove the b character at the begining of the string.
      # NOTE(review): this also strips the first letter of a headline that really
      # starts with "b"; left as is so the text (and with it the vocabulary the
      # loaded checkpoints presumably rely on) does not change - confirm.
      if news.startswith("b"):
        news = " " + news[1:]
      temp = temp + news
    news_sum[column][row] = temp

# Show the first day second package of the news
print("\nThe first day second package of the news:")
print(news_sum[1][0])

# Drop the old columns (everything except the first, i.e. the label column)
df_combined = df_combined.drop(columns = combined_column_names[1:])

# Create the new columns with the merged news
for column in range(int((len(combined_column_names) - 1) / ROWS)):
  colum_name = "News_" + str(column + 1)
  df_combined[colum_name] = news_sum[column]

news_sum = []
label_sum = []

# Get the column names
combined_column_names = list(df_combined.columns)

# Write out the column names 
print(combined_column_names)
print("\n")

# Connect the merged news with the labels: every headline of a day is paired
# with that day's label. Days are addressed by position (.iloc) because the
# frame is indexed by date.
label_name = combined_column_names[LABEL_COLUMN]
for day in range(len(df_combined)):
  day_label = df_combined[label_name].iloc[day]
  for news_name in combined_column_names[1:]:
    news_sum.append(df_combined[news_name].iloc[day])
    label_sum.append(day_label)

# Create the new DataFrame (default 0..n-1 index, one row per headline)
df_sum_news_labels = pd.DataFrame(data = label_sum, index = None, columns = ["Label"])
df_sum_news_labels["News"] = news_sum

# Removing punctuations: every punctuation character is turned into a blank
news_sum = [
  "".join(" " if char in string.punctuation else char for char in headline)
  for headline in news_sum
]

# Remove numbers: every digit character is turned into a blank
temp_news = [
  "".join(" " if char.isdigit() else char for char in headline)
  for headline in news_sum
]

# Remove space: collapse runs of whitespace and trim both ends
temp_news = [" ".join(headline.split()) for headline in temp_news]

# Converting headlines to lower case
temp_news = [headline.lower() for headline in temp_news]

# Update the data frame
df_sum_news_labels["News"] = temp_news

# Load the stop words
stop_words = set(stopwords.words('english'))

news_sum = df_sum_news_labels["News"]

# Remove stop words: tokenize each headline, keep the tokens that are not stop
# words and glue them back together with single blanks
filtered_sentence = []
for headline in news_sum:
  kept_tokens = [token for token in word_tokenize(headline) if token not in stop_words]
  filtered_sentence.append(" ".join(" ".join(kept_tokens).split()))

# Update the data frame
df_sum_news_labels["News"] = filtered_sentence

# Positions of the headlines that are empty after the cleaning. The frame still
# has the index it was given when the News column was last assigned, so when
# that is the default 0..n-1 index the positions double as index labels.
# `==` rather than `is`: identity comparison of strings depends on interning.
news_sum = df_sum_news_labels["News"]
null_indexes = [position for position, line in enumerate(news_sum) if line == ""]

print(null_indexes)

# Drop the empty headlines in one call
df_sum_news_labels = df_sum_news_labels.drop(null_indexes)

# Check the process: no empty headline may be left
news_sum = df_sum_news_labels["News"]
null_indexes = [position for position, line in enumerate(news_sum) if line == ""]

assert len(null_indexes) == 0

# Do the shuffle: SHUFFLE_CYCLE consecutive passes that all draw from the seeded generator
for _ in range(SHUFFLE_CYCLE):
  df_sum_news_labels = shuffle(df_sum_news_labels, random_state = rs)

# Reset the index
df_sum_news_labels = df_sum_news_labels.reset_index(drop = True)

# Create datasets
news_string = (df_sum_news_labels['News'].values).astype('U')

INPUT_SIZE = len(df_sum_news_labels)
# The first TRAIN_SPLIT share of the rows is train + validation, the rest is
# test; VALIDATION_SPLIT of that first part is held out for validation.
# With 0.8 / 0.1 this gives roughly 72% train, 8% validation and 20% test.
TRAIN_SIZE = int(TRAIN_SPLIT * INPUT_SIZE)
VALID_SIZE = int(VALIDATION_SPLIT * TRAIN_SIZE)
train_end = TRAIN_SIZE - VALID_SIZE

# Create the train, validation and test data sets as consecutive row ranges
train_dataset = df_sum_news_labels.iloc[:train_end]
valid_dataset = df_sum_news_labels.iloc[train_end:TRAIN_SIZE]
test_dataset = df_sum_news_labels.iloc[TRAIN_SIZE:]

# Save them without the indexes
for split_frame, split_path in ((train_dataset, 'drive/MyDrive/train.tsv'),
                                (valid_dataset, 'drive/MyDrive/valid.tsv'),
                                (test_dataset, 'drive/MyDrive/test.tsv')):
  split_frame.to_csv(split_path, sep = '\t', index = False)

Column names of the dataset:
['Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25']

The first day second package of the news:
 'BREAKING: Musharraf to be impeached.'
['Label', 'News_1', 'News_2', 'News_3', 'News_4', 'News_5', 'News_6', 'News_7', 'News_8', 'News_9', 'News_10', 'News_11', 'News_12', 'News_13', 'News_14', 'News_15', 'News_16', 'News_17', 'News_18', 'News_19', 'News_20', 'News_21', 'News_22', 'News_23', 'News_24', 'News_25']


[6947, 6948, 6949, 8723, 8724, 13134, 17048, 17049]


In [None]:
# Smallest and largest n of the generated n-grams (both inclusive)
N_VALUE_RANGE_START = 1
N_VALUE_RANGE_END = 3
# Vocabulary size
MAX_VOCAB_SIZE = 7500

def generate_ngrams(input, n_start = None, n_end = None):
    """Return the distinct n-grams of a token list as space-joined strings.

    For every n from `n_start` to `n_end` (inclusive) the n-grams are emitted in
    order of first occurrence, each one only once; the groups follow each other
    by ascending n.

    The previous implementation collected the n-grams in a `set`, whose
    iteration order depends on string hashing and therefore differs between
    kernel sessions. The tokens produced are the same, but their order is now
    reproducible.

    Args:
        input: sequence of tokens (strings).
        n_start: smallest n; defaults to N_VALUE_RANGE_START.
        n_end: largest n; defaults to N_VALUE_RANGE_END.

    Returns:
        List of n-gram strings; empty when `input` is empty.
    """
    # Resolved at call time so later changes of the constants are picked up
    if n_start is None:
        n_start = N_VALUE_RANGE_START
    if n_end is None:
        n_end = N_VALUE_RANGE_END

    output = []

    for n_value in range(n_start, n_end + 1):
        # Sliding windows of length n_value: zip the list with its shifted copies
        windows = zip(*[input[i:] for i in range(n_value)])
        # dict.fromkeys removes duplicates but keeps the first-seen order
        output.extend(' '.join(window) for window in dict.fromkeys(windows))

    return output

# Text field: the tokens of a headline are expanded to n-grams by generate_ngrams
NEWS = data.Field(preprocessing = generate_ngrams)

# Label field, delivered as float tensors
LABELS = data.LabelField(dtype = torch.float)

# Column order of the TSV files: label first, then the news text
fields = [('labels', LABELS), ('news', NEWS)]

train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = "drive/MyDrive",
                                        train = "train.tsv",
                                        validation = "valid.tsv",
                                        test = "test.tsv",
                                        format = "tsv",
                                        fields = fields,
                                        skip_header = True
)

# Both vocabularies are built from the training split only
NEWS.build_vocab(train_data,
                  max_size = MAX_VOCAB_SIZE)

LABELS.build_vocab(train_data)

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Show it
print(device)

#Set the iterators for the data
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    # Bucket by sequence length so that a batch needs as little padding as
    # possible; sorting by the token list itself would order the examples
    # lexicographically instead
    sort_key = lambda x: len(x.news),
    batch_size = 32,
    sort_within_batch = True,
    device = device)

cuda


In [None]:
# Print the first batch of every split as a quick sanity check
for split_title, split_iterator in (('Train:', train_iterator),
                                    ('Valid:', valid_iterator),
                                    ('Test:', test_iterator)):
    print(split_title)
    for batch in split_iterator:
        print(batch)
        break

Train:

[torchtext.data.batch.Batch of size 32]
	[.labels]:[torch.cuda.FloatTensor of size 32 (GPU 0)]
	[.news]:[torch.cuda.LongTensor of size 86x32 (GPU 0)]
Valid:

[torchtext.data.batch.Batch of size 32]
	[.labels]:[torch.cuda.FloatTensor of size 32 (GPU 0)]
	[.news]:[torch.cuda.LongTensor of size 87x32 (GPU 0)]
Test:

[torchtext.data.batch.Batch of size 32]
	[.labels]:[torch.cuda.FloatTensor of size 32 (GPU 0)]
	[.news]:[torch.cuda.LongTensor of size 72x32 (GPU 0)]


In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 4.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 27.2MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 29.8MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
# load pretrained bert
from transformers import BertTokenizer, BertModel

# Fetch the pretrained 'bert-base-uncased' weights; this object is later
# passed to BERT_Model as its encoder
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
# Hyperparameters of the GRU head that BERT_Model puts on top of BERT
BERT_HIDDEN_DIM = 128  # GRU hidden size
BERT_OUTPUT_DIM = 1  # size of the linear output layer
BERT_N_LAYERS = 3  # number of stacked GRU layers
BERT_BIDIRECTIONAL = True  # run the GRU in both directions
BERT_DROPOUT = 0.25  # dropout probability

class BERT_Model(nn.Module):
    """BERT encoder (used without gradient tracking) followed by a GRU and a linear layer.

    Args:
        bert: encoder module; it must expose `config.to_dict()['hidden_size']`
            and return its per-token vectors as the first element of its output.
        hidden_dim: hidden size of the GRU.
        output_dim: size of the linear output layer.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability (between GRU layers and before the output layer).
    """
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.bert = bert

        # Width of the vectors the encoder emits; it is the input size of the GRU
        encoder_width = bert.config.to_dict()['hidden_size']
        # Dropout between GRU layers is only applied when there are at least two layers
        inter_layer_dropout = dropout if n_layers >= 2 else 0
        # A bidirectional GRU contributes one final state per direction
        head_inputs = hidden_dim * 2 if bidirectional else hidden_dim

        self.rnn = nn.GRU(encoder_width,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = inter_layer_dropout)

        self.out = nn.Linear(head_inputs, output_dim) # linear out

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return one output row per sample for a batch of token indices.

        `text` has its first two axes swapped before it is given to the
        encoder, because the GRU is configured with `batch_first = True`.
        """
        token_ids = text.transpose(0, 1)

        # do not teach the bert layer parameters
        with torch.no_grad():
            embedded = self.bert(token_ids)[0]

        # Only the final hidden states of the GRU are used
        final_states = self.rnn(embedded)[1]

        if self.rnn.bidirectional:
            # Last two entries: the two directions of the top GRU layer
            summary = torch.cat((final_states[-2], final_states[-1]), dim = 1)
        else:
            summary = final_states[-1]

        return self.out(self.dropout(summary))

# Hyperparameters of LSTM_Model
LSTM_EMBEDDING_DIM = 100  # size of one embedding vector
LSTM_HIDDEN_DIM = 64  # LSTM hidden size
LSTM_OUTPUT_DIM = 1  # size of the linear output layer
LSTM_N_LAYERS = 2  # number of stacked LSTM layers
LSTM_BIDIRECTIONAL = False  # single direction only
LSTM_DROPOUT = 0.1  # dropout probability
LSTM_INPUT_DIM = len(NEWS.vocab)  # number of entries in the news vocabulary
LSTM_PAD_IDX = NEWS.vocab.stoi[NEWS.pad_token]  # vocabulary index of the padding token

class LSTM_Model(nn.Module):
    """Embedding layer, (optionally bidirectional) LSTM and a linear output layer.

    With `bidirectional=False` the layers, parameter names and parameter shapes
    are unchanged. With `bidirectional=True` the final states of both
    directions are now concatenated (as BERT_Model does) instead of keeping
    only the backward direction, so the output layer is twice as wide.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of one embedding vector.
        hidden_dim: hidden size of the LSTM.
        output_dim: size of the linear output layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability.
        pad_idx: index of the padding token in the embedding table.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers, 
                          bidirectional=bidirectional, 
                          # inter-layer dropout needs at least two layers
                          dropout=0 if n_layers < 2 else dropout)

        # One final state per direction feeds the output layer
        self.fc_out = nn.Linear(int(hidden_dim) * (2 if bidirectional else 1), output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return one output row per sample for a batch of token indices.

        `text` is passed to the embedding and then to an LSTM created without
        `batch_first`, i.e. the LSTM treats the first axis as the sequence axis.
        """
        embedded = self.dropout(self.embedding(text))

        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded
        _, (hidden, _) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Last two entries: the two directions of the top LSTM layer
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            hidden = hidden[-1,:,:]

        return self.fc_out(self.dropout(hidden))

class Combined(nn.Module):
    """Feeds the outputs of two sub-models through a stack of linear layers.

    Both sub-models receive the same `text` tensor; their outputs are
    concatenated along dim 1 (two features in total) and passed through six
    sigmoid-activated linear layers and a final linear layer with one output.

    NOTE(review): the BERT branch therefore gets the same token indices as the
    LSTM branch - confirm these are the ids the BERT checkpoint was trained with.
    NOTE(review): the logged training run stays at a loss of about 0.69; the
    deep sigmoid stack may be part of the reason - to be investigated.

    Args:
        BERT_1: sub-model stored as `self.BERT`.
        LSTM_1: sub-model stored as `self.LSTM`.
    """
    def __init__(self, BERT_1, LSTM_1):
        super().__init__()

        # Attribute names are part of the checkpoint keys and of the freezing
        # logic that selects parameters by the 'BERT' / 'LSTM' prefix
        self.BERT = BERT_1
        self.LSTM = LSTM_1

        # 2 -> 4 -> 16 -> 128 -> 256 -> 64 -> 8 -> 1
        self.fc1 = nn.Linear(2, 4)
        self.fc2 = nn.Linear(4, 16)
        self.fc3 = nn.Linear(16, 128)
        self.fc4 = nn.Linear(128, 256)
        self.fc5 = nn.Linear(256, 64)
        self.fc6 = nn.Linear(64, 8)
        self.fc7 = nn.Linear(8, 1)

    def forward(self, text):
        """Return the output of the last linear layer for a batch of token indices."""
        features = torch.cat((self.BERT(text), self.LSTM(text)), dim=1)

        # Every hidden layer is followed by a sigmoid; the last layer is not
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6):
            features = torch.sigmoid(layer(features))

        return self.fc7(features)

# Create models and load state_dicts    
BERT = BERT_Model(bert,
                        BERT_HIDDEN_DIM,
                        BERT_OUTPUT_DIM,
                        BERT_N_LAYERS,
                        BERT_BIDIRECTIONAL,
                        BERT_DROPOUT)

LSTM = LSTM_Model(LSTM_INPUT_DIM, 
                        LSTM_EMBEDDING_DIM, 
                        LSTM_HIDDEN_DIM, 
                        LSTM_OUTPUT_DIM, 
                        LSTM_N_LAYERS, 
                        LSTM_BIDIRECTIONAL, 
                        LSTM_DROPOUT, 
                        LSTM_PAD_IDX)

# Load state dicts. The freshly built models are still on the CPU, so the
# checkpoints are mapped there as well; without map_location a checkpoint
# saved from a GPU cannot be loaded on a CPU-only runtime.
BERT.load_state_dict(torch.load('drive/MyDrive/bert_best_model.pt', map_location = torch.device('cpu')))
LSTM.load_state_dict(torch.load('drive/MyDrive/lstm_best-model.pt', map_location = torch.device('cpu')))

# Wrap both pretrained sub-models into the combined network
model = Combined(BERT, LSTM)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the parameters of `model` that require gradients."""
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

# count_parameters only counts parameters whose requires_grad flag is set
print(f'The model has {count_parameters(model):,} parameters')

The model has 2,161,127 parameters


In [None]:
# Freeze both pretrained sub-models: every parameter whose name starts with
# 'BERT' or 'LSTM' stops receiving gradients, the remaining ones stay trainable
frozen_prefixes = ('BERT', 'LSTM')

for name, param in model.named_parameters():
    if name.startswith(frozen_prefixes):
        param.requires_grad = False

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 52,269 trainable parameters


In [None]:
UNK_IDX = NEWS.vocab.stoi[NEWS.unk_token]

# Overwrite the embedding rows of the unknown and the padding token with zeros.
# NOTE(review): this runs after the LSTM checkpoint was loaded, so it replaces
# whatever the checkpoint stored in these two rows - confirm that is intended.
for special_idx in (UNK_IDX, LSTM_PAD_IDX):
    model.LSTM.embedding.weight.data[special_idx] = torch.zeros(LSTM_EMBEDDING_DIM)

print(model.LSTM.embedding.weight.data)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.2893e+00,  3.1971e-01, -1.1178e+00,  ..., -1.0642e+00,
         -7.8712e-01,  2.9927e-01],
        ...,
        [-9.5515e-01,  1.4307e-03,  7.2091e-01,  ...,  2.8553e-01,
          3.5123e-01,  8.5332e-02],
        [ 1.7967e+00, -4.4204e-01, -2.2764e-01,  ..., -1.3371e+00,
         -1.8561e+00,  4.4284e-02],
        [-9.3392e-01, -1.0174e+00,  1.7149e-01,  ..., -1.1119e+00,
         -8.8879e-01, -8.2188e-01]])


In [None]:
# Move the model and the loss to the target device first: optimizers should be
# constructed from the parameters as they live on their final device
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Only parameters that still require gradients are given to the optimizer
optimizer = optim.Adam([param for param in model.parameters() if param.requires_grad], lr=0.01)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in the batch that match the labels, e.g. 0.8
    (not 8) when 8 out of 10 are right.

    `preds` are raw scores: they are passed through a sigmoid and rounded
    before being compared with `y`.
    """
    hard_preds = torch.sigmoid(preds).round()
    # float conversion so that the sum can be divided
    matches = hard_preds.eq(y).float()
    return matches.sum() / len(matches)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch the model is applied to `batch.news`, the loss against
    `batch.labels` is back-propagated and the optimizer takes one step.

    Returns:
        Tuple (loss, accuracy), each the per-batch value averaged over the
        number of batches in `iterator`.
    """
    model.train() # turn on drop out

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        # Gradients of the previous batch must not leak into this one
        optimizer.zero_grad()

        targets = batch.labels
        # Drop the output dimension of size one so scores and labels line up
        scores = model(batch.news).squeeze(1)

        batch_loss = criterion(scores, targets)
        batch_acc = binary_accuracy(scores, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    batch_count = len(iterator)
    return loss_total / batch_count, acc_total / batch_count

In [None]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over `iterator` without updating the model.

    Returns:
        Tuple (loss, accuracy), each the per-batch value averaged over the
        number of batches in `iterator`.
    """
    model.eval() # turn off drop out

    loss_total = 0
    acc_total = 0

    # No gradients are needed while evaluating
    with torch.no_grad():
        for batch in iterator:
            targets = batch.labels
            # Drop the output dimension of size one so scores and labels line up
            scores = model(batch.news).squeeze(1)

            loss_total += criterion(scores, targets).item()
            acc_total += binary_accuracy(scores, targets).item()

    batch_count = len(iterator)
    return loss_total / batch_count, acc_total / batch_count

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    minutes = int(duration / 60)
    # Whatever is left after the whole minutes, truncated to full seconds
    return minutes, int(duration - (minutes * 60))

In [None]:
# Upper bound for the number of epochs
MAX_EPOCHS = 100
# Stop after this many consecutive epochs whose validation accuracy dropped
EARLY_STOP_PATIENCE = 5
# Stop as soon as the training accuracy exceeds this value
TRAIN_ACC_TARGET = 0.95

best_valid_loss = float('inf')

no_improve_counter = 0
last_valid_acc = 0

# Per-epoch history, used by the plotting cells
train_loss_array = []
train_acc_array = []
valid_loss_array = []
valid_acc_array = []

for epoch in range(MAX_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    train_loss_array.append(train_loss)
    train_acc_array.append(train_acc)
    valid_loss_array.append(valid_loss)
    valid_acc_array.append(valid_acc)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save the newest weights before any stop condition is checked, so the
    # "last" checkpoint also covers the epoch on which the loop ends
    torch.save(model.state_dict(), 'drive/MyDrive/comb_last-model.pt')

    # The "best" checkpoint follows the lowest validation loss seen so far.
    # best_valid_loss is only ever lowered here; it used to be reset to the
    # current (worse) loss in the branch below, which let a later, worse
    # epoch overwrite the best checkpoint.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'drive/MyDrive/comb_best-model.pt')

    # early leave: count consecutive epochs whose validation accuracy fell
    # below that of the previous epoch
    no_improvements = ""

    if last_valid_acc > valid_acc:
        no_improve_counter = no_improve_counter + 1
        no_improvements = "| There were no improvements on the validation set!"
    else:
        no_improve_counter = 0

    last_valid_acc = valid_acc

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s {no_improvements}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # Value comparison; `is` on an int literal only works by accident of caching
    if no_improve_counter >= EARLY_STOP_PATIENCE:
      print(f'The model is not improving, going to stop.')
      break

    if train_acc > TRAIN_ACC_TARGET:
      print(f'The model training is finished.')
      break  

Epoch: 01 | Epoch Time: 6m 59s 
	Train Loss: 0.691 | Train Acc: 53.36%
	 Val. Loss: 0.690 |  Val. Acc: 53.93%
Epoch: 02 | Epoch Time: 6m 59s 
	Train Loss: 0.691 | Train Acc: 53.35%
	 Val. Loss: 0.690 |  Val. Acc: 53.93%
Epoch: 03 | Epoch Time: 6m 58s 
	Train Loss: 0.691 | Train Acc: 53.42%
	 Val. Loss: 0.690 |  Val. Acc: 53.93%
Epoch: 04 | Epoch Time: 6m 59s 
	Train Loss: 0.691 | Train Acc: 53.41%
	 Val. Loss: 0.691 |  Val. Acc: 53.93%
Epoch: 05 | Epoch Time: 6m 59s 
	Train Loss: 0.691 | Train Acc: 53.29%
	 Val. Loss: 0.691 |  Val. Acc: 53.93%
Epoch: 06 | Epoch Time: 6m 59s 
	Train Loss: 0.691 | Train Acc: 53.42%
	 Val. Loss: 0.690 |  Val. Acc: 53.93%
Epoch: 07 | Epoch Time: 6m 58s 
	Train Loss: 0.691 | Train Acc: 53.30%
	 Val. Loss: 0.690 |  Val. Acc: 53.93%
Epoch: 08 | Epoch Time: 7m 0s 
	Train Loss: 0.691 | Train Acc: 53.31%
	 Val. Loss: 0.690 |  Val. Acc: 53.93%
Epoch: 09 | Epoch Time: 6m 53s 
	Train Loss: 0.691 | Train Acc: 53.23%
	 Val. Loss: 0.690 |  Val. Acc: 53.93%
Epoch: 10 |

In [None]:
# Visualize the training: loss per epoch on the train and the validation split
fig, ax = plt.subplots(figsize=(16,8))
ax.set_title('Train and validation loss')
ax.plot(train_loss_array, color = "green", label = "Train loss")
ax.plot(valid_loss_array, color = "blue", label = "Valid loss")
ax.set_xlabel('Epoch', fontsize=18)
ax.set_ylabel('Loss', fontsize=18)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Visualize the training: accuracy per epoch on the train and the validation split.
# The history holds fractions (0..1); they are scaled to percent so the
# curves match the "Accuracy (%)" axis label.
plt.figure(figsize=(16,8))
plt.title('Train and validation accuracy')
plt.plot([acc * 100 for acc in train_acc_array], color = "green", label = "Train accuracy")
plt.plot([acc * 100 for acc in valid_acc_array], color = "blue", label = "Valid accuracy")
plt.xlabel('Epoch',fontsize=18)
plt.ylabel('Accuracy (%)',fontsize=18)
plt.legend(fontsize=18)
plt.show()

In [None]:
# Evaluate both saved checkpoints on the test split. Each checkpoint is mapped
# to the active device, so the cell behaves the same with and without a GPU
# (previously only the first load passed a map_location).
for report_title, checkpoint_path in (('-- Best model --', 'drive/MyDrive/comb_best-model.pt'),
                                      ('\n-- Last model --', 'drive/MyDrive/comb_last-model.pt')):
    model.load_state_dict(torch.load(checkpoint_path, map_location = device))

    test_loss, test_acc = evaluate(model, test_iterator, criterion)

    print(report_title)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
# Map the checkpoint to the active device so the cell also runs without a GPU
model.load_state_dict(torch.load('drive/MyDrive/comb_best-model.pt', map_location = device))

model.eval() # turn off drop out

predictions = []
labels = []

# Collect predictions and labels. Gradient tracking is switched off: this is
# pure inference, and keeping an autograd graph per batch only costs memory.
with torch.no_grad():
  for batch in test_iterator:
    predictions.append(model(batch.news).squeeze(1))
    labels.append(batch.labels)

# Join the batches and move them to the CPU for scikit-learn
pred_tensor = torch.cat(predictions).cpu()
labels_tensor = torch.cat(labels).cpu()

# Get the metrics (the raw model scores are used as ranking scores)
fpr, tpr, threshold = metrics.roc_curve(labels_tensor.numpy(), pred_tensor.numpy())
roc_auc = metrics.auc(fpr, tpr)

# Plot the ROC curve together with the diagonal of a random classifier
plt.figure(figsize=(16,8))
plt.title('Receiver Operating Characteristic of the best model', fontsize = 18)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right', fontsize = 18)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', fontsize = 18)
plt.xlabel('False Positive Rate', fontsize = 18)
plt.show()

In [None]:
# Map the checkpoint to the active device so the cell also runs without a GPU
model.load_state_dict(torch.load('drive/MyDrive/comb_last-model.pt', map_location = device))

model.eval() # turn off drop out

predictions = []
labels = []

# Collect predictions and labels. Gradient tracking is switched off: this is
# pure inference, and keeping an autograd graph per batch only costs memory.
with torch.no_grad():
  for batch in test_iterator:
    predictions.append(model(batch.news).squeeze(1))
    labels.append(batch.labels)

# Join the batches and move them to the CPU for scikit-learn
pred_tensor = torch.cat(predictions).cpu()
labels_tensor = torch.cat(labels).cpu()

# Get the metrics (the raw model scores are used as ranking scores)
fpr, tpr, threshold = metrics.roc_curve(labels_tensor.numpy(), pred_tensor.numpy())
roc_auc = metrics.auc(fpr, tpr)

# Plot the ROC curve together with the diagonal of a random classifier
plt.figure(figsize=(16,8))
plt.title('Receiver Operating Characteristic of the last model', fontsize = 18)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right', fontsize = 18)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', fontsize = 18)
plt.xlabel('False Positive Rate', fontsize = 18)
plt.show()