In [1]:
import numpy as np
import pandas as pd
import random
import torch
import os
import spacy
from torchtext.vocab import GloVe, FastText
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import nltk
import matplotlib.pyplot as plt
import tensorflow as tf

2023-08-18 14:15:32.178913: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-18 14:15:33.135792: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-18 14:15:33.136055: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

In [2]:
data_df = pd.read_csv('/data/home/ayyoobmohd/DLNLP/Glove-and-Sentiments/data/Dataset0.csv')
data_df = data_df.sample(frac=1, random_state=42)
print(data_df.Label.value_counts())
data_df.head(5)

neutral     2588
positive    1287
negative     125
Name: Label, dtype: int64


Unnamed: 0,Label,Review
555,positive,"Ruukki Romania , the local arm of Finnish meta..."
3491,negative,18 March 2010 A leakage in the gypsum pond was...
527,positive,The subscriptions increase Cargotec 's share c...
3925,neutral,The total value of the project is about EUR53m...
2989,neutral,"Its product portfolio comprises harvesters , f..."


In [3]:
from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(data_df["Review"].to_numpy(),
                                                                            data_df["Label"].to_numpy(),
                                                                            test_size=0.1, # dedicate 10% of samples to validation set
                                                                            random_state=42) # random state for reproducibility

In [4]:
# Check the lengths
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(3600, 3600, 400, 400)

In [5]:
# View the first 4 training sentences and their labels
train_sentences[:4], train_labels[:4]

(array(['The agreement was signed with Biohit Healthcare Ltd , the UK-based subsidiary of Biohit Oyj , a Finnish public company which develops , manufactures and markets liquid handling products and diagnostic test systems .',
        'Both operating profit and net sales for the 12-month period increased , respectively from EUR20 .8 m and EUR177 .7 m , as compared to the financial year 2004 .',
        "The online ice chart shows no ice in the area of Estonia 's sea ports on the coast of the Gulf of Finland .",
        'In Finland , Sampo Bank , now part of the Danske Bank group , will lay off staff from the administrative and support functions .'],
       dtype=object),
 array(['positive', 'positive', 'neutral', 'negative'], dtype=object))

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6

# Use the default TextVectorization variables
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [7]:
# Find average number of tokens (words) in training Revies
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

23

In [8]:
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [9]:
# Create sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1,    7,    1,    4, 1352, 1310,    0,    0,    0,    0,    0,
           0,    0,    0,    0]])>

In [10]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 9485
Top 5 most common words: ['', '[UNK]', 'the', 'of', 'in']
Bottom 5 least common words: ['005', '0030', '0025', '001', '000063']


In [11]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

# Get a random sentence from training set
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
The company 's objective is to offer the best people flow experience by developing and delivering solutions that enable people to move smoothly , safely , comfortably and without waiting in buildings in an increasingly urbanizing environment .      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.0083533 , -0.00074539,  0.02146458, ...,  0.04742965,
         -0.04287521,  0.04349596],
        [-0.04304116,  0.03123376,  0.0085089 , ...,  0.01759143,
          0.03372277,  0.02272005],
        [ 0.01984398,  0.03463571,  0.0363873 , ..., -0.00412066,
          0.02215226, -0.00210441],
        ...,
        [ 0.03057046,  0.04284093, -0.00378841, ...,  0.00465607,
         -0.02040966, -0.00751033],
        [ 0.00053488,  0.04692164,  0.01144242, ...,  0.0313756 ,
         -0.03296059, -0.01450782],
        [-0.04179817, -0.03717345, -0.03030258, ..., -0.0252841 ,
          0.00776132, -0.01706985]]], dtype=float32)>

In [12]:
# Check out a single token's embedding
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.0083533 , -0.00074539,  0.02146458, -0.00887399, -0.03510289,
       -0.00777601, -0.01988485, -0.03624551,  0.02707094, -0.03894543,
        0.04827147, -0.03881482,  0.01039989, -0.03603768, -0.00893569,
       -0.02085767,  0.03172684,  0.02413476, -0.0487294 , -0.03907872,
       -0.0439254 ,  0.01758758, -0.0298792 , -0.02027115, -0.04018005,
        0.02525247, -0.00073118,  0.01083845, -0.02991383,  0.02897293,
        0.0411301 ,  0.03917264,  0.04939225, -0.03698512, -0.03215194,
        0.00813963,  0.02702612,  0.03881866,  0.00630585,  0.01173483,
       -0.01244243,  0.03282733,  0.03870443,  0.0456844 ,  0.00767218,
       -0.04829168,  0.04083549, -0.03541629, -0.00517128, -0.02891439,
       -0.02870722, -0.04868822,  0.01829317, -0.04672033, -0.03616424,
        0.01497174, -0.00268135, -0.00128616, -0.00748856,  0.04786054,
       -0.04254986, -0.03263223, -0.00956003,  0.0230718 , -0.01311659,
       -0.034252

## NB

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

In [14]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 78.00%


In [15]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [16]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

# Get baseline results
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 78.0,
 'precision': 0.756631374354253,
 'recall': 0.78,
 'f1': 0.739515883872929}

In [17]:
import datetime
def create_tensorboard_callback(dir_name, experiment_name):
  """
  Creates a TensorBoard callback instand to store log files.

  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
  """
  log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback

SAVE_DIR = "model_logs"

In [37]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")


# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(3, activation="softmax")(x)
print(outputs.shape)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 64)
(None, 3)


In [38]:
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [39]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_2 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 lstm_7 (LSTM)               (None, 64)                49408     
                                                                 
 dense_13 (Dense)            (None, 64)                4160      
                                                                 
 dense_14 (Dense)            (None, 3)                 195       
                                                      

In [40]:
# Fit model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 
                                                                     "LSTM")])

Saving TensorBoard log files to: model_logs/LSTM/20230818-143830
Epoch 1/5


ValueError: in user code:

    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/engine/training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/losses.py", line 2432, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "/data/home/ayyoobmohd/miniconda3/envs/DL/lib/python3.11/site-packages/keras/src/backend.py", line 5809, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(

    ValueError: `logits` and `labels` must have the same shape, received ((None, 3) vs (None, 1)).
