In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import re
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

2024-01-31 11:33:34.975445: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-31 11:33:35.390409: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-31 11:33:35.390455: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-31 11:33:35.476614: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-31 11:33:35.655772: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-31 11:33:35.659887: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
def tokenize_movies(review, length):
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True
    )

# Load the datasets

In [None]:
path = './data/nlp-gold/imdb/include_negotions/'

files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
print(files)

df_imdb = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

In [None]:
df_rt = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/nlp-gold/rottentomatoes/include_negotions/rt_data.csv.gz')

# Tokenizing and Sampling

In [None]:
df_imdb.dropna(subset=['review_detail'], inplace=True)
df_imdb.dropna(subset=['rating'], inplace=True)

df_imdb = df_imdb[df_imdb['review_detail'].str.len() <= 1000]

df_imdb['review_detail'] = df_imdb['review_detail'].astype("string")

max_length_imdb = df_imdb['review_detail'].apply(lambda x: len(x) if pd.notna(x) else 0).max()
print(max_length_imdb)

In [None]:
df_imdb_positive = df_imdb[df_imdb['sentiment'] == "POSITIVE"]
df_imdb_negative = df_imdb[df_imdb['sentiment'] == "NEGATIVE"]
df_imdb_medium = df_imdb[df_imdb['sentiment'] == "MEDIUM"]

num_samples_per_class = 12000 // 3
df_imdb_positive = df_imdb_positive.sample(n=num_samples_per_class, random_state=42)
df_imdb_negative = df_imdb_negative.sample(n=num_samples_per_class, random_state=42)
df_imdb_medium = df_imdb_medium.sample(n=num_samples_per_class + 1, random_state=42)

print("IMDB POS: "+ str(len(df_imdb_positive)))
print("IMDB NEG: "+ str(len(df_imdb_negative)))
print("IMDB MED: "+ str(len(df_imdb_medium)))

df_imdb = pd.concat([df_imdb_positive, df_imdb_negative, df_imdb_medium])

In [None]:
df_imdb['review_detail'] = df_imdb['review_detail'].apply(lambda x: tokenize_movies(x, max_length_imdb))

df_imdb['input_ids'] = df_imdb['review_detail'].apply(lambda x: x['input_ids'])
df_imdb['token_type_ids'] = df_imdb['review_detail'].apply(lambda x: x['token_type_ids'])
df_imdb['attention_mask'] = df_imdb['review_detail'].apply(lambda x: x['attention_mask'])
df_imdb['label_sentiment'] = df_imdb['sentiment'].apply(lambda x: 0 if x == "NEGATIVE" else 1 if x == "MEDIUM" else 2 if x == "POSITIVE" else -1)
df_imdb = df_imdb.drop('review_detail', axis=1)
df_imdb = df_imdb.drop('sentiment', axis=1)

In [None]:
df_rt.dropna(subset=['review_detail'], inplace=True)
df_rt.dropna(subset=['rating'], inplace=True)

df_rt['review_detail'] = df_rt['review_detail'].astype("string")

max_length_rt = df_rt['review_detail'].apply(lambda x: len(x) if pd.notna(x) else 0).max()
print(max_length_rt)

In [None]:
df_rt['review_detail'] = df_rt['review_detail'].apply(lambda x: tokenize_movies(x, max_length_rt))

df_rt['input_ids'] = df_rt['review_detail'].apply(lambda x: x['input_ids'])
df_rt['token_type_ids'] = df_rt['review_detail'].apply(lambda x: x['token_type_ids'])
df_rt['attention_mask'] = df_rt['review_detail'].apply(lambda x: x['attention_mask'])
df_rt['label_sentiment'] = df_rt['sentiment'].apply(lambda x: 0 if x == "NEGATIVE" else 1 if x == "MEDIUM" else 2 if x == "POSITIVE" else -1)
df_rt['label_top_critic'] = df_rt['top_critic'].apply(lambda x: 0 if x == False else 1 if x == True else -1)
df_rt = df_rt.drop('review_detail', axis=1)
df_rt = df_rt.drop('sentiment', axis=1)
df_rt = df_rt.drop('top_critic', axis=1)

# Save to H5

In [None]:
df_rt.to_hdf('/content/drive/MyDrive/Colab Notebooks/data/nlp-gold/rottentomatoes/rt_data_tokenized.h5', key='df', mode='w')

In [None]:
df_imdb.to_hdf('/content/drive/MyDrive/Colab Notebooks/data/nlp-gold/imdb/imdb_data_tokenized.h5', key='df', mode='w')

# Training the IMDB Model

In [None]:
df_imdb.head()

In [15]:
train_data_imdb, test_data_imdb = train_test_split(df_imdb, test_size=0.2)

In [16]:
input_ids_list_train_imdb = train_data_imdb['input_ids'].tolist()
attention_mask_list_train_imdb = train_data_imdb['attention_mask'].tolist()
token_type_ids_list_train_imdb = train_data_imdb['token_type_ids'].tolist()
labels_list_train_imdb = train_data_imdb['label_sentiment'].tolist()

input_ids_list_test_imdb = test_data_imdb['input_ids'].tolist()
attention_mask_lis_test_imdb = test_data_imdb['attention_mask'].tolist()
token_type_ids_list_test_imdb = test_data_imdb['token_type_ids'].tolist()
labels_list_test_imdb = test_data_imdb['label_sentiment'].tolist()

In [19]:
input_ids_train_imdb = tf.convert_to_tensor(input_ids_list_test_imdb)
attention_mask_train_imdb = tf.convert_to_tensor(attention_mask_lis_test_imdb)
token_type_ids_train_imdb = tf.convert_to_tensor(token_type_ids_list_test_imdb)
labels_train_imdb = tf.convert_to_tensor(labels_list_test_imdb)

input_ids_test_imdb = tf.convert_to_tensor(input_ids_list_test_imdb)
attention_mask_test_imdb = tf.convert_to_tensor(attention_mask_lis_test_imdb)
token_type_ids_test_imdb = tf.convert_to_tensor(token_type_ids_list_test_imdb)
labels_test_imdb = tf.convert_to_tensor(labels_list_test_imdb)

In [27]:
dataset_test_imdb = tf.data.Dataset.from_tensor_slices(((input_ids_test_imdb, attention_mask_test_imdb, token_type_ids_test_imdb), labels_test_imdb))
dataset_test_imdb = dataset_test_imdb.batch(16)  # Adjust batch size as needed
dataset_test_imdb = dataset_test_imdb.prefetch(tf.data.experimental.AUTOTUNE)

dataset_train_imdb = tf.data.Dataset.from_tensor_slices(((input_ids_test_imdb, attention_mask_test_imdb, token_type_ids_test_imdb), labels_test_imdb))
dataset_train_imdb = dataset_train_imdb.batch(2)  # Adjust batch size as needed
dataset_train_imdb = dataset_train_imdb.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

In [30]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
model.fit(dataset_train_imdb, epochs=1)  # Adjust the number of epochs as needed

# Taining the RT Model

In [None]:
df_rt = pd.read_hdf('/content/drive/MyDrive/Colab Notebooks/data/nlp-gold/rottentomatoes/rt_data_tokenized.h5')

In [None]:
train_data, test_data = train_test_split(df_rt, test_size=0.2)

In [None]:
input_ids_list_train = train_data['input_ids'].tolist()
attention_mask_list_train = train_data['attention_mask'].tolist()
token_type_ids_list_train = train_data['token_type_ids'].tolist()
labels_list_train = train_data['label_top_critic'].tolist()

In [None]:
input_ids_list_test = test_data['input_ids'].tolist()
attention_mask_lis_test = test_data['attention_mask'].tolist()
token_type_ids_list_test = test_data['token_type_ids'].tolist()
labels_list_test = test_data['label_top_critic'].tolist()

In [None]:
input_ids_train = tf.convert_to_tensor(input_ids_list_train)
attention_mask_train = tf.convert_to_tensor(attention_mask_list_train)
token_type_ids_train = tf.convert_to_tensor(token_type_ids_list_train)
labels_train = tf.convert_to_tensor(labels_list_train)

In [None]:
input_ids_test = tf.convert_to_tensor(input_ids_list_test)
attention_mask_test = tf.convert_to_tensor(attention_mask_lis_test)
token_type_ids_test = tf.convert_to_tensor(token_type_ids_list_test)
labels_test = tf.convert_to_tensor(labels_list_test)

In [None]:
dataset_test = tf.data.Dataset.from_tensor_slices(((input_ids_test, attention_mask_test, token_type_ids_test), labels_test))
dataset_test = dataset_test.batch(32)  # Adjust batch size as needed
dataset_test = dataset_test.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
dataset_train = tf.data.Dataset.from_tensor_slices(((input_ids_train, attention_mask_train, token_type_ids_train), labels_train))
dataset_train = dataset_train.batch(32)  # Adjust batch size as needed
dataset_train = dataset_train.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
model.fit(dataset_train, epochs=1)  # Adjust the number of epochs as needed



<keras.src.callbacks.History at 0x7c3dc00875e0>

In [None]:
loss, accuracy = model.evaluate(dataset_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [None]:
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/ ')

In [None]:
model.save('/content/drive/MyDrive/Colab Notebooks/data/nlp_rt', save_format='tf')  # Saves the model in HDF5 format

# Load and Test

In [None]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
model = TFBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Colab Notebooks/model_top_critic')

Some layers from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/model_top_critic were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/model_top_critic.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.

In [None]:
loss, accuracy = model.evaluate(dataset_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [None]:
predictions = model.predict(dataset_test_imdb)
probabilities = tf.nn.softmax(predictions.logits, axis=-1)
predicted_labels = np.argmax(probabilities, axis=1)
print(predicted_labels)

ResourceExhaustedError: Graph execution error:

Detected at node tf_bert_for_sequence_classification_1/bert/encoder/layer_._0/attention/self/MatMul defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 377, in dispatch_queue

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 250, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 748, in __init__

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-18-4d20f93b27f8>", line 1, in <cell line: 1>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2655, in predict

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2440, in predict_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2425, in step_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2413, in run_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2381, in predict_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 590, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1557, in run_call_with_unpacked_inputs

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 1569, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1557, in run_call_with_unpacked_inputs

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 862, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 548, in call

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 554, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 464, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 380, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 310, in call

OOM when allocating tensor with shape[32,12,12385,12385] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node tf_bert_for_sequence_classification_1/bert/encoder/layer_._0/attention/self/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_predict_function_10337]