In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git

import os
for filename in os.listdir("pubmed-rct"):
    print(filename)

data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

! ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/

filenames = [data_dir+file for file in os.listdir(data_dir)]
filenames

def get_lines(filename: str) -> list[str]:
    
    with open(filename) as f:
        return f.readlines()

def preprocess_text_with_line_numbers(filename):
    """Returns a list of dictionaries of abstract line data.

  Takes in filename, reads its contents and sorts through each line,
  extracting things like the target label, the text of the sentence,
  how many sentences are in the current abstract and what sentence number
  the target line is.

  Args:
      filename: a string of the target text file to read and extract line data
      from.

  Returns:
      A list of dictionaries each containing a line from an abstract,
      the lines label, the lines position in the abstract and the total number
      of lines in the abstract where the line is from. For example:

      [{"target": 'CONCLUSION',
        "text": The study couldn't have gone better, turns out people are kinder than you think",
        "line_number": 8,
        "total_lines": 8}]
  """
    input_lines = get_lines(filename)
    abstract_lines = ""
    abstract_sample = []
    for line in input_lines:
        if line.startswith("###"):
            abstract_id = line
            abstract_lines = ""
        elif line.isspace():
            abstract_line_split = abstract_lines.splitlines()

            for abstract_line_number, abstract_line in enumerate(abstract_line_split):
               line_data = {}
               target_text_split = abstract_line.split("\t")
               line_data["target"] = target_text_split[0]
               line_data["text"] = target_text_split[1]
               line_data["line_number"] = abstract_line_number
               line_data["total_lines"] = len(abstract_line_split)
               abstract_sample.append(line_data)
        else:
            abstract_lines += line

    return abstract_sample

train_sample = preprocess_text_with_line_numbers(data_dir + "train.txt")
test_sample = preprocess_text_with_line_numbers(data_dir + "test.txt")
val_sample = preprocess_text_with_line_numbers(data_dir + "dev.txt")

len(train_sample), len(test_sample), len(val_sample)

import pandas as pd
import matplotlib.pyplot as plt

train_df = pd.DataFrame(train_sample)
test_df = pd.DataFrame(test_sample)
val_df = pd.DataFrame(val_sample)

train_df.head()


train_sentences = train_df.text.tolist()
test_sentences = test_df.text.tolist()
val_sentences = val_df.text.tolist()
len(train_sentences), len(test_sentences), len(val_sample)

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
train_labels_ohe = ohe.fit_transform(train_df.target.to_numpy().reshape(-1,1))
val_labels_ohe = ohe.transform(val_df.target.to_numpy().reshape(-1,1))
test_labels_ohe = ohe.transform(test_df.target.to_numpy().reshape(-1,1))

train_labels_ohe, val_labels_ohe

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_label_encoded = label_encoder.fit_transform(train_df['target'].to_numpy())
test_label_encoded = label_encoder.transform(test_df['target'].to_numpy())
val_label_encoded = label_encoder.transform(val_df['target'].to_numpy())

train_label_encoded


num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes, class_names

import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import Dense, TextVectorization, Conv1D, Input, Embedding
from keras.models import Model

max_tokens = 680000
output_seq_len = 55
text_vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=output_seq_len,
)

text_vectorizer.adapt(train_sentences)

text_vocab = text_vectorizer.get_vocabulary()
embedding = Embedding(
    input_dim=len(text_vocab),# lenght of our vocab
    output_dim=256,
    mask_zero=True,
    name='token_embedding'
)

def split_chars(text):
    """
    Split sentence into characters
    """
    return " ".join(list(text))

train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]

import string
alphabet = string.ascii_lowercase + string.digits
NUM_CHAR_TOKENS = len(alphabet) + 2 # add 2 for space and [UNK]
char_vectorizer = TextVectorization(
    max_tokens=NUM_CHAR_TOKENS,
    output_sequence_length=int(290),
    name='char_vectorizer'
)

char_vectorizer.adapt(train_chars)

char_vocab = char_vectorizer.get_vocabulary()
char_embeding = Embedding(input_dim=len(char_vocab),
                         output_dim=25,
                         mask_zero=True,
                         name='char_embed')



train_line_numbers_one_hot = tf.one_hot(train_df.line_number.to_numpy(), depth=16)
val_line_numbers_one_hot = tf.one_hot(val_df.line_number.to_numpy(), depth=16)
test_line_numbers_one_hot = tf.one_hot(test_df.line_number.to_numpy(), depth=16)


train_total_lines_one_hot = tf.one_hot(train_df.total_lines.to_numpy(), depth=20)
val_total_lines_one_hot = tf.one_hot(val_df.total_lines.to_numpy(), depth=20)
test_total_lines_one_hot = tf.one_hot(test_df.total_lines.to_numpy(), depth=20)

train_char_token_pos_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars, train_line_numbers_one_hot, train_total_lines_one_hot))
train_char_token_pos_label = tf.data.Dataset.from_tensor_slices(train_labels_ohe)
train_char_token_pos_dataset = tf.data.Dataset.zip((train_char_token_pos_data, train_char_token_pos_label)).batch(32).prefetch(tf.data.AUTOTUNE)
train_char_token_pos_data

val_char_token_pos_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars, val_line_numbers_one_hot, val_total_lines_one_hot))
val_char_token_pos_label = tf.data.Dataset.from_tensor_slices(val_labels_ohe)
val_char_token_pos_dataset = tf.data.Dataset.zip((val_char_token_pos_data, val_char_token_pos_label)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_token_pos_data

test_char_token_pos_data = tf.data.Dataset.from_tensor_slices((test_sentences, test_chars, test_line_numbers_one_hot, test_total_lines_one_hot))
test_char_token_pos_label = tf.data.Dataset.from_tensor_slices(test_labels_ohe)
test_char_token_pos_dataset = tf.data.Dataset.zip((test_char_token_pos_data, test_char_token_pos_label)).batch(32).prefetch(tf.data.AUTOTUNE)
test_char_token_pos_data

!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
from helper_functions import calculate_results









from sklearn.utils import class_weight
import numpy as np
# Assuming your target labels are in a NumPy array named 'y_train'
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=np.unique(train_label_encoded),
                                                     y=train_label_encoded)
print(class_weights)  # Optional: View the calculated weights
class_weights_dict = {}
for i, weight in enumerate(class_weights):
    class_weights_dict[i] = weight
    
class_weights_dict    

# 1. Token inputs/Model
token_input = keras.Input(shape=(1,), dtype="string", name='token_input')
text_vectorizer_layer = text_vectorizer(token_input)
text_embedding = embedding(text_vectorizer_layer)
bi_lstm_layer = keras.layers.Bidirectional(keras.layers.LSTM(64))(text_embedding)
custom_token_model = keras.Model(inputs=token_input, outputs=bi_lstm_layer)

# 2. Char Inputs/Model
char_input = keras.Input(shape=(1,), dtype="string", name='char_input')
char_vectorizer_layer = char_vectorizer(char_input)
char_embeding_layer = char_embeding(char_vectorizer_layer)
bi_char_lstm = keras.layers.Bidirectional(keras.layers.LSTM(64))(char_embeding_layer)
char_model = keras.Model(char_input, bi_char_lstm)

# 2.1 Concat char and token layers
char_token_embedding = keras.layers.Concatenate(name="char_token_embedding")([custom_token_model.output,
                                                                              char_model.output])
z = layers.Reshape((1, 256))(char_token_embedding)  # Reshape to (1, 512) to fit into LSTM
z = keras.layers.Bidirectional(keras.layers.LSTM(64))(z)
z = layers.Dropout(0.5)(z) 

# 3. line_number inputs/Model
line_number_input = keras.Input(shape=(16,), name="line_number_input")
line_number_output = keras.layers.Dense(128, activation='relu')(line_number_input)
line_number_model = keras.Model(line_number_input, line_number_output)

# 4. Total_line inputs/model
total_line_input = keras.Input(shape=(20,), name='total_line_input')
total_line_output = keras.layers.Dense(128, activation="relu")(total_line_input)
total_line_model = keras.Model(total_line_input, total_line_output)

# 5. Concat the above layers 
concat_layer = keras.layers.Concatenate(name="concat_of_token_char_total_no")([line_number_model.output,
                                                                              total_line_model.output,
                                                                              z])

# 6. add dropout
hidden_layer = keras.layers.Dense(256, activation='relu')(concat_layer)
dropout_layer_2 = keras.layers.Dropout(0.5)(hidden_layer)
outputs = keras.layers.Dense(5, activation='softmax')(dropout_layer_2)

model_8b = keras.Model(inputs=[token_input,
                             char_input,
                             line_number_input,
                             total_line_input], 
                     outputs=outputs)
model_8b.summary()

model_8b.compile(loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                             optimizer=keras.optimizers.Adam(),
                             metrics=['accuracy'])

early_stoping = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1)
history_model_8b = model_8b.fit(train_char_token_pos_dataset,
                             epochs=50,
                             steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)),
                             validation_data=val_char_token_pos_dataset,
                             validation_steps=int(0.1 * len(val_char_token_pos_dataset)),
                             callbacks=[early_stoping],
#                              class_weight=class_weights_dict
                               )
                                

from sklearn.metrics import classification_report
pred_prob_model_8b = model_8b.predict(val_char_token_pos_dataset)
print(pred_prob_model_8b[:5])
pred_model_8b = tf.argmax(pred_prob_model_8b, axis=1)
print(pred_model_8b[:5])
results_model_8b = calculate_results(val_label_encoded, pred_model_8b)
print(results_model_8b)
print(classification_report(val_label_encoded, pred_model_8b))

In [34]:
model_8b.save("model_8b_char_customtoken_line_total")
!zip -r file.zip model_8b_char_customtoken_line_total


  adding: model_8b_char_customtoken_line_total/ (stored 0%)
  adding: model_8b_char_customtoken_line_total/saved_model.pb (deflated 87%)
  adding: model_8b_char_customtoken_line_total/variables/ (stored 0%)
  adding: model_8b_char_customtoken_line_total/variables/variables.data-00000-of-00001 (deflated 24%)
  adding: model_8b_char_customtoken_line_total/variables/variables.index (deflated 70%)
  adding: model_8b_char_customtoken_line_total/fingerprint.pb (stored 0%)
  adding: model_8b_char_customtoken_line_total/keras_metadata.pb (deflated 93%)
  adding: model_8b_char_customtoken_line_total/assets/ (stored 0%)


In [35]:
!ls

__pycache__  helper_functions.py		   pubmed-rct
file.zip     model_8b_char_customtoken_line_total


In [36]:
from IPython.display import FileLink
FileLink(r'file.zip')

In [38]:
loaded_8b = keras.models.load_model("model_8b_char_customtoken_line_total")

In [39]:
loaded_8b.evaluate(val_char_token_pos_dataset)



[0.838618278503418, 0.8857738375663757]