In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 19.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.0 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses:

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory usage).
!nvidia-smi

Wed May 11 09:41:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    35W / 250W |    377MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 0. Prepare dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git
!ls pubmed-rct

Cloning into 'pubmed-rct'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 33 (delta 5), reused 5 (delta 5), pack-reused 25[K
Unpacking objects: 100% (33/33), done.
PubMed_200k_RCT
PubMed_200k_RCT_numbers_replaced_with_at_sign
PubMed_20k_RCT
PubMed_20k_RCT_numbers_replaced_with_at_sign
README.md


In [None]:
def calculate_metrics(true_labels, pred_labels):
  """Compute accuracy plus weighted precision, recall and F1 for predictions.

  Args:
    true_labels: Ground-truth class labels.
    pred_labels: Predicted class labels, aligned with `true_labels`.

  Returns:
    dict with the keys "Accuracy", "Precision", "Recall" and "F1".
  """
  accuracy = accuracy_score(true_labels, pred_labels)
  weighted_scores = precision_recall_fscore_support(true_labels,
                                                    pred_labels,
                                                    average="weighted")
  # The fourth element returned by sklearn (support) is not reported.
  precision, recall, f1 = weighted_scores[:3]

  return {
      "Accuracy": accuracy,
      "Precision": precision,
      "Recall": recall,
      "F1": f1,
  }

def get_lines(filename, encoding="utf-8"):
  """Read a text file and return its lines.

  Args:
    filename: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale settings.

  Returns:
    list of str, one entry per line, each keeping its trailing newline.
  """
  with open(filename, "r", encoding=encoding) as f:
    return f.readlines()

def _abstract_to_samples(abstract_lines):
  """Turn the buffered "<label><TAB><sentence>" lines of one abstract into records."""
  abstract_line_split = abstract_lines.splitlines() # split abstract into separate lines
  last_line_index = len(abstract_line_split) - 1 # total lines, counted from 0
  samples = []
  for abstract_line_number, abstract_line in enumerate(abstract_line_split):
    target_text_split = abstract_line.split("\t") # split target label from text
    samples.append({
        "target": target_text_split[0], # label before the first tab
        "text": target_text_split[1].lower(), # sentence text, lower-cased
        "line_number": abstract_line_number, # 0-based position in the abstract
        "total_lines": last_line_index,
    })
  return samples


def preprocess_text_with_line_numbers(filename):
  """Parse a PubMed RCT file into one record per labelled sentence.

  A line starting with "###" opens a new abstract, a whitespace-only line
  closes the current one, and every other line is read as
  "<label><TAB><sentence>".

  An abstract is also emitted when the file ends, or a new "###" line starts,
  without a separating blank line. The buffer is cleared after every emit, so
  consecutive blank lines never emit the same abstract twice.

  Args:
    filename: Path of the file to parse.

  Returns:
    list of dict, one per sentence in file order, with the keys "target",
    "text" (lower-cased), "line_number" (0-based) and "total_lines" (number
    of lines in the abstract minus one).

  Raises:
    IndexError: If a sentence line contains no tab separator.
  """
  abstract_samples = [] # records of every abstract seen so far
  abstract_lines = "" # raw labelled lines of the abstract being read

  for line in get_lines(filename):
    if line.startswith("###") or line.isspace():
      # Abstract boundary: emit whatever is buffered (possibly nothing), then reset.
      abstract_samples.extend(_abstract_to_samples(abstract_lines))
      abstract_lines = ""
    else:
      # Anything else is a labelled sentence of the current abstract.
      abstract_lines += line

  # Emit a final abstract that was not terminated by a blank line.
  abstract_samples.extend(_abstract_to_samples(abstract_lines))

  return abstract_samples

In [None]:
# Work with the 20k variant of the dataset in which numbers are replaced by "@".
dataset_variant = "PubMed_20k_RCT_numbers_replaced_with_at_sign"
data_dir = "pubmed-rct/" + dataset_variant + "/"

In [None]:
# Peek at the raw file format before parsing it.
lines = get_lines(f"{data_dir}test.txt")
lines[:5]

['###24845963\n',
 'BACKGROUND\tThis study analyzed liver function abnormalities in heart failure patients admitted with severe acute decompensated heart failure ( ADHF ) .\n',
 'RESULTS\tA post hoc analysis was conducted with the use of data from the Evaluation Study of Congestive Heart Failure and Pulmonary Artery Catheterization Effectiveness ( ESCAPE ) .\n',
 'RESULTS\tLiver function tests ( LFTs ) were measured at @ time points from baseline , at discharge , and up to @ months follow-up .\n',
 'RESULTS\tSurvival analyses were used to assess the association between admission Model of End-Stage Liver Disease Excluding International Normalized Ratio ( MELD-XI ) scores and patient outcome.There was a high prevalence of abnormal baseline ( admission ) LFTs ( albumin @ % , aspartate transaminase @ % , alanine transaminase @ % , and total bilirubin @ % ) .\n']

In [None]:
# Parse the train / dev / test files into lists of per-sentence records.
train_dataset, val_dataset, test_dataset = (
    preprocess_text_with_line_numbers(data_dir + split_file)
    for split_file in ("train.txt", "dev.txt", "test.txt")
)
len(train_dataset), len(val_dataset), len(test_dataset)

(180040, 30212, 30135)

In [None]:
# One DataFrame per split; each record dict becomes a row.
train_df, val_df, test_df = map(pd.DataFrame,
                                (train_dataset, val_dataset, test_dataset))

In [None]:
# Preview the first rows of the training DataFrame.
train_df.head()

Unnamed: 0,target,text,line_number,total_lines
0,OBJECTIVE,to investigate the efficacy of @ weeks of dail...,0,11
1,METHODS,a total of @ patients with primary knee oa wer...,1,11
2,METHODS,outcome measures included pain reduction and i...,2,11
3,METHODS,pain was assessed using the visual analog pain...,3,11
4,METHODS,secondary outcome measures included the wester...,4,11


In [None]:
# Inspect the label column as a NumPy array.
train_df["target"].to_numpy()

array(['OBJECTIVE', 'METHODS', 'METHODS', ..., 'RESULTS', 'CONCLUSIONS',
       'CONCLUSIONS'], dtype=object)

In [None]:
# Pull the sentence text of each split out into a plain Python list.
train_sentences = train_df["text"].to_list()
val_sentences = val_df["text"].to_list()
test_sentences = test_df["text"].to_list()

In [None]:
# First five training sentences.
train_sentences[:5]

['to investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( oa ) .',
 'a total of @ patients with primary knee oa were randomized @:@ ; @ received @ mg/day of prednisolone and @ received placebo for @ weeks .',
 'outcome measures included pain reduction and improvement in function scores and systemic inflammation markers .',
 'pain was assessed using the visual analog pain scale ( @-@ mm ) .',
 'secondary outcome measures included the western ontario and mcmaster universities osteoarthritis index scores , patient global assessment ( pga ) of the severity of knee oa , and @-min walk distance ( @mwd ) .']

### Create numerical labels

In [None]:
from sklearn import preprocessing

# Learn the label -> integer mapping from the training split only and reuse that
# same mapping for validation and test. Re-fitting the encoder on every split
# would silently produce a different mapping whenever a split's label set
# differs from the training one.
le = preprocessing.LabelEncoder()
train_labels_encoded = le.fit_transform(train_df["target"].to_numpy())
val_labels_encoded = le.transform(val_df["target"].to_numpy())
test_labels_encoded = le.transform(test_df["target"].to_numpy())

In [None]:
# The fitted encoder exposes the class names; a name's position is its integer label.
class_names = le.classes_
num_classes = len(class_names)
num_classes, class_names

(5, array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
       dtype=object))

In [None]:
# First five integer-encoded training labels.
train_labels_encoded[:5]

array([3, 2, 2, 2, 2])

## 1. Baseline model (TfidfVectorizer with MultinomialNB)

### Training

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial naive Bayes classifier.
baseline_steps = [
    ("tf-idf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
]
baseline_model = Pipeline(baseline_steps)

# Train on the training sentences and their integer labels.
baseline_model.fit(train_sentences, train_labels_encoded)

Pipeline(steps=[('tf-idf', TfidfVectorizer()), ('clf', MultinomialNB())])

### Results

In [None]:
# Score the baseline on the validation split.
baseline_model.score(val_sentences, val_labels_encoded)

0.7218323844829869

In [None]:
# Predicted integer labels for the validation sentences.
pred_labels_baseline = baseline_model.predict(val_sentences)

In [None]:
# Accuracy and weighted precision / recall / F1 of the baseline on the validation split.
calculate_metrics(val_labels_encoded, pred_labels_baseline)

{'Accuracy': 0.7218323844829869,
 'F1': 0.6989250353450294,
 'Precision': 0.7186466952323352,
 'Recall': 0.7218323844829869}

## 2. BERT (Bidirectional Encoder Representations from Transformers) - TensorFlow

In [None]:
import tensorflow as tf

In [None]:
# One-hot encode the integer labels of each split.
y_train, y_val, y_test = (
    tf.keras.utils.to_categorical(pd.Series(encoded_labels), num_classes=num_classes)
    for encoded_labels in (train_labels_encoded, val_labels_encoded, test_labels_encoded)
)

In [None]:
# Inspect the one-hot encoded training labels.
y_train

array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [None]:
# Number of target classes.
num_classes

5

In [None]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 12.5 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 60.7 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.2 tf-estimator-nightly-2.8.0.dev2021122109


In [None]:
# Two example training sentences.
train_sentences[:2]

['to investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( oa ) .',
 'a total of @ patients with primary knee oa were randomized @:@ ; @ received @ mg/day of prednisolone and @ received placebo for @ weeks .']

In [None]:
import tensorflow_hub as hub
import tensorflow_text as text

# TF Hub layers: the text preprocessing model for uncased English BERT and the
# uncased BERT encoder (L-12 / H-768 / A-12 per the handle) it feeds.
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

def get_embeddings(sentences):
  """Return the BERT `pooled_output` embedding for a batch of sentences.

  Args:
    sentences: Batch of raw strings accepted by `preprocessor`.

  Returns:
    The `pooled_output` entry of the encoder's output dict.
  """
  encoder_outputs = encoder(preprocessor(sentences))
  return encoder_outputs['pooled_output']

# Sanity check: embed the first two training sentences.
get_embeddings(train_sentences[:2])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.87553793, -0.5849887 , -0.9523438 , ..., -0.9403987 ,
        -0.6316793 ,  0.4349916 ],
       [-0.3602004 , -0.26815078, -0.6406103 , ..., -0.6820565 ,
        -0.49301693,  0.0813645 ]], dtype=float32)>

In [None]:
# Classifier: raw string -> BERT preprocessing -> BERT encoder -> dropout -> softmax.
input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
bert_outputs = encoder(preprocessor(input_layer))
# NOTE(review): a rate of 0.00001 makes this dropout layer close to a no-op — confirm it is intended.
pooled_output = tf.keras.layers.Dropout(0.00001, name="dropout")(bert_outputs['pooled_output'])
x = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(pooled_output)

model_bert = tf.keras.Model(input_layer, x)

In [None]:
# Number of training examples.
len(y_train)

180040

In [None]:
N_EPOCHS = 10

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

model_bert.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# The sentence lists are converted to NumPy arrays before being handed to Keras.
model_bert_fit = model_bert.fit(
    np.asarray(train_sentences),
    y_train,
    validation_data=(np.asarray(val_sentences), y_val),
    callbacks=[earlystop_callback],
    epochs=N_EPOCHS,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Number of test sentences.
len(test_sentences)

30135

In [None]:
!mkdir -p saved_model
model_bert_pre.save('saved_model_pre/my_model_bert_pre')



INFO:tensorflow:Assets written to: saved_model_pre/my_model_bert_pre/assets


INFO:tensorflow:Assets written to: saved_model_pre/my_model_bert_pre/assets


In [None]:
# Archive the saved model directory so it can be uploaded as a single file.
# NOTE(review): model_bert_pre is only built in section 3 further down, so
# ./saved_model_pre/ may not exist yet when the notebook is run top to bottom —
# confirm the intended cell order.
!zip -r saved_model_pre.zip ./saved_model_pre/

  adding: saved_model_pre/ (stored 0%)
  adding: saved_model_pre/my_model_bert_pre/ (stored 0%)
  adding: saved_model_pre/my_model_bert_pre/saved_model.pb (deflated 93%)
  adding: saved_model_pre/my_model_bert_pre/variables/ (stored 0%)
  adding: saved_model_pre/my_model_bert_pre/variables/variables.index (deflated 79%)
  adding: saved_model_pre/my_model_bert_pre/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: saved_model_pre/my_model_bert_pre/assets/ (stored 0%)
  adding: saved_model_pre/my_model_bert_pre/assets/vocab.txt (deflated 53%)
  adding: saved_model_pre/my_model_bert_pre/keras_metadata.pb (deflated 83%)


In [None]:
# Reload the saved model and evaluate it on the test split.
new_model_bert = tf.keras.models.load_model('saved_model/my_model_bert')
bert_preds = new_model_bert.predict(test_sentences)
# Index of the highest-scoring output per sentence.
bert_arg_pred = tf.math.argmax(bert_preds, axis=1)
calculate_metrics(test_labels_encoded, bert_arg_pred)

In [None]:
# Download the gdrive 2.1.1 command-line client (linux_386 build) from its GitHub release.
!wget https://github.com/prasmussen/gdrive/releases/download/2.1.1/gdrive_2.1.1_linux_386.tar.gz

--2022-05-10 18:48:17--  https://github.com/prasmussen/gdrive/releases/download/2.1.1/gdrive_2.1.1_linux_386.tar.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/7398306/061c8f00-c000-11eb-98a9-9f22929954ee?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220510%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220510T184818Z&X-Amz-Expires=300&X-Amz-Signature=c4e09d151035abd4643344ac320447d2e3f43d6ea45ebcffd3166d8ad19040cd&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=7398306&response-content-disposition=attachment%3B%20filename%3Dgdrive_2.1.1_linux_386.tar.gz&response-content-type=application%2Foctet-stream [following]
--2022-05-10 18:48:18--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/7398306/061c8f00-c000-11eb-98a9

In [None]:
# Extract the `gdrive` binary into the working directory.
!tar -xvf gdrive_2.1.1_linux_386.tar.gz

gdrive


In [None]:
# Print Drive account info; on first use gdrive asks for an OAuth verification code.
# Clear this cell's output before sharing the notebook: it contains account details.
!./gdrive about

Authentication needed
Go to the following url in your browser:
https://accounts.google.com/o/oauth2/auth?access_type=offline&client_id=367116221053-7n0vf5akeru7on6o2fjinrecpdoe99eg.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=state

Enter verification code: <redacted>
User: <redacted>
Used: 4.5 GB
Free: -4532858439.0 B
Total: 
Max upload size: 5.2 TB


In [None]:
# Upload the zipped model to Google Drive; the output reports the resulting file id.
!./gdrive upload ./saved_model_pre.zip

Uploading ./saved_model_pre.zip
Uploaded 19QXIDvlHaDMj-todwfE_FPMlcaIoVNcO at 23.3 MB/s, total 405.3 MB


In [None]:
# Inspect the model outputs for the test sentences.
bert_preds

## 3. BERT pretrained on PubMed


### Training

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
import tensorflow_hub as hub
import tensorflow_text as text

# TF Hub layers for the PubMed variant: the uncased English BERT preprocessing
# model, and the "experts/bert/pubmed" encoder.
preprocessor_pre = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_pre = hub.KerasLayer("https://tfhub.dev/google/experts/bert/pubmed/2")

In [None]:
# Classifier on top of the PubMed encoder: raw string -> preprocessing -> encoder
# -> dropout -> softmax.
input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
x = preprocessor_pre(input_layer)
x = encoder_pre(x)
x = tf.keras.layers.Dropout(0.0001, name="dropout")(x['pooled_output'])
# Size the output layer from the label encoder (num_classes) rather than a
# hard-coded 5, so it always matches the width of the one-hot targets.
x = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(x)

model_bert_pre = tf.keras.Model(input_layer, x)

In [None]:
N_EPOCHS = 2

# NOTE(review): with only 2 epochs a patience of 3 can never trigger early
# stopping — confirm whether the callback is meant to have an effect here.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

model_bert_pre.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# The sentence lists are converted to NumPy arrays before being handed to Keras.
model_bert_fit_pre = model_bert_pre.fit(
    np.asarray(train_sentences),
    y_train,
    validation_data=(np.asarray(val_sentences), y_val),
    callbacks=[earlystop_callback],
    epochs=N_EPOCHS,
)

Epoch 1/2
Epoch 2/2


### Results

## 4. Final model playground

In [None]:
!pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.0 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 33.5 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.2 tf-estimator-nightly-2.8.0.dev2021122109


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
!gdown https://drive.google.com/uc?id=19QXIDvlHaDMj-todwfE_FPMlcaIoVNcO
!unzip saved_model_pre.zip

Archive:  saved_model_pre.zip
   creating: saved_model_pre/
   creating: saved_model_pre/my_model_bert_pre/
  inflating: saved_model_pre/my_model_bert_pre/saved_model.pb  
   creating: saved_model_pre/my_model_bert_pre/variables/
  inflating: saved_model_pre/my_model_bert_pre/variables/variables.index  
  inflating: saved_model_pre/my_model_bert_pre/variables/variables.data-00000-of-00001  
   creating: saved_model_pre/my_model_bert_pre/assets/
  inflating: saved_model_pre/my_model_bert_pre/assets/vocab.txt  
  inflating: saved_model_pre/my_model_bert_pre/keras_metadata.pb  


In [None]:
# Load the model from the directory the archive was extracted into. The path is
# relative to the working directory, so it also works outside Colab's /content.
best_model = tf.keras.models.load_model('saved_model_pre/my_model_bert_pre')

In [None]:
# Evaluate the downloaded model on the test split.
best_prob_preds = best_model.predict(test_sentences)
# Index of the highest-scoring output per sentence.
best_preds = tf.math.argmax(best_prob_preds, axis=1)
calculate_metrics(test_labels_encoded, best_preds)

{'Accuracy': 0.853890824622532,
 'F1': 0.8519980415040023,
 'Precision': 0.8542161313279708,
 'Recall': 0.853890824622532}

In [None]:
# Example abstract (free text, not taken from the dataset files) used to try the model by hand.
test_input_raw= """
Despite success in hematologic malignancies, the treatment landscape of chimeric antigen receptor (CAR) T cell therapy for solid tumors remains limited. Claudin18.2 (CLDN18.2)-redirected CAR T cells showed promising efficacy against gastric cancer (GC) in a preclinical study. Here we report the interim analysis results of an ongoing, open-label, single-arm, phase 1 clinical trial of CLDN18.2-targeted CAR T cells (CT041) in patients with previously treated, CLDN18.2-positive digestive system cancers ( NCT03874897 ). The primary objective was safety after CT041 infusion; secondary objectives included CT041 efficacy, pharmacokinetics and immunogenicity. We treated 37 patients with one of three CT041 doses: 2.5 × 108, 3.75 × 108 or 5.0 × 108 cells. All patients experienced a grade 3 or higher hematologic toxicity. Grade 1 or 2 cytokine release syndrome (CRS) occurred in 94.6% of patients. No grade 3 or higher CRS or neurotoxicities, treatment-related deaths or dose-limiting toxicities were reported. The overall response rate (ORR) and disease control rate (DCR) reached 48.6% and 73.0%, respectively. The 6-month duration of response rate was 44.8%. In patients with GC, the ORR and DCR reached 57.1% and 75.0%, respectively, and the 6-month overall survival rate was 81.2%. These initial results suggest that CT041 has promising efficacy with an acceptable safety profile in patients with heavily pretreated, CLDN18.2-positive digestive system cancers, particularly in those with GC. 
"""

In [None]:
# Show the raw abstract string.
test_input_raw

'\nDespite success in hematologic malignancies, the treatment landscape of chimeric antigen receptor (CAR) T cell therapy for solid tumors remains limited. Claudin18.2 (CLDN18.2)-redirected CAR T cells showed promising efficacy against gastric cancer (GC) in a preclinical study. Here we report the interim analysis results of an ongoing, open-label, single-arm, phase 1 clinical trial of CLDN18.2-targeted CAR T cells (CT041) in patients with previously treated, CLDN18.2-positive digestive system cancers ( NCT03874897 ). The primary objective was safety after CT041 infusion; secondary objectives included CT041 efficacy, pharmacokinetics and immunogenicity. We treated 37 patients with one of three CT041 doses: 2.5 × 108, 3.75 × 108 or 5.0 × 108 cells. All patients experienced a grade 3 or higher hematologic toxicity. Grade 1 or 2 cytokine release syndrome (CRS) occurred in 94.6% of patients. No grade 3 or higher CRS or neurotoxicities, treatment-related deaths or dose-limiting toxicities w

In [None]:
# Split the abstract into sentences on ". ", then strip surrounding whitespace
# and drop empty pieces, so the model is never asked to classify a bare newline
# and the first sentence does not start with "\n".
# NOTE(review): splitting on ". " is only a heuristic and removes the final period.
test_input_list = [
    sentence.strip()
    for sentence in test_input_raw.split('. ')
    if sentence.strip()
]

print(test_input_list)

['\nDespite success in hematologic malignancies, the treatment landscape of chimeric antigen receptor (CAR) T cell therapy for solid tumors remains limited', 'Claudin18.2 (CLDN18.2)-redirected CAR T cells showed promising efficacy against gastric cancer (GC) in a preclinical study', 'Here we report the interim analysis results of an ongoing, open-label, single-arm, phase 1 clinical trial of CLDN18.2-targeted CAR T cells (CT041) in patients with previously treated, CLDN18.2-positive digestive system cancers ( NCT03874897 )', 'The primary objective was safety after CT041 infusion; secondary objectives included CT041 efficacy, pharmacokinetics and immunogenicity', 'We treated 37 patients with one of three CT041 doses: 2.5 × 108, 3.75 × 108 or 5.0 × 108 cells', 'All patients experienced a grade 3 or higher hematologic toxicity', 'Grade 1 or 2 cytokine release syndrome (CRS) occurred in 94.6% of patients', 'No grade 3 or higher CRS or neurotoxicities, treatment-related deaths or dose-limiti

In [None]:
# Confirm the split result is a plain Python list.
type(test_input_list)

list

In [None]:
# Number of pieces produced by the split.
len(test_input_list)

13

In [None]:
# Classify every sentence of the example abstract.
test_input_prob_preds = best_model.predict(test_input_list)
# Index of the highest-scoring output per sentence.
test_input_preds = tf.math.argmax(test_input_prob_preds, axis=1)
test_input_preds

<tf.Tensor: shape=(13,), dtype=int64, numpy=array([0, 0, 0, 2, 4, 4, 4, 4, 4, 4, 4, 1, 2])>

In [None]:
# Number of predictions.
len(test_input_preds)

13

In [None]:
# One list per section; the dict maps each predicted label number to its list.
# Label order: 'BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'
background_sentences, conclusions_sentences, methods_sentences = [], [], []
objective_sentences, results_sentences = [], []
sentences_by_label = {
    0: background_sentences,
    1: conclusions_sentences,
    2: methods_sentences,
    3: objective_sentences,
    4: results_sentences,
}

for sentence_id, sentence in enumerate(test_input_list):
  label_number = test_input_preds[sentence_id].numpy()
  # Labels outside 0-4 are ignored.
  matching_list = sentences_by_label.get(label_number)
  if matching_list is not None:
    matching_list.append(sentence)

print(f"BACKGROUND (WPROWADZENIE): {background_sentences}")
print(f"CONCLUSIONS (WNIOSEK): {conclusions_sentences}")
print(f"METHODS (METODY): {methods_sentences}")
print(f"OBJECTIVE (CEL): {objective_sentences}")
print(f"RESULTS (WYNIKI): {results_sentences}")

BACKGROUND (WPROWADZENIE): ['\nDespite success in hematologic malignancies, the treatment landscape of chimeric antigen receptor (CAR) T cell therapy for solid tumors remains limited', 'Claudin18.2 (CLDN18.2)-redirected CAR T cells showed promising efficacy against gastric cancer (GC) in a preclinical study', 'Here we report the interim analysis results of an ongoing, open-label, single-arm, phase 1 clinical trial of CLDN18.2-targeted CAR T cells (CT041) in patients with previously treated, CLDN18.2-positive digestive system cancers ( NCT03874897 )']
CONCLUSIONS (WNIOSEK): ['These initial results suggest that CT041 has promising efficacy with an acceptable safety profile in patients with heavily pretreated, CLDN18.2-positive digestive system cancers, particularly in those with GC']
METHODS (METODY): ['The primary objective was safety after CT041 infusion; secondary objectives included CT041 efficacy, pharmacokinetics and immunogenicity', '\n']
OBJECTIVE (CEL): []
RESULTS (WYNIKI): ['We

In [None]:
# Predicted label number of the first sentence, converted to a NumPy scalar.
test_input_preds[0].numpy()

0

In [None]:
# Third sentence of the example abstract.
test_input_list[2]

'Here we report the interim analysis results of an ongoing, open-label, single-arm, phase 1 clinical trial of CLDN18.2-targeted CAR T cells (CT041) in patients with previously treated, CLDN18.2-positive digestive system cancers ( NCT03874897 )'