#### Import all necessary libraries

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

##### Import the dataset

In [2]:
# Load datasets
train_data = pd.read_csv("MTS-Dialog-TrainingSet.csv")
validation_data = pd.read_csv("MTS-Dialog-ValidationSet.csv")
test_data1 = pd.read_csv("MTS-Dialog-TestSet-1-MEDIQA-Chat-2023.csv")
test_data2 = pd.read_csv("MTS-Dialog-TestSet-2-MEDIQA-Sum-2023.csv")

In [3]:
# Extract dialogues and section texts
train_dialogues = train_data['dialogue'].values
train_section_texts = train_data['section_text'].values

In [4]:
validation_dialogues = validation_data['dialogue'].values
validation_section_texts = validation_data['section_text'].values

In [5]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_dialogues)
vocab_size = len(tokenizer.word_index) + 1

In [6]:
# Convert text to sequences
maxlen = 100  
X_train = tokenizer.texts_to_sequences(train_dialogues)
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
Y_train = tokenizer.texts_to_sequences(train_section_texts)
Y_train = pad_sequences(Y_train, padding='post', maxlen=maxlen)
X_val = tokenizer.texts_to_sequences(validation_dialogues)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)
Y_val = tokenizer.texts_to_sequences(validation_section_texts)
Y_val = pad_sequences(Y_val, padding='post', maxlen=maxlen)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional  

# Model architecture
embedding_dim = 100
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))  # Remove input_length from here
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(maxlen, embedding_dim)))  # Specify input_length here
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(**kwargs)


In [9]:
# Model training
epochs = 10
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, Y_val))

Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 2s/step - accuracy: 0.5582 - loss: 7.7210 - val_accuracy: 0.7205 - val_loss: 2.5005
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2s/step - accuracy: 0.6949 - loss: 2.6231 - val_accuracy: 0.7205 - val_loss: 2.1851
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 1s/step - accuracy: 0.6974 - loss: 2.4821 - val_accuracy: 0.7205 - val_loss: 2.1155
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2s/step - accuracy: 0.6930 - loss: 2.4374 - val_accuracy: 0.7205 - val_loss: 2.0738
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.7083 - loss: 2.2946 - val_accuracy: 0.7205 - val_loss: 2.0416
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.7025 - loss: 2.2964 - val_accuracy: 0.7205 - val_loss: 2.0140
Epoch 7/10
[1m19/19[0m [32m━━━━━━━━━━

In [10]:
# Evaluate the model on test set 
# Preprocess test data
test_dialogues1 = test_data1['dialogue'].values
test_section_texts1 = test_data1['section_text'].values
test_dialogues2 = test_data2['dialogue'].values
test_section_texts2 = test_data2['section_text'].values
X_test1 = tokenizer.texts_to_sequences(test_dialogues1)
X_test1 = pad_sequences(X_test1, padding='post', maxlen=maxlen)
Y_test1 = tokenizer.texts_to_sequences(test_section_texts1)
Y_test1 = pad_sequences(Y_test1, padding='post', maxlen=maxlen)
X_test2 = tokenizer.texts_to_sequences(test_dialogues2)
X_test2 = pad_sequences(X_test2, padding='post', maxlen=maxlen)
Y_test2 = tokenizer.texts_to_sequences(test_section_texts2)
Y_test2 = pad_sequences(Y_test2, padding='post', maxlen=maxlen)

In [11]:
# Evaluate on Test Set 1
loss_test1, accuracy_test1 = model.evaluate(X_test1, Y_test1, verbose=0)
print(f"Test Set 1 - Loss: {loss_test1}, Accuracy: {accuracy_test1}")
# Evaluate on Test Set 2
loss_test2, accuracy_test2 = model.evaluate(X_test2, Y_test2, verbose=0)
print(f"Test Set 2 - Loss: {loss_test2}, Accuracy: {accuracy_test2}")

Test Set 1 - Loss: 2.142153263092041, Accuracy: 0.7084000110626221
Test Set 2 - Loss: 1.9102681875228882, Accuracy: 0.7232499718666077


In [12]:
# Define the generate_clinical_notes function
def generate_clinical_notes(dialogue):
    # Tokenize and pad the input dialogue
    X_input = tokenizer.texts_to_sequences([dialogue])
    X_input = pad_sequences(X_input, padding='post', maxlen=maxlen)
    # Predict the section text
    predicted_sequence = model.predict(X_input)
    # Convert the predicted sequence back to text
    predicted_text = ' '.join([tokenizer.index_word[idx] for idx in np.argmax(predicted_sequence, axis=-1)[0] if idx != 0])
    
    return predicted_text
example_dialogue = '''
Doctor: When did your pain begin? 
Patient: I've had low back pain for about eight years now.
Doctor: Is there any injury? 
Patient: Yeah, it started when I fell in an A B C store.
Doctor: How old are you now?
Patient: I'm twenty six.  
Doctor: What kind of treatments have you had for this low back pain? 
Patient: Yeah, I got referred to P T, and I went, but only once or twice, um, and if I remember right, they only did the electrical stimulation, and heat. 
Doctor: I see, how has your pain progressed over the last eight years? 
Patient: It's been pretty continuous, but it's been at varying degrees, sometimes are better than others. 
Doctor: Do you have any children? 
Patient: Yes, I had my son in August of two thousand eight, and I've had back pain since giving birth. 
Doctor: Have you had any falls since the initial one? 
Patient: Yes, I fell four or five days ago while I was mopping the floor. 
Doctor: Did you land on your lower back again?
Patient: Yes, right onto my tailbone. 
Doctor: Did that make the low back pain worse? 
Patient: Yes. 
Doctor: Have you seen any other doctors for this issue? 
Patient: Yes, I saw Doctor X on January tenth two thousand nine, and I have a follow up appointment scheduled for February tenth two thousand nine.
'''
print("Dialogue:", example_dialogue)

predicted_notes = generate_clinical_notes(example_dialogue)
print("Predicted Clinical Notes:", predicted_notes)


Dialogue: 
Doctor: When did your pain begin? 
Patient: I've had low back pain for about eight years now.
Doctor: Is there any injury? 
Patient: Yeah, it started when I fell in an A B C store.
Doctor: How old are you now?
Patient: I'm twenty six.  
Doctor: What kind of treatments have you had for this low back pain? 
Patient: Yeah, I got referred to P T, and I went, but only once or twice, um, and if I remember right, they only did the electrical stimulation, and heat. 
Doctor: I see, how has your pain progressed over the last eight years? 
Patient: It's been pretty continuous, but it's been at varying degrees, sometimes are better than others. 
Doctor: Do you have any children? 
Patient: Yes, I had my son in August of two thousand eight, and I've had back pain since giving birth. 
Doctor: Have you had any falls since the initial one? 
Patient: Yes, I fell four or five days ago while I was mopping the floor. 
Doctor: Did you land on your lower back again?
Patient: Yes, right onto my tai