In [3]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Example medical transcript
medical_transcript = "Patient presented with symptoms of fever and cough. Diagnosis revealed pneumonia."

# Tokenize the medical transcript
inputs = tokenizer(medical_transcript, return_tensors="pt", padding=True, truncation=True)

# Forward pass through the model
with torch.no_grad():
    outputs = model(**inputs)

# Get the embeddings of the last hidden layer
last_hidden_states = outputs.last_hidden_state

# Calculate the mean embedding across all tokens
mean_embedding = torch.mean(last_hidden_states, dim=1)

# Convert the mean embedding to a numpy array
mean_embedding_numpy = mean_embedding.numpy()

print("Mean Embedding Shape:", mean_embedding_numpy.shape)
print("Mean Embedding:", mean_embedding_numpy)


Mean Embedding Shape: (1, 768)
Mean Embedding: [[-2.46534258e-01  1.55215949e-01 -3.33123863e-01 -1.15170322e-01
  -1.38967335e-02  4.40402441e-02  5.67849541e+00  1.51439518e-01
  -2.92459279e-01  1.25109583e-01  4.68287766e-01  3.73245448e-01
   1.49615511e-01 -4.41485912e-01 -1.10913053e-01 -2.91135848e-01
  -2.99705774e-01  2.13527624e-02 -2.54093379e-01 -1.72143832e-01
  -6.60345405e-02  1.39443919e-01 -4.49155897e-01  2.71856189e-01
  -2.09399968e-01 -6.21943362e-02 -3.93749118e-01 -1.01555586e-01
  -5.67435510e-02  3.31536308e-02 -2.62225658e-01  4.45141969e-03
  -1.39230475e-01 -2.13647753e-01 -1.57585308e-01 -3.23703647e-01
   6.64620895e+01  1.33286282e-01  2.43868423e-03  7.48879492e-01
  -3.46334055e-02 -3.43357205e-01 -9.38073918e-02 -5.40846400e-02
  -3.04617316e-01 -2.39816815e-01 -1.05470829e-01  1.88021045e-02
  -4.51870821e-02  8.07366788e-01  1.75010890e-01 -3.99206758e-01
  -4.39740151e-01  3.14695209e-01 -1.25587448e-01  2.02625424e-01
  -2.09220931e-01 -2.66584814

In [5]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the medical transcript
inputs = tokenizer(medical_transcript, return_tensors='pt', padding=True, truncation=True)

# Pass the encoded transcript through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings from the last hidden states of the model
embeddings = outputs.last_hidden_state

# Average the embeddings for each token to get sentence-level embeddings
sentence_embeddings = np.mean(embeddings.numpy(), axis=1)

# Print the shape of the sentence embeddings
print("Shape of Sentence Embeddings:", sentence_embeddings.shape)

print(sentence_embeddings)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Shape of Sentence Embeddings: (3, 768)
[[-0.41188726  0.19563177 -0.21300343 ... -0.39615563 -0.17174609
  -0.19250548]
 [-0.16289961 -0.1759688  -0.30027658 ... -0.22405262  0.1804233
  -0.08022095]
 [-0.09802145  0.09829926  0.1123988  ... -0.47010767 -0.16395707
   0.07442835]]


In [7]:
from transformers import XLNetTokenizer, XLNetModel
import torch
import numpy as np

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Load pre-trained XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

# Tokenize and encode the medical transcript
inputs = tokenizer(medical_transcript, return_tensors='pt', padding=True, truncation=True)

# Pass the encoded transcript through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings from the last hidden states of the model
embeddings = outputs.last_hidden_state

# Average the embeddings for each token to get sentence-level embeddings
sentence_embeddings = np.mean(embeddings.numpy(), axis=1)

# Print the shape of the sentence embeddings
print("Shape of Sentence Embeddings:", sentence_embeddings.shape)

print(sentence_embeddings)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Shape of Sentence Embeddings: (3, 768)
[[-0.784388    1.136335   -1.4560485  ... -1.3636625  -0.02272269
   0.945049  ]
 [ 1.9143348   2.7174098  -1.2326441  ... -0.5918026   1.250938
   1.0457697 ]
 [-0.06876998  1.707028   -3.581354   ...  0.13654473 -0.00410832
   0.49695683]]


In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import numpy as np

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Load pre-trained T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Prepare input data for T5
input_text = "summarize: " + ' '.join(medical_transcript)

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate embeddings using T5
with torch.no_grad():
    outputs = model.generate(input_ids)

# Decode the generated embeddings
embeddings = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated embeddings
print("Generated Embeddings:")
print(embeddings)


Downloading (…)"spiece.model";:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)ration_config.json";:   0%|          | 0.00/147 [00:00<?, ?B/s]



Generated Embeddings:
physical examination revealed elevated temperature and wheezing.


In [9]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Example medical transcript
medical_transcript = "Patient presented with symptoms of fever and cough. Diagnosis revealed pneumonia."

# Tokenize the medical transcript
inputs = tokenizer(medical_transcript, return_tensors="pt", padding=True, truncation=True)

# Forward pass through the model
with torch.no_grad():
    outputs = model(**inputs)

# Get the embeddings of the last hidden layer
last_hidden_states = outputs.last_hidden_state

# Calculate the mean embedding across all tokens
mean_embedding = torch.mean(last_hidden_states, dim=1)

# Convert the mean embedding to a numpy array
mean_embedding_numpy = mean_embedding.numpy()

print("Mean Embedding Shape:", mean_embedding_numpy.shape)
print("Mean Embedding:", mean_embedding_numpy)


Downloading (…)"vocab.json";:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)"merges.txt";:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Mean Embedding Shape: (1, 768)
Mean Embedding: [[-1.34657444e-02  8.05886164e-02  9.51291472e-02 -9.36847180e-02
   2.95858383e-01  3.12678337e-01  5.81306070e-02  3.27146016e-02
   1.52028233e-01 -1.28165603e-01 -1.18035570e-01 -2.30172843e-01
   4.93167639e-02 -1.43226296e-01 -3.03018764e-02 -2.88337946e-01
   1.23698361e-01 -1.23696536e-01 -2.02867966e-02 -1.61040038e-01
  -9.55083445e-02  4.34743837e-02 -2.39905622e-03 -7.56407008e-02
  -2.41796114e-02 -5.93555868e-02 -9.40332860e-02  3.65279242e-02
   1.42758861e-01 -2.14311540e-01 -1.01221092e-02  6.35195151e-02
   1.18025638e-01 -1.30293459e-01 -5.94660733e-03 -5.66560552e-02
   1.70334756e-01  9.83177405e-03  1.10045493e-01  6.96563721e-02
  -2.35299617e-01 -6.25173151e-02 -6.04168512e-02 -1.25766873e-01
  -3.34312469e-02 -2.57791486e-02  2.67696586e-02  3.02116340e-03
   8.98886845e-02 -5.48655866e-03 -8.30903426e-02  4.45259474e-02
  -4.26012538e-02 -1.17320172e-01 -4.40187305e-02  1.05224088e-01
  -6.52815104e-02 -3.56510170

In [11]:
from transformers import T5Tokenizer, T5Model
import torch

# Load the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5Model.from_pretrained("t5-small")

# Example medical transcript
medical_transcript = "Patient presented with symptoms of fever and cough. Diagnosis revealed pneumonia."

# Tokenize the medical transcript
inputs = tokenizer(medical_transcript, return_tensors="pt", padding=True, truncation=True)

# Define a dummy output sequence
dummy_output = torch.tensor([[1]])  # Dummy output of shape (batch_size, sequence_length)

# Forward pass through the model
with torch.no_grad():
    outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=dummy_output)

# Get the embeddings of the last hidden layer
last_hidden_states = outputs.last_hidden_state

# Calculate the mean embedding across all tokens
mean_embedding = torch.mean(last_hidden_states, dim=1)

# Convert the mean embedding to a numpy array
mean_embedding_numpy = mean_embedding.numpy()

print("Mean Embedding Shape:", mean_embedding_numpy.shape)
print("Mean Embedding:", mean_embedding_numpy)


Mean Embedding Shape: (1, 512)
Mean Embedding: [[ 1.43325632e-03  3.03820461e-01  9.98550579e-02 -7.40139186e-02
   9.59046651e-04  6.23361059e-02 -1.76047102e-01 -4.88029495e-02
   4.75238264e-01 -1.78186938e-01  2.60818064e-01  5.55268563e-02
  -4.65925112e-02 -1.44470856e-01  3.17464924e+00 -1.24402113e-01
  -1.13300875e-01 -3.44110504e-02  1.58099309e-01 -4.80108224e-02
   2.10603801e-04  6.91224411e-02  1.36738554e-01  2.55352759e+00
  -5.00732996e-02 -3.04256618e-01  2.45103776e-01  9.09000337e-01
   1.28503785e-01  1.53549328e-01  9.58630741e-02 -6.45914748e-02
   3.02465279e-02 -2.55008727e-01  3.07639062e-01  9.58694145e-02
   3.45771573e-02  8.95791054e-02 -1.13229744e-01  1.94041535e-01
   9.57155228e-02  1.23583777e-02 -4.18286212e-03  2.59297431e-01
  -3.02465633e-02 -1.36698231e-01  4.35405046e-01 -1.97037101e-01
   9.77664813e-02  1.76745251e-01  1.41536677e-02  1.48561344e-01
  -4.76563089e-02  2.93638650e-02  7.57103637e-02 -7.89814591e-02
   3.40653509e-02 -1.73021242

In [13]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Load pre-trained ERNIE tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0")
model = AutoModel.from_pretrained("nghuyong/ernie-1.0")

# Tokenize and encode the medical transcript
inputs = tokenizer(medical_transcript, return_tensors="pt", padding=True, truncation=True)

# Pass the encoded transcript through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings from the last hidden states of the model
embeddings = outputs.last_hidden_state

# Average the embeddings for each token to get sentence-level embeddings
sentence_embeddings = np.mean(embeddings.numpy(), axis=1)

# Print the shape of the sentence embeddings
print("Shape of Sentence Embeddings:", sentence_embeddings.shape)

print(sentence_embeddings)


Some weights of the model checkpoint at nghuyong/ernie-1.0 were not used when initializing ErnieModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing ErnieModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Shape of Sentence Embeddings: (3, 768)
[[ 0.740785   -0.27213484  0.27299964 ...  0.12558618  0.00330855
  -1.6647937 ]
 [ 0.86112386 -0.51095754  0.19222717 ... -0.06696365 -0.13564548
  -1.6583649 ]
 [ 0.5311943  -0.3828348   0.2244909  ...  0.07977653  0.29741117
  -0.77518713]]


In [None]:
from transformers import AlbertTokenizer, AlbertModel
import torch
import numpy as np

# Sample medical transcript
medical_transcript = [
    "Patient presented with symptoms of cough and shortness of breath.",
    "Physical examination revealed elevated temperature and wheezing.",
    "Diagnosis confirmed as bronchitis, prescribed antibiotics and inhaler."
]

# Load pre-trained ALBERT tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')

# Tokenize and encode the medical transcript
inputs = tokenizer(medical_transcript, return_tensors='pt', padding=True, truncation=True)

# Pass the encoded transcript through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings from the last hidden states of the model
embeddings = outputs.last_hidden_state

# Average the embeddings for each token to get sentence-level embeddings
sentence_embeddings = np.mean(embeddings.numpy(), axis=1)

# Print the shape of the sentence embeddings
print("Shape of Sentence Embeddings:", sentence_embeddings.shape)

print(sentence_embeddings)


Downloading (…)"spiece.model";:   0%|          | 0.00/760k [00:00<?, ?B/s]