# NLP Model

## Data Preparation

In [1]:
import pandas as pd
import re
import os

In [2]:
def load_txt_files(directory):
	txt_files = []
	for filename in os.listdir(directory):
		if filename.endswith('.txt'):
			with open(os.path.join(directory, filename), 'r') as file:
				txt_files.append(file.read())
	return txt_files

directory = '../data/i2b2/training_20180910/training_20180910'  
txt_files = load_txt_files(directory)
			

In [3]:
def extract_sections(note):
    sections = {
        'Chief Complaint': '',
        'History of Present Illness': '',
        'Past Medical History': '',
        'Discharge Diagnosis': ''
    }
    for section in sections.keys():
        pattern = re.compile(r'{}:(.*?)(?:\n\n|$)'.format(re.escape(section)), re.DOTALL)
        match = pattern.search(note)
        if match:
            sections[section] = match.group(1).strip()
    
    return sections

notes_data = [extract_sections(note) for note in txt_files]

In [4]:
notes_data

[{'Chief Complaint': 'Post-cardiac arrest, asthma exacerbation',
  'History of Present Illness': 'Mr. [**Known lastname 3234**] is a 36 year old gentleman with a PMH signifciant\nwith dilated cardiomyopathy s/p AICD, asthma, and HTN admitted\nto an OSH with dyspnea now admitted to the MICU after PEA arrest\nx2. The patient initially presented to LGH ED with hypoxemic\nrespiratory distress. While at the OSH, he received CTX,\nazithromycin, SC epinephrine, and solumedrol. While at the OSH,\nhe became confused and subsequently had an episode of PEA arrest\nand was intubated. He received epinephrine, atropine, magnesium,\nand bicarb. In addition, he had bilateral needle thoracostomies\nwith report of air return on the left, and he subsequently had\nbilateral chest tubes placed.  After approximately 15-20 minutes\nof rescucitation, he had ROSC. He received vecuronium and was\nstarted on an epi gtt for asthma and a cooling protocol, and was\nthen transferred to [**Hospital1 18**] for further

In [5]:
data = []
for note in notes_data:
    if note['Chief Complaint'] and note['Discharge Diagnosis']:
        symptoms = note['Chief Complaint']
        diagnosis = note['Discharge Diagnosis']
        data.append((symptoms, diagnosis))

df = pd.DataFrame(data, columns=['Symptoms', 'Diagnosis'])


In [6]:
df

Unnamed: 0,Symptoms,Diagnosis
0,"Post-cardiac arrest, asthma exacerbation",Anoxic Brain Injury s/p PEA arrest x2\nStatus ...
1,Abdominal Pain,Primary:\n-Abdominal Pain\n-Acute on chronic r...
2,SOB,Primary:\n1) Pulmonary Embolism with history o...
3,"Hypotension with elevated lactate, code sepsis.",Primary:\n1. Sepsis\n2. Shock liver\n3. Hepari...
4,SVC thrombosis,1. Deep Vein Thrombosis of subclavian vein\n2....
...,...,...
265,"Weakness, slurred speech, increased frequency ...",Acute Renal Failure\nHyperkalemia
266,HCC/HCV,HCV/HCC\ndeaf\nblind
267,Transferred from OSH with hypoxic respiratory ...,"congestive heart failure , acute on chronic r..."
268,Severe lower extremity lymphedema with superim...,Lymphedema with superimposed cellulitis and un...


In [6]:
df['Diagnosis'][:20]

0     Anoxic Brain Injury s/p PEA arrest x2\nStatus ...
1     Primary:\n-Abdominal Pain\n-Acute on chronic r...
2     Primary:\n1) Pulmonary Embolism with history o...
3     Primary:\n1. Sepsis\n2. Shock liver\n3. Hepari...
4     1. Deep Vein Thrombosis of subclavian vein\n2....
5             Primary:\nHealthcare associated pneumonia
6     Primary Diagnosis\n-Altered mental status seco...
7     1.) Malignant pleural effusion\n2.) Mucinous a...
8     PRIMARY:\nDiabetic Ketoacidosis\nDiabetic foot...
9     Primary:\nDisseminated intravascular coagulati...
10                        Pneumonia, Pulmonary Embolism
11                                                  N/A
12    Type A Aortic Dissection, Aortic Insufficiency...
13    Metastatic rectal cancer.\nSmall bowel obstruc...
14    1. Hypothyroidism, primary\n2. Pancytopenia(an...
15    Vocal cord dysfunction\nEpiglottitis\nCardiac ...
16    Primary: Korsakoff's psychosis, back pain, agi...
17                                        Jejuna

In [7]:
def extract_primary_diagnosis(diagnosis):
    diagnosis = diagnosis.strip()
    
    if "Chief cause of death" in diagnosis:
        primary_diagnosis = diagnosis.split("Chief cause of death:")[1].split('\n')[0].strip()
    elif "primary diagnosis" in diagnosis.lower():
        primary_diagnosis = diagnosis.lower().split("primary diagnosis")[1].strip(':').strip()
    elif "primary:" in diagnosis.lower():
        primary_section = diagnosis.lower().split("primary:")[1]
        primary_diagnosis = primary_section.split('\n')[1].strip('-').strip()
    elif "diagnosis:" in diagnosis.lower():
        primary_diagnosis = diagnosis.lower().split("diagnosis:")[1].split('\n')[0].strip('-')
        #primary_diagnosis = primary_section.split('\n')[1].strip('-').strip()
    else:
        primary_diagnosis = diagnosis.split('\n')[0].strip()
    
	# remove numberings like 1., 2., 1), 2), ... 
    primary_diagnosis = re.sub(r'^\d+[\.\)]{1,2}\s*', '', primary_diagnosis)
    
    return primary_diagnosis

In [8]:
df['Primary Diagnosis'] = df['Diagnosis'].apply(extract_primary_diagnosis)

In [9]:
df[['Symptoms', 'Primary Diagnosis']]

Unnamed: 0,Symptoms,Primary Diagnosis
0,"Post-cardiac arrest, asthma exacerbation",Anoxic Brain Injury s/p PEA arrest x2
1,Abdominal Pain,abdominal pain
2,SOB,pulmonary embolism with history of dvt and ivc...
3,"Hypotension with elevated lactate, code sepsis.",sepsis
4,SVC thrombosis,Deep Vein Thrombosis of subclavian vein
...,...,...
265,"Weakness, slurred speech, increased frequency ...",Acute Renal Failure
266,HCC/HCV,HCV/HCC
267,Transferred from OSH with hypoxic respiratory ...,"congestive heart failure , acute on chronic r..."
268,Severe lower extremity lymphedema with superim...,Lymphedema with superimposed cellulitis and un...


In [10]:
df['Primary Diagnosis'][6] = 'Altered mental status'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Primary Diagnosis'][6] = 'Altered mental status'


In [11]:
df['Primary Diagnosis'][:20]

0                 Anoxic Brain Injury s/p PEA arrest x2
1                                        abdominal pain
2     pulmonary embolism with history of dvt and ivc...
3                                                sepsis
4               Deep Vein Thrombosis of subclavian vein
5                       healthcare associated pneumonia
6                                 Altered mental status
7                            Malignant pleural effusion
8                                 diabetic ketoacidosis
9                disseminated intravascular coagulation
10                        Pneumonia, Pulmonary Embolism
11                                                  N/A
12    Type A Aortic Dissection, Aortic Insufficiency...
13                            Metastatic rectal cancer.
14                              Hypothyroidism, primary
15                               Vocal cord dysfunction
16                   secondary: hypertension, bph, gerd
17                                        Jejuna

In [12]:
df['Symptoms']

0               Post-cardiac arrest, asthma exacerbation
1                                         Abdominal Pain
2                                                    SOB
3        Hypotension with elevated lactate, code sepsis.
4                                         SVC thrombosis
                             ...                        
265    Weakness, slurred speech, increased frequency ...
266                                              HCC/HCV
267    Transferred from OSH with hypoxic respiratory ...
268    Severe lower extremity lymphedema with superim...
269                                             s/p fall
Name: Symptoms, Length: 270, dtype: object

## Training a simple NLP model

In [139]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

X = df['Symptoms']
y = df['Primary Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


                                                                                                                                                                       precision    recall  f1-score   support

                                                                                                                          # Fall, with findings of C5-6 retrolistesis       0.00      0.00      0.00         1
                                                                                                                                                        ACUTE ISSUES:       0.00      0.00      0.00         1
                                                                                                                                   Acute renal failure requiring CVVH       0.00      0.00      0.00         1
                                                                                                                                                      Adrenal adenoma      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [140]:
def predict_disease(symptoms):
    return model.predict([symptoms])[0]

symptoms_input = "fever, stomach pain"
predicted_disease = predict_disease(symptoms_input)
print(f'Predicted Disease: {predicted_disease}')


Predicted Disease: Primary


## GPT2 Model 1

Training using symptoms and primary diagnosis text

In [33]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set eos_token as pad_token
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)

# Calculate total number of layers
total_layers = len(model.transformer.h)

# Identify number of layers to unfreeze (top 10%)
num_layers_to_unfreeze = int(total_layers * 0.5)

# Freeze all layers except the top 10%
for param in model.transformer.h[:-num_layers_to_unfreeze].parameters():
    param.requires_grad = False

# Assuming df is your DataFrame containing 'Symptoms' and 'Primary Diagnosis' columns
df.dropna(inplace=True)

# Prepare data for fine-tuning
X = df['Symptoms'].tolist()
y = df['Primary Diagnosis'].tolist()

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Calculate maximum sequence length in your training data
#lengths = [len(tokenizer.encode(text)) for text in X_train]
#max_length = max(lengths)
max_length = max(max([len(tokenizer.encode(text)) for text in X_train + X_val]),
                 max([len(tokenizer.encode(text)) for text in y_train + y_val]))

max_length = 36
# Adjust max_length if it exceeds model limits or for efficiency
if max_length > 512:  # Example limit for GPT-2
    max_length = 512

print("Max sequence length chosen:", max_length)

# Tokenize input sequences (X_train and X_val)
inputs_train = tokenizer(X_train, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
inputs_val = tokenizer(X_val, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Tokenize output sequences (y_train and y_val)
labels_train = tokenizer(y_train, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
labels_val = tokenizer(y_val, padding=True, truncation=True, max_length=max_length, return_tensors="pt")


# Debug print to check sizes and keys
print("Training input keys:", inputs_train.keys())
print("Validation input keys:", inputs_val.keys())
print("Training label keys:", labels_train.keys())
print("Validation label keys:", labels_val.keys())

print("Size of inputs_train['input_ids']:", inputs_train['input_ids'].size())
print("Size of labels_train['input_ids']:", labels_train['input_ids'].size())




Max sequence length chosen: 36
Training input keys: dict_keys(['input_ids', 'attention_mask'])
Validation input keys: dict_keys(['input_ids', 'attention_mask'])
Training label keys: dict_keys(['input_ids', 'attention_mask'])
Validation label keys: dict_keys(['input_ids', 'attention_mask'])
Size of inputs_train['input_ids']: torch.Size([243, 36])
Size of labels_train['input_ids']: torch.Size([243, 36])


In [22]:

print("Shape of inputs_train['input_ids']:", inputs_train['input_ids'].shape)
print("Shape of inputs_val['input_ids']:", inputs_val['input_ids'].shape)
print("Shape of labels_train['input_ids']:", labels_train['input_ids'].shape)
print("Shape of labels_val['input_ids']:", labels_val['input_ids'].shape)

Shape of inputs_train['input_ids']: torch.Size([243, 36])
Shape of inputs_val['input_ids']: torch.Size([27, 36])
Shape of labels_train['input_ids']: torch.Size([243, 36])
Shape of labels_val['input_ids']: torch.Size([27, 36])


In [23]:
# Debug print to check sizes and keys
print("Training input keys:", inputs_train.keys())
print("Validation input keys:", inputs_val.keys())
print("Training label keys:", labels_train.keys())
print("Validation label keys:", labels_val.keys())

print("Size of inputs_train['input_ids']:", inputs_train['input_ids'].size())
print("Size of labels_train['input_ids']:", labels_train['input_ids'].size())


Training input keys: dict_keys(['input_ids', 'attention_mask'])
Validation input keys: dict_keys(['input_ids', 'attention_mask'])
Training label keys: dict_keys(['input_ids', 'attention_mask'])
Validation label keys: dict_keys(['input_ids', 'attention_mask'])
Size of inputs_train['input_ids']: torch.Size([243, 36])
Size of labels_train['input_ids']: torch.Size([243, 36])


In [41]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save the model checkpoints
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
)

# Define custom dataset class for DataLoader
class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]
        labels = self.labels['input_ids'][idx]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Create DataLoader instances
train_dataset = CustomDataset(inputs_train, labels_train)
eval_dataset = CustomDataset(inputs_val, labels_val)

# Define trainer with correct DataLoader
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
train_results = trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_gpt2_model")

 33%|███▎      | 61/183 [02:33<04:27,  2.19s/it]
 33%|███▎      | 61/183 [02:36<04:27,  2.19s/it]

{'eval_loss': 2.336745262145996, 'eval_runtime': 2.2848, 'eval_samples_per_second': 11.817, 'eval_steps_per_second': 3.064, 'epoch': 1.0}


 55%|█████▍    | 100/183 [04:09<03:12,  2.32s/it]

{'loss': 1.6401, 'grad_norm': 29.13014030456543, 'learning_rate': 2.2677595628415303e-05, 'epoch': 1.64}


 67%|██████▋   | 122/183 [05:01<02:20,  2.31s/it]
 67%|██████▋   | 122/183 [05:03<02:20,  2.31s/it]

{'eval_loss': 2.3271117210388184, 'eval_runtime': 2.2789, 'eval_samples_per_second': 11.848, 'eval_steps_per_second': 3.072, 'epoch': 2.0}


100%|██████████| 183/183 [07:34<00:00,  2.39s/it]
100%|██████████| 183/183 [07:49<00:00,  2.56s/it]


{'eval_loss': 2.327259063720703, 'eval_runtime': 2.3461, 'eval_samples_per_second': 11.509, 'eval_steps_per_second': 2.984, 'epoch': 3.0}
{'train_runtime': 469.4918, 'train_samples_per_second': 1.553, 'train_steps_per_second': 0.39, 'train_loss': 1.583905287779094, 'epoch': 3.0}


In [42]:
tokenizer.save_pretrained("./fine_tuned_gpt2_model")
model.save_pretrained("./fine_tuned_gpt2_model")

### Predicting diagnosis model 1

In [47]:
import torch

# Check if CUDA (GPU support) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
model.to(device)


Device: cpu


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [48]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_path = "./fine_tuned_gpt2_model"  # Replace with your actual path
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()  # Put the model in evaluation mode


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [52]:
# Example symptoms input
symptoms = "Post-cardiac arrest, asthma exacerbation"

# Tokenize input with the special <|endoftext|> token
input_ids = tokenizer.encode(symptoms, return_tensors="pt")

# Generate output
with torch.no_grad():
    outputs = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

# Decode the generated output
predicted_diagnosis = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Predicted Diagnosis:", predicted_diagnosis)

Predicted Diagnosis: Post-cardiac arrest, asthma exacerbation


In [30]:
# Provide a prompt
prompt = "Patient presented with severe cough and fever"

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate text based on the prompt
output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text: Patient presented with severe cough and fever


## GPT2 model 2

Training using chief complain and histry of present illness text

In [55]:
def extract_sections(text, sections):
    data = {}
    for section in sections:
        pattern = rf'{section}:(.*?)(?=\n[A-Z]|$)'
        match = re.search(pattern, text, re.DOTALL)
        if match:
            data[section] = match.group(1).strip()
    return data

def process_files(directory):
    all_data = []
    sections = ["Chief Complaint", "History of Present Illness"]
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as file:
                text = file.read()
                extracted_data = extract_sections(text, sections)
                if extracted_data:
                    all_data.append(extracted_data)
    return all_data

def prepare_training_data(data):
    training_data = []
    for record in data:
        for section, content in record.items():
            training_data.append(f"{section}: {content}")
    return training_data

In [56]:
extracted_data = process_files(directory)

training_data = prepare_training_data(extracted_data)

# Saving the training data to a file to use it directly for training
with open('training_data.txt', 'w') as file:
    for item in training_data:
        file.write("%s\n\n" % item)

In [57]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model2 = GPT2LMHeadModel.from_pretrained('gpt2')

# Load and tokenize the training data
train_path = 'training_data.txt'
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()


[codecarbon INFO @ 17:55:50] [setup] RAM Tracking...
[codecarbon INFO @ 17:55:50] [setup] GPU Tracking...
[codecarbon INFO @ 17:55:50] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:55:50] [setup] CPU Tracking...
[codecarbon INFO @ 17:55:52] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
[codecarbon INFO @ 17:55:52] >>> Tracker's metadata:
[codecarbon INFO @ 17:55:52]   Platform system: Windows-10-10.0.19045-SP0
[codecarbon INFO @ 17:55:52]   Python version: 3.10.14
[codecarbon INFO @ 17:55:52]   CodeCarbon version: 2.3.5
[codecarbon INFO @ 17:55:52]   Available RAM : 15.876 GB
[codecarbon INFO @ 17:55:52]   CPU count: 12
[codecarbon INFO @ 17:55:52]   CPU model: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
[codecarbon INFO @ 17:55:52]   GPU count: 1
[codecarbon INFO @ 17:55:52]   GPU model: 1 x NVIDIA GeForce GTX 1060
 28%|██▊       | 51/179 [00:14<00:37,  3.41it/s][codecarbon INFO @ 17:56:12] Energy consumed for RAM : 0.000025 kWh. RAM Power : 5.95

{'train_runtime': 69.4114, 'train_samples_per_second': 2.579, 'train_steps_per_second': 2.579, 'train_loss': 3.541965250196404, 'epoch': 1.0}


100%|██████████| 179/179 [01:10<00:00,  2.53it/s]


TrainOutput(global_step=179, training_loss=3.541965250196404, metrics={'train_runtime': 69.4114, 'train_samples_per_second': 2.579, 'train_steps_per_second': 2.579, 'total_flos': 11692818432000.0, 'train_loss': 3.541965250196404, 'epoch': 1.0})

### Predicting Diagnosis model 2:

In [59]:
prompt = "Chieft Complaint: Headache and fever. Discharge diagnosis:"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)


In [60]:
#prompt = "Sore throat"

output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
diagnosis_start_token = "Discharge diagnosis:"
diagnosis_start_index = generated_text.find(diagnosis_start_token)

if diagnosis_start_index != -1:
    predicted_diagnosis = generated_text[diagnosis_start_index + len(diagnosis_start_token):].strip()
else:
    predicted_diagnosis = "No discharge diagnosis found."

print("Predicted Discharge Diagnosis:", predicted_diagnosis)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted Discharge Diagnosis: History of Present Illness: 
Chief Complaints: s/p
1.
2.  History
Hospitalization: [**H]
[**Name: **] was admitted to [Hosp] on [ **HOSP1-9-1**] with a history of [of] acute [Chief Illnesses] and [History] [with]  and a recent history
