## Installing all the dependencies

In [None]:
# !pip install datasets
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install evaluate
# !pip install seqeval
# !pip install boto3
# !pip install python-dotenv
!pip install aioboto3

## conda install pytorch pytorch-cuda=11.8 -c pytorch -c nvidia
## pip install ipykernel

In [1]:
!nvidia-smi

Sun Apr 21 20:44:16 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 522.06       Driver Version: 522.06       CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   85C    P5     8W /  N/A |    159MiB /  4096MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [27]:
import json
from datasets import load_dataset,DatasetDict
# from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import torch
import os
import boto3
from botocore.exceptions import ClientError
import asyncio
import aioboto3
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
# from .autonotebook import tqdm as notebook_tqdm

True

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Initializing the notebook with the constants, variables and functions to be used

In [48]:
# AWS credentials, read from environment variables (None when a variable is unset).
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# S3 bucket that hosts the raw data and receives the transformed data.
s3_bucket = 'medical-entity-recognition-bucket'
# Keys of the hosted raw data.
s3_train_data_key = 'HistoricalData/AnEM.train'
s3_test_data_key = 'HistoricalData/AnEM.test'
# Keys under which the transformed JSON data is stored.
s3_train_json_data_key = 'transformed_to_json/train.json'
s3_test_json_data_key = 'transformed_to_json/test.json'

# Local directories for the downloaded raw data and the transformed data.
raw_data_path = Path('Historical_data')
transformed_data_path = Path('transformed_dataset')
# Local JSON file paths, kept as plain strings.
train_json_data_path = str(transformed_data_path / 'train.json')
test_json_data_path = str(transformed_data_path / 'test.json')

# S3 resource created with the explicit credentials read above.
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)


In [50]:
def create_directory(directory: Path) -> None:
    """
    Create a directory (and any missing parents) unless it already exists.

    Prints a message saying whether the directory was created or was
    already present.

    Args:
        directory (Path): The directory path to create. A plain string path
            is accepted as well.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    directory = Path(directory)

    # is_dir() rather than exists(): a regular file sitting at this path must
    # not be reported as an existing directory. In that case mkdir() below
    # raises FileExistsError instead of letting later file writes fail.
    if directory.is_dir():
        print(f"Directory '{directory}' already exists.")
        return

    directory.mkdir(parents=True, exist_ok=True)
    print(f"Directory '{directory}' created.")

In [15]:
def download_s3_object(bucket_name: str, object_key: str, local_dir: str) -> None:
    """
    Download an object from an S3 bucket
    and save it to a local directory.

    The object is written into ``local_dir`` under the base name of
    ``object_key``. Failures are reported with a printed message; nothing
    is raised to the caller.

    Args:
        bucket_name (str): Name of the S3 bucket to read from.
        object_key (str): Key of the object inside the bucket.
        local_dir (str): Existing local directory to save the file in.
    """
    try:
        local_file_path = os.path.join(local_dir, os.path.basename(object_key))
        s3_resource.Bucket(bucket_name).download_file(object_key, local_file_path)

    except ClientError as e:
        error_code = e.response.get('Error', {}).get('Code')
        # download_file issues a HeadObject request before transferring, so a
        # missing key is reported with the HTTP-style code '404' rather than
        # 'NoSuchKey'. Both are accepted here.
        if error_code in ('NoSuchKey', '404'):
            print(f"The object with key '{object_key}' does not exist in the bucket '{bucket_name}'.")
        else:
            print(f"Error downloading object: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

In [23]:
def convert_dataset_to_json(input_file_path: str, output_file_path: str) -> None:
    """
    Converts a dataset from a text format to JSON format.

    The input holds one token per line with tab-separated columns: the token
    first, the start and end index second and third, and the label last.
    Blank lines separate examples. The output is a JSON list with one object
    per example, of the form
    ``{"text": [...], "label": [...], "token_indices": [[start, end], ...]}``.

    Args:
        input_file_path (str): Path of the tab-separated input file.
        output_file_path (str): Path of the JSON file to write.

    Raises:
        IndexError, ValueError: If a non-blank line has fewer than three
            columns or non-integer indices.
    """
    dataset = []
    example = None  # example currently being accumulated, if any

    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            stripped = line.strip()
            if not stripped:
                # A blank line closes the current example.
                if example is not None:
                    dataset.append(example)
                    example = None
                continue

            parts = stripped.split("\t")
            if example is None:
                example = {"text": [], "label": [], "token_indices": []}
            example["text"].append(parts[0])
            example["label"].append(parts[-1])
            example["token_indices"].append((int(parts[1]), int(parts[2])))

    # Keep the final example even when the file has no trailing blank line.
    if example is not None:
        dataset.append(example)

    # Write only after parsing succeeded, so a malformed input does not leave
    # a truncated output file behind.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        json.dump(dataset, output_file, indent=4)



In [43]:

def upload_file_to_s3(bucket_name: str, local_file_path: str, s3_key: str) -> None:
    """
    Upload a local file to an S3 bucket.

    Prints a confirmation on success. S3 failures are reported with a printed
    message; nothing is raised to the caller for them.

    Args:
        bucket_name (str): Name of the destination S3 bucket.
        local_file_path (str): Path of the local file to upload.
        s3_key (str): Key to store the object under.
    """
    # Imported here because the notebook's import cell does not bring it in.
    from boto3.exceptions import S3UploadFailedError

    try:
        # Upload the file to S3
        s3_resource.Bucket(bucket_name).upload_file(Filename=local_file_path, Key=s3_key)
        print(f"File '{local_file_path}' uploaded to '{bucket_name}/{s3_key}'.")

    except (ClientError, S3UploadFailedError) as e:
        # upload_file wraps service-side failures in S3UploadFailedError, so
        # catching ClientError alone lets those escape.
        print(f"Error uploading file to S3: {e}")


## Data  Ingestion

### Creating the required folder


In [53]:
# Local working directories: first the one for the historical (raw) data,
# then the one for the transformed JSON data.
for directory in (raw_data_path, transformed_data_path):
    create_directory(directory)

Directory 'Historical_data' created.
Directory 'transformed_dataset' created.


### Downloading historical data from S3 bucket

In [54]:
# Download the historical train file, then the historical test file, from S3
# into the raw-data directory.
for object_key in (s3_train_data_key, s3_test_data_key):
    download_s3_object(
        bucket_name=s3_bucket,
        object_key=object_key,
        local_dir=raw_data_path,
    )

'Historical_data\\AnEM.test'

### Converting the raw training and test data to JSON format

In [55]:
# Convert the downloaded train file, then the test file, to JSON format.
# Each pair is (S3 key of the raw file, local JSON output path).
conversions = (
    (s3_train_data_key, train_json_data_path),
    (s3_test_data_key, test_json_data_path),
)
for data_key, json_path in conversions:
    convert_dataset_to_json(
        input_file_path=os.path.join(raw_data_path, os.path.basename(data_key)),
        output_file_path=json_path,
    )

### Storing the JSON data in the S3 bucket


In [56]:
# Store the train JSON, then the test JSON, in the S3 bucket.
# Each pair is (local JSON path, destination S3 key).
uploads = (
    (train_json_data_path, s3_train_json_data_key),
    (test_json_data_path, s3_test_json_data_key),
)
for local_path, s3_key in uploads:
    upload_file_to_s3(
        bucket_name=s3_bucket,
        local_file_path=local_path,
        s3_key=s3_key,
    )

File 'transformed_dataset\train.json' uploaded to 'medical-entity-recognition-bucket/transformed_to_json/train.json'.
File 'transformed_dataset\test.json' uploaded to 'medical-entity-recognition-bucket/transformed_to_json/test.json'.


### Transforming the JSON data into a Hugging Face dataset

In [7]:
# Load the training JSON file written by the conversion step above.
# The path comes from the notebook's own constant rather than a
# machine-specific absolute path, so the cell runs on any checkout.
train_json = train_json_data_path
dataset = load_dataset('json', data_files=train_json)

# Extract unique labels from the dataset
unique_labels = set()
for example in dataset['train']:
    unique_labels.update(example['label'])

# Sort the unique labels alphabetically so the id assignment is deterministic
label_list = sorted(unique_labels)
print("Label List:", label_list)

# Map each label to its integer id (its position in the sorted list)
label2id = {label: encoded for encoded, label in enumerate(label_list)}

# Inverse map: integer id back to label
id2label = {encoded: label for label, encoded in label2id.items()}

# Print or use the dictionaries
print("Label to Encoded:", label2id)
print("Encoded to Label:", id2label)


Label List: ['B-Anatomical_system', 'B-Cell', 'B-Cellular_component', 'B-Developing_anatomical_structure', 'B-Immaterial_anatomical_entity', 'B-Multi-tissue_structure', 'B-Organ', 'B-Organism_subdivision', 'B-Organism_substance', 'B-Pathological_formation', 'B-Tissue', 'I-Anatomical_system', 'I-Cell', 'I-Cellular_component', 'I-Developing_anatomical_structure', 'I-Immaterial_anatomical_entity', 'I-Multi-tissue_structure', 'I-Organ', 'I-Organism_subdivision', 'I-Organism_substance', 'I-Pathological_formation', 'I-Tissue', 'O']
Label to Encoded: {'B-Anatomical_system': 0, 'B-Cell': 1, 'B-Cellular_component': 2, 'B-Developing_anatomical_structure': 3, 'B-Immaterial_anatomical_entity': 4, 'B-Multi-tissue_structure': 5, 'B-Organ': 6, 'B-Organism_subdivision': 7, 'B-Organism_substance': 8, 'B-Pathological_formation': 9, 'B-Tissue': 10, 'I-Anatomical_system': 11, 'I-Cell': 12, 'I-Cellular_component': 13, 'I-Developing_anatomical_structure': 14, 'I-Immaterial_anatomical_entity': 15, 'I-Multi-t

In [8]:
# Hold out 20% of the loaded examples for validation; the fixed seed keeps
# the split reproducible.
split = dataset['train'].train_test_split(test_size=0.2, train_size=0.8, seed=42)

# Re-key the held-out part from 'test' to 'validation'.
dataset = DatasetDict({'train': split['train'], 'validation': split['test']})
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'token_indices'],
        num_rows: 1812
    })
    validation: Dataset({
        features: ['label', 'text', 'token_indices'],
        num_rows: 454
    })
})

In [9]:
# working with small sample for testing prupose only please comment this during training
train  = dataset['train'].select(list(range(100)))
validation = train  = dataset['validation'].select(list(range(100)))

small_dataset= DatasetDict({
    'train': train,
    'validation': validation
})

small_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'token_indices'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['label', 'text', 'token_indices'],
        num_rows: 100
    })
})

In [None]:
# example = dataset["train"][0]
# tokenized_input = tokenizer(example["text"], is_split_into_words=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# print(tokens)
# print(tokenized_input)
# print(len(example["text"]),len(tokens),len(tokenized_input["input_ids"]))

In [None]:
# tokenized_input.word_ids(batch_index=0)

### Transforming the dataset to be compatible with Hugging Face Transformers


After tokenization, a single word corresponding to a single label may be split into two or more subwords. You'll need to realign the tokens and labels by:

- Mapping all tokens to their corresponding word with the `word_ids` method.
- Assigning the label -100 to the special tokens [CLS] and [SEP] so they're ignored by the PyTorch loss function (see CrossEntropyLoss).
- Only labeling the first token of a given word, and assigning -100 to the other subtokens of the same word.

In [10]:
from transformers import BertTokenizerFast, BertForTokenClassification

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split examples and align their labels to the tokens.

    Special tokens and every subword after the first one of a word receive
    the label id -100; the first subword of each word receives the id of
    that word's label.

    Args:
        examples: Batch with a "text" column (lists of words) and a "label"
            column (lists of label strings, one per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example.
    """
    encodings = tokenizer(examples["text"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, tags in enumerate(examples["label"]):
        tag_ids = [label2id[tag] for tag in tags]

        token_labels = []
        last_word = None
        # word_ids maps each token to the index of the word it came from
        # (None for special tokens).
        for word in encodings.word_ids(batch_index=batch_idx):
            if word is None or word == last_word:
                # Special token, or a continuation subword of the same word.
                token_labels.append(-100)
            else:
                # First subword of a new word carries the word's label.
                token_labels.append(tag_ids[word])
            last_word = word

        aligned_labels.append(token_labels)

    encodings["labels"] = aligned_labels
    return encodings

In [11]:
# Tokenize and label-align every split of the small sample, passing examples
# to the function in batches.
transformer_dataset = small_dataset.map(tokenize_and_align_labels,batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map: 100%|██████████| 100/100 [00:00<00:00, 281.30 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 203.22 examples/s]


In [12]:
# Move the original string tags from "label" to "tags", leaving the aligned
# integer ids under "labels".
# NOTE(review): presumably done so the Trainer/collator does not pick up the
# raw string "label" column as the target — confirm.
transformer_dataset = transformer_dataset.rename_column("label", "tags")

In [13]:
transformer_dataset

DatasetDict({
    train: Dataset({
        features: ['tags', 'text', 'token_indices', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['tags', 'text', 'token_indices', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [14]:
# Inspect the aligned label ids of the first training example
# (-100 marks special tokens and non-initial subwords).
print((transformer_dataset['train'][0]['labels']))


[-100, 22, 22, 22, 22, -100, -100, 22, -100, 22, 22, 22, 22, 22, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, -100, -100, -100, -100, -100, -100, -100, -100, 22, 22, 22, 22, 22, 22, 22, -100]


## Model fine-tuning

### Defining the evaluation metric for the validation and test data

In [15]:
import evaluate

# Load the seqeval metric through the `evaluate` library; compute_metrics
# below calls it.
seqeval = evaluate.load("seqeval")

In [16]:
import numpy as np

def compute_metrics(p):
    """
    Compute overall seqeval precision, recall, F1 and accuracy.

    Args:
        p: Pair ``(predictions, labels)``. The argmax of ``predictions`` is
            taken over axis 2; positions whose label is -100 are ignored.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only positions that carry a real label.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

### Defining training arguments and initializing trainer compatible for hugging face transformer library

In [18]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

torch.cuda.empty_cache()

# Pass the label maps along with the label count so the model config (and any
# saved checkpoint) carries the real tag names instead of generic LABEL_n ids.
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Collator built from the tokenizer; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    # NOTE(review): absolute, Colab-style checkpoint path — confirm it is the
    # intended location on the machine running this notebook.
    output_dir= '/content/chekpoints',
    learning_rate=2e-5,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    # Evaluate and save once per epoch; the two strategies are set to match
    # because load_best_model_at_end is enabled.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,data_collator=data_collator,
    args=training_args,
    train_dataset=transformer_dataset['train'],
    eval_dataset=transformer_dataset['validation'],
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training the model

In [19]:
# Train the model
trainer.train()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                
 33%|███▎      | 50/150 [00:58<01:15,  1.33it/s]

{'eval_loss': 0.36464211344718933, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9418896321070234, 'eval_runtime': 16.2942, 'eval_samples_per_second': 6.137, 'eval_steps_per_second': 3.069, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 67%|██████▋   | 100/150 [03:07<00:48,  1.04it/s]

{'eval_loss': 0.29774752259254456, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9418896321070234, 'eval_runtime': 16.1974, 'eval_samples_per_second': 6.174, 'eval_steps_per_second': 3.087, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
                                                 
100%|██████████| 150/150 [04:47<00:00,  1.25it/s]

{'eval_loss': 0.25195467472076416, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9418896321070234, 'eval_runtime': 18.127, 'eval_samples_per_second': 5.517, 'eval_steps_per_second': 2.758, 'epoch': 3.0}


100%|██████████| 150/150 [05:27<00:00,  2.18s/it]

{'train_runtime': 327.4439, 'train_samples_per_second': 0.916, 'train_steps_per_second': 0.458, 'train_loss': 0.47984156290690105, 'epoch': 3.0}





TrainOutput(global_step=150, training_loss=0.47984156290690105, metrics={'train_runtime': 327.4439, 'train_samples_per_second': 0.916, 'train_steps_per_second': 0.458, 'total_flos': 6441779579640.0, 'train_loss': 0.47984156290690105, 'epoch': 3.0})

In [None]:
# Evaluate the trained model on the test dataset
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test results:", test_results)