### Importing Necessary Libraries

In [97]:
import pandas as pd
import glob
import os 
import json

### Loading the generated CSV files and combining into a single DataFrame

In [98]:
def combine_csv_files_concise(directory='/home/md/Documents/ner_train/ner_datasets'):
    files = glob.glob(os.path.join(directory, 'ner_dataset_raw_batch_*.csv'))
    combined_df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    return combined_df

df = combine_csv_files_concise()

In [99]:
df

Unnamed: 0,text,tokens,ner_tags
0,Patient ko तेज बुखार (high fever) aur सीने में...,"['Patient', 'ko', 'तेज', 'बुखार', '(high', 'fe...","['O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'I-SYMPTO..."
1,"Patient ko तेज़ बुखार tha, aur usse सीने में द...","['Patient', 'ko', 'तेज़', 'बुखार', 'tha', 'aur...","['O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'O', 'O',..."
2,Patient ke blood reports mein high cholesterol...,"['Patient', 'ke', 'blood', 'reports', 'mein', ...","['O', 'O', 'B-TEST', 'I-TEST', 'O', 'B-CONDITI..."
3,Patient के ख़ून में sugar level बहुत ज़्यादा थ...,"['Patient', 'के', 'ख़ून', 'में', 'sugar', 'lev...","['O', 'O', 'O', 'O', 'B-SYMPTOM', 'I-SYMPTOM',..."
4,"Patient ke saath continuous chest pain hai, au...","['Patient', 'ke', 'saath', 'continuous', 'ches...","['O', 'O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'I-S..."
...,...,...,...
490,The patient with पैंफिसिटिस (Pleurisy) was und...,"['The', 'patient', 'with', 'पैंफिसिटिस', '(', ...","['O', 'O', 'O', 'B-CONDITION', 'O', 'I-CONDITI..."
491,शिशु में severe pneumonia का पता चला और DR. कु...,"['शिशु', 'में', 'severe', 'pneumonia', 'का', '...","['O', 'O', 'O', 'B-CONDITION', 'O', 'O', 'O', ..."
492,The emergency room doctor administered 2 units...,"['The', 'emergency', 'room', 'doctor', 'admini...","['O', 'B-CONDITION', 'O', 'O', 'O', 'B-DOSAGE'..."
493,Angioplasty was performed on 55-year-old patie...,"['Angioplasty', 'was', 'performed', 'on', '55-...","['B-PROCEDURE', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [100]:
df['text'].duplicated().sum()

np.int64(84)

In [101]:
df.head(n=3)

Unnamed: 0,text,tokens,ner_tags
0,Patient ko तेज बुखार (high fever) aur सीने में...,"['Patient', 'ko', 'तेज', 'बुखार', '(high', 'fe...","['O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'I-SYMPTO..."
1,"Patient ko तेज़ बुखार tha, aur usse सीने में द...","['Patient', 'ko', 'तेज़', 'बुखार', 'tha', 'aur...","['O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'O', 'O',..."
2,Patient ke blood reports mein high cholesterol...,"['Patient', 'ke', 'blood', 'reports', 'mein', ...","['O', 'O', 'B-TEST', 'I-TEST', 'O', 'B-CONDITI..."


### Converting the DataFrame to HuggingFace Dataset

In [102]:
# Define the mapping
label2id = {
    "O": 0,
    "B-CONDITION": 1,
    "I-CONDITION": 2,
    "B-MEDICATION": 3,
    "I-MEDICATION": 4,
    "B-PROCEDURE": 5,
    "I-PROCEDURE": 6,
    "B-SYMPTOM":8,
    "I-SYMPTOM": 7,
    "B-TEST":9,
    "I-TEST":10,
    "B-DOSAGE":11,
    "B-ALLERGEN":12,
    "I-ALLERGEN":13
}

# Convert tags to numbers with error handling
def convert_tags_to_ids(tags):
    return [label2id.get(tag, 0) for tag in tags]  # Returns 0 for any unknown tag

# Apply the conversion
df['ner_tags'] = df['ner_tags'].apply(eval)  # If needed
df['ner_tags_numeric'] = df['ner_tags'].apply(convert_tags_to_ids)

# Replace original column
df['ner_tags'] = df['ner_tags_numeric']
del df['ner_tags_numeric']

# Print unique tags to verify conversion
unique_tags = set([tag for tags in df['ner_tags'].tolist() for tag in tags])
print("Unique numeric tags in dataset:", sorted(unique_tags))

Unique numeric tags in dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


Shuffling is critical for ensuring the quality, robustness, and generalizability of models trained on the dataset. It eliminates biases introduced by the order of data in the original files and prepares the dataset for realistic evaluation and deployment scenarios.

In [103]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [104]:
df

Unnamed: 0,text,tokens,ner_tags
0,मुझे दो दिन पहले पेट में तेज़ दर्द और उल्टी हु...,"['मुझे', 'दो', 'दिन', 'पहले', 'पेट', 'में', 'त...","[0, 0, 0, 0, 0, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, ..."
1,"Patient ko तेज़ बुखार और सिर दर्द था, इसलिए डॉ...","['Patient', 'को', 'तेज़', 'बुखार', 'और', 'सिर'...","[0, 0, 0, 8, 0, 8, 7, 0, 0, 0, 0, 0, 3, 4, 0, ..."
2,The doctor ordered an emergency Angioplasty pr...,"['The', 'doctor', 'ordered', 'an', 'emergency'...","[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 1, 2, ..."
3,The 35-year-old पेशेवर गोलफ़र को Chest pain an...,"['The', '35-year-old', 'पेशेवर', 'गोलफ़र', 'को...","[0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 7, 0, 0, 0, 5, ..."
4,The 30-year-old patient was rushed to the emer...,"['The', '30-year-old', 'patient', 'was', 'rush...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
490,The patient complained of chest pain and short...,"['The', 'patient', 'complained', 'of', 'chest'...","[0, 0, 0, 0, 8, 7, 0, 8, 0, 7, 0, 0, 0, 0, 0, ..."
491,The patient was rushed to the ICU after suffer...,"['The', 'patient', 'was', 'rushed', 'to', 'the...","[0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 8, 0, 0, 0, 0, ..."
492,Patient underwent emergency Appendectomy due t...,"['Patient', 'underwent', 'emergency', 'Appende...","[0, 0, 0, 5, 0, 0, 1, 2, 0, 0, 0, 11, 3, 0, 11..."
493,The patient underwent an emergency Angioplasty...,"['The', 'patient', 'underwent', 'an', 'emergen...","[0, 0, 0, 0, 0, 5, 0, 0, 0, 8, 7, 0, 0, 0, 11,..."


### Defining Dataset Features and Splits for Dataset

In [105]:
label_names = list(label2id.keys())
label_names

['O',
 'B-CONDITION',
 'I-CONDITION',
 'B-MEDICATION',
 'I-MEDICATION',
 'B-PROCEDURE',
 'I-PROCEDURE',
 'B-SYMPTOM',
 'I-SYMPTOM',
 'B-TEST',
 'I-TEST',
 'B-DOSAGE',
 'B-ALLERGEN',
 'I-ALLERGEN']

In [106]:
from datasets import Dataset, DatasetDict, Features, ClassLabel, Sequence, Value, DatasetInfo


# Define features
features = Features({
    'text': Value('string'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(num_classes=len(label_names), names=label_names))
})

# Add info about the dataset
description = """
NER Tag Definitions:
{
    "B-CONDITION": "Medical condition (e.g., Diabetes, हृदय रोग, Asthma, TB)",
    "I-CONDITION": "Continuation of condition",
    "B-MEDICATION": "Medicines (e.g., Insulin, Paracetamol, Dolo-650)",
    "I-MEDICATION": "Continuation of medication",
    "B-PROCEDURE": "Medical procedure (e.g., Angioplasty, X-ray, डायलिसिस)",
    "I-PROCEDURE": "Continuation of procedure",
    "B-SYMPTOM": "Symptoms (e.g., बुखार, Chest pain, Fatigue)",
    "I-SYMPTOM": "Continuation of symptom",
    "B-TEST": "Lab/diagnostic test (e.g., ECG, Lipid Profile, ब्लड टेस्ट)",
    "I-TEST": "Continuation of test",
    "B-DOSAGE": "Dosage/measurement (e.g., 5mg, 200mL, twice daily)",
    "B-ALLERGEN": "Allergens (e.g., Penicillin, धूल, Peanuts)",
    "I-ALLERGEN": "Continuation of allergen",
    "O": "Non-entity tokens"
}
"""

# Create a DatasetInfo object
my_dataset_info = DatasetInfo(
    description=description,
    dataset_name="Medical Domain NER Datasets",
    features=features,
)

# Create dataset with features
dataset = Dataset.from_dict(
    {
        'text': df['text'].tolist(),
        'tokens': df['tokens'].apply(eval).tolist(),
        'ner_tags': df['ner_tags'].tolist()
    },
    info=my_dataset_info
)

# Create splits
dataset = dataset.train_test_split(test_size=0.2, seed=42)
test_valid = dataset['test'].train_test_split(test_size=0.5, seed=42)

# Create final dataset dictionary
hf_dataset = DatasetDict({
    'train': dataset['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})


In [107]:
# Print to verify
print("Features:", hf_dataset['train'].features)

Features: {'text': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-CONDITION', 'I-CONDITION', 'B-MEDICATION', 'I-MEDICATION', 'B-PROCEDURE', 'I-PROCEDURE', 'B-SYMPTOM', 'I-SYMPTOM', 'B-TEST', 'I-TEST', 'B-DOSAGE', 'B-ALLERGEN', 'I-ALLERGEN'], id=None), length=-1, id=None)}


In [108]:
hf_dataset["train"][0]

{'text': 'रोगी को हृदय रोग के कारण Angioplasty करवाना पड़ा और उसे 5mg Aspirin दिया गया क्योंकि उसके ECG में abnormal रीडिंग थी।',
 'tokens': ['रोगी',
  'को',
  'हृदय',
  'रोग',
  'के',
  'कारण',
  'Angioplasty',
  'करवाना',
  'पड़ा',
  'और',
  'उसे',
  '5mg',
  'Aspirin',
  'दिया',
  'गया',
  'क्योंकि',
  'उसके',
  'ECG',
  'में',
  'abnormal',
  'रीडिंग',
  'थी',
  '।'],
 'ner_tags': [0,
  0,
  1,
  2,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  11,
  3,
  0,
  0,
  0,
  0,
  9,
  0,
  0,
  0,
  0,
  0]}

In [109]:
hf_dataset["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-CONDITION', 'I-CONDITION', 'B-MEDICATION', 'I-MEDICATION', 'B-PROCEDURE', 'I-PROCEDURE', 'B-SYMPTOM', 'I-SYMPTOM', 'B-TEST', 'I-TEST', 'B-DOSAGE', 'B-ALLERGEN', 'I-ALLERGEN'], id=None), length=-1, id=None)

In [110]:
print(hf_dataset['train'].description)


NER Tag Definitions:
{
    "B-CONDITION": "Medical condition (e.g., Diabetes, हृदय रोग, Asthma, TB)",
    "I-CONDITION": "Continuation of condition",
    "B-MEDICATION": "Medicines (e.g., Insulin, Paracetamol, Dolo-650)",
    "I-MEDICATION": "Continuation of medication",
    "B-PROCEDURE": "Medical procedure (e.g., Angioplasty, X-ray, डायलिसिस)",
    "I-PROCEDURE": "Continuation of procedure",
    "B-SYMPTOM": "Symptoms (e.g., बुखार, Chest pain, Fatigue)",
    "I-SYMPTOM": "Continuation of symptom",
    "B-TEST": "Lab/diagnostic test (e.g., ECG, Lipid Profile, ब्लड टेस्ट)",
    "I-TEST": "Continuation of test",
    "B-DOSAGE": "Dosage/measurement (e.g., 5mg, 200mL, twice daily)",
    "B-ALLERGEN": "Allergens (e.g., Penicillin, धूल, Peanuts)",
    "I-ALLERGEN": "Continuation of allergen",
    "O": "Non-entity tokens"
}



In [111]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 396
    })
    validation: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 49
    })
    test: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 50
    })
})

### Saving the Dataset

In [112]:
# Suppose you have a Dataset or DatasetDict called 'hf_dataset'
hf_dataset.save_to_disk("../Dataset/medical_domain_ner_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 396/396 [00:00<00:00, 9490.03 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 49/49 [00:00<00:00, 3623.30 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 50/50 [00:00<00:00, 1652.15 examples/s]
