In [None]:
! pip install -U spacy -q

In [None]:
!python -m spacy info

[1m

spaCy version    3.7.5                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-6.1.85+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.7.1)        



In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# This pipeline is used further down only through nlp.make_doc, to tokenise the annotated texts.
nlp = spacy.blank("en") # create a blank English pipeline (nothing is loaded from disk)
db = DocBin() # container that collects the annotated Docs before they are saved


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json

# Read the annotation export. The printed value below shows a dict with a
# 'classes' list and an 'annotations' list of [text, {...}] pairs.
# A context manager is used so the file handle is closed as soon as parsing
# finishes instead of staying open for the rest of the session.
with open(r"/content/annotations (3).json", encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

In [None]:
TRAIN_DATA

{'classes': ['TRIMESTER',
  'PLACENTA',
  'LIQUOR',
  'FETAL ACTIVITY',
  'CARDIAC ACTIVITY',
  'FETAL HEART BEAT',
  'CROWN LUMP LENGTH',
  'BIPARIETAL DIAMETER',
  'HEAD CIRCUMFERENCE',
  'ABDOMINAL CIRCUMFERENCE'],
 'annotations': [['JAMM SCANS DEPARTMENT OF FETAL MEDICINE No:16 Vaidhyaraman Street Tnagar Patient name AgelSex 31 Years Female Patient ID Visit no Referred bY Visit date LMP date 02/03/2023 \r\n\r\nLMP EDD: 07/12/2023[12W 1DL OB 3\r\n\r\n First Trimester Scan Report Indication(s) First trimester screening \r\n\r\nReal time B-mode ultrasonography of gravid uterus done_ Route: Transabdominal and Transvaginal Single intrauterine gestation Medicalnotes \r\n\r\nBlood group AIB+ve Height 159 cms Weight : 48.2kgs Marital History : 4 years   Consanguinity : NCM Menstrual History Regular\r\n\r\n Gravida 2 Para 1 Live 1 Abortion : 0 Significant previous obstetric details Nil\r\n\r\n Medical Surgical History Lscs. Maternal Cervix measured 3.10 cm in length.\r\n\r\n Right Uterine 1

In [None]:
# Convert the annotated examples into spaCy's binary training format.
# A fresh DocBin is created here so that re-running this cell does not append
# the same documents again to a container left over from an earlier run.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps character offsets that cut through a token inwards to
        # whole tokens; char_span gives None when no token fits inside the offsets.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be fixed at the source.
            print(f"Skipping entity {label!r} at [{start}, {end}): {text[start:end]!r}")
        else:
            ents.append(span)
    # Assigning overlapping spans to doc.ents raises an error; filter_spans keeps
    # the longest span of every overlapping group and drops the rest.
    doc.ents = spacy.util.filter_spans(ents)
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 1/1 [00:00<00:00, 74.29it/s]


In [None]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    109.97    0.00    0.00    0.00    0.00
200     200         35.73   2141.32  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  

In [None]:
# Load the trained pipeline written by the `spacy train --output ./` run above.
# NOTE(review): the absolute path assumes the notebook's working directory is /content — confirm.
nlp_ner = spacy.load("/content/model-best")



In [None]:
import json
import spacy

# Trained pipeline to be scored
nlp = spacy.load("/content/model-best")

# Gold annotations. This is the same path the training data was read from
# earlier in the notebook, so the scores below describe fit to the training
# examples rather than performance on unseen reports.
with open("/content/annotations (3).json", "r") as f:
    evaluation_data = json.load(f)

# Same layout as the training data: a list of (text, {"entities": [...]}) pairs
eval_annotations = evaluation_data['annotations']

# Running tallies across all examples
total_entities = 0        # gold entities seen
correct_predictions = 0   # gold entities reproduced exactly (start, end, label)
predicted_entities = 0    # entities emitted by the model

for text, annot in eval_annotations:
    doc = nlp(text)

    # Model output as (start_char, end_char, label) triples
    predicted = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    # Gold annotations in the same triple form, so they compare directly
    gold = [(start, end, label) for start, end, label in annot['entities']]

    total_entities += len(gold)
    correct_predictions += sum(1 for triple in gold if triple in predicted)
    predicted_entities += len(predicted)

# Precision / recall / F1, each falling back to 0 when its denominator is empty
if predicted_entities > 0:
    precision = correct_predictions / predicted_entities
else:
    precision = 0

if total_entities > 0:
    recall = correct_predictions / total_entities
else:
    recall = 0

if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0

# Every metric must reach this value for the model to pass
minimum_score = 0.90  # 90% threshold

for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1 Score", f1_score)):
    print(f"{metric_name}: {metric_value:.4f}")

# Pass only when all three metrics clear the threshold
if all(score >= minimum_score for score in (precision, recall, f1_score)):
    print(f"\n  model meets the minimum {minimum_score * 100:.0f}% score threshold.")
else:
    print(f"\nYour model does not meet the minimum {minimum_score * 100:.0f}% score threshold. Further improvement may be needed.")

Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

  model meets the minimum 90% score threshold.


In [None]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->easyocr)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==

In [None]:
from google.colab import files

# Ask the user to upload an image; the keys of `uploaded` are used below as file names.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError when nothing was uploaded.
    raise ValueError("No file was uploaded; re-run this cell and choose an image.")
image_path = next(iter(uploaded))  # name of the first uploaded file


KeyboardInterrupt: 

In [None]:
import easyocr
reader = easyocr.Reader(['en'])  # Adjust language(s) as needed

def extract_text_from_image(image_path):
    """Run OCR on an image and return the recognised text as a single string.

    Uses the module-level ``reader``. The text of every detection (element 1
    of each result entry) is followed by one space, so a non-empty result ends
    with a trailing space and an image with no detections yields "".
    """
    detections = reader.readtext(image_path, detail=1)
    return "".join(entry[1] + " " for entry in detections)

In [None]:
# OCR the uploaded image into one space-separated string.
extracted_text = extract_text_from_image(image_path)

# Show the raw OCR text so recognition errors can be spotted before entity extraction.
print(extracted_text)

In [None]:
# Run the trained NER pipeline over the OCR text; the entities are then available as doc1.ents.
doc1 = nlp_ner(extracted_text)

In [None]:
# Render the recognised entities inline in the notebook.
# NOTE(review): with jupyter=True displaCy displays the markup instead of returning it,
# so visualization_output is expected to be None — confirm before using it downstream.
visualization_output = spacy.displacy.render(doc1, style="ent", jupyter=True) # display in Jupyter

In [None]:
import pandas as pd

In [None]:
# Tabulate the recognised entities, one row per entity.
# The displaCy call above was asked to render inline (jupyter=True), so
# `visualization_output` holds nothing a table can be built from; read the
# entities directly from the parsed document instead.
gen = pd.DataFrame(
    [(ent.text, ent.label_) for ent in doc1.ents],
    columns=["Entity", "Label"],
)

In [None]:
import pandas as pd
import spacy

# Reload the trained pipeline from disk
nlp = spacy.load("/content/model-best")

# The OCR output is the text to analyse
paragraph = extracted_text
doc = nlp(paragraph)

# One (text, label) pair per entity found by the pipeline
entities = [(span.text, span.label_) for span in doc.ents]

# Map entity text -> label; a text that occurs more than once keeps its last label
entity_dict = dict(entities)

# Show the mapping
print(entity_dict)

In [None]:
def reverse_dict(entity_dict):
    """Return a new dict with the keys and values of ``entity_dict`` swapped.

    When several keys share the same value, the key met last in iteration
    order is the one kept for that value.
    """
    swapped = {}
    for original_key, original_value in entity_dict.items():
        swapped[original_value] = original_key
    return swapped

# Flip the text -> label mapping into label -> text so values can be looked up by label.
reversed_dict = reverse_dict(entity_dict)
print( reversed_dict)

In [None]:
# Define the reference ranges for each parameter
reference_ranges = {
    "TRIMESTER": "First Trimester Scan Report",
    "PLACENTA": "Anterior",
    "LIQUOR": "Normal",
    "CARDIAC_ACTIVITY": "Cardiac activity present",
    "FETAL_HEART_BEAT": (110, 160),  # bpm
    "CROWN_LUMP_LENGTH": (43, 60),  # mm
    "BIPARIETAL_DIAMETER": (17, 42),  # mm
    "HEAD_CIRCUMFERENCE": (60, 80),  # mm
    "ABDOMINAL_CIRCUMFERENCE": (50, 60),  # mm
}

# Initialize a flag to track if all values are within range
in_range = True

# entity_dict maps entity text -> label, whereas the table above is keyed by
# label. Invert it, and spell the labels with underscores (the annotation
# classes use spaces, e.g. 'FETAL HEART BEAT') so the lookups below can match.
data = {
    entity_label.replace(" ", "_"): entity_text
    for entity_text, entity_label in entity_dict.items()
}

# Compare every extracted parameter with its reference; stop at the first failure
for label, reference_value in reference_ranges.items():
    if label not in data:
        continue  # parameter was not extracted: nothing to compare
    entity = data[label]
    if isinstance(reference_value, tuple):  # numeric (low, high) range
        try:
            measured = float(entity)
        except ValueError:
            # Text that is not a number cannot be confirmed as in range
            in_range = False
            break
        if not (reference_value[0] <= measured <= reference_value[1]):
            in_range = False
            break
    elif entity != reference_value:  # exact-match string reference
        in_range = False
        break

# Output the result
if in_range:
    print("Fetus is in good condition")
else:
    print("Fetus is not in good condition")


In [None]:
# Define the reference ranges for each parameter
reference_ranges = {
    "TRIMESTER": "First Trimester Scan Report",
    "LIQUOR": "Normal",
    "CARDIAC ACTIVITY": "present",
    "FETAL HEART BEAT": (110, 160),  # bpm
    "CROWN LUMP LENGTH": (43, 60),  # mm
    "BIPARIETAL DIAMETER": (17, 42),  # mm
    "HEAD CIRCUMFERENCE": (60, 80),  # mm
    "ABDOMINAL CIRCUMFERENCE": (50, 60),  # mm
}

# reversed_dict maps label -> extracted text, e.g. {'FETAL HEART BEAT': '167'}

# Initialize a flag to track if all values are within range
in_range = True

# Check every reference parameter and report each problem found
for label, reference_value in reference_ranges.items():
    if label not in reversed_dict:
        in_range = False
        print(f"{label} is missing from the extracted data.")
        continue

    entity = reversed_dict[label]
    if isinstance(reference_value, tuple):  # numeric (low, high) range
        # Only range parameters are parsed as numbers. Text that cannot be
        # parsed is reported here, because comparing a string with the numeric
        # bounds would raise TypeError.
        try:
            entity = float(entity)
        except ValueError:
            in_range = False
            print(f"{label}: {entity} is not a number; expected a value in range {reference_value}")
            continue
        if not (reference_value[0] <= entity <= reference_value[1]):
            in_range = False
            print(f"{label}: {entity} is out of range {reference_value}")
    elif entity != reference_value:  # exact-match string reference
        in_range = False
        print(f"{label}: {entity} is not {reference_value}")

# Output the result
if in_range:
    print("Fetus is in good condition")
else:
    print("Fetus is not in good condition")


In [None]:
# Define the reference ranges for each parameter
reference_ranges = {
    "TRIMESTER": "2/3 Trimester Scan Report",
    "PLACENTA": "Anterior",
    "LIQUOR": "Normal",
    "CARDIAC ACTIVITY": "present",

    "FETAL HEART BEAT": (120, 180),  # Heartbeat range in beats per minute
    "CROWN LUMP LENGTH": (115, 400),  # Length in millimeters
    "BIPARIETAL DIAMETER": (45, 88),  # Diameter in millimeters
    "HEAD CIRCUMFERENCE": (160, 240),  # Circumference in millimeters
    "ABDOMINAL CIRCUMFERENCE": (110, 190),  # Circumference in millimeters
    # NOTE(review): this label is not among the classes listed in the annotation
    # file shown earlier, so it may always be reported as missing — confirm.
    "TRANSVERSE CEREBELLAR DIAMETER": (15, 25)  # Diameter in millimeters
}

# reversed_dict (built in an earlier cell) maps label -> extracted text

# Initialize a flag to track if all values are within range
in_range = True

# Check every reference parameter and report each problem found
for label, reference_value in reference_ranges.items():
    if label not in reversed_dict:
        in_range = False
        print(f"{label} is missing from the extracted data.")
        continue

    entity = reversed_dict[label]
    if isinstance(reference_value, tuple):  # numeric (low, high) range
        # Only range parameters are parsed as numbers. Text that cannot be
        # parsed is reported here, because comparing a string with the numeric
        # bounds would raise TypeError.
        try:
            entity = float(entity)
        except ValueError:
            in_range = False
            print(f"{label}: {entity} is not a number; expected a value in range {reference_value}")
            continue
        if not (reference_value[0] <= entity <= reference_value[1]):
            in_range = False
            print(f"{label}: {entity} is out of range {reference_value}")
    elif entity != reference_value:  # exact-match string reference
        in_range = False
        print(f"{label}: {entity} is not {reference_value}")

# Output the result
if in_range:
    print("Fetus is in good condition")
else:
    print("Fetus is not in good condition")
