In [3]:
import json
import csv

# Load the JSON data
with open('../EDC_target_model/EDC_annotations.json', 'r') as f:
    data = json.load(f)

annotations = data['annotations']



In [4]:
annotations

[None,
 ['This is largely driven by anthropogenic changes to the environment, including the widespread exposure of invertebrates to endocrine disrupting chemicals (EDCs), which impair fertility.\r',
  {'entities': [[122, 152, 'ENDOCRINE_DISRUPTING_CHEMICAL'],
    [154, 158, 'ENDOCRINE_DISRUPTING_CHEMICAL']]}],
 ['To test whether generations of Drosophila melanogaster born from parents exposed to a common dietary EDC, equol, could recover reproductive function, we quantified the reproductive capacity of the two subsequent generations.\r',
  {'entities': [[101, 104, 'ENDOCRINE_DISRUPTING_CHEMICAL'],
    [106, 111, 'ENDOCRINE_DISRUPTING_CHEMICAL']]}],
 None,
 ['Though the sex ratio alters in response to EDC exposure, favouring the survival of female offspring, most lineages with ancestral EDC exposure exhibit persistent subfertility in both the male and female.\r',
  {'entities': [[43, 46, 'ENDOCRINE_DISRUPTING_CHEMICAL'],
    [130, 133, 'ENDOCRINE_DISRUPTING_CHEMICAL']]}],
 ['Male offspr

In [183]:
import pandas as pd 
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import en_ner_bionlp13cg_md

# Load the spaCy model
nlp = spacy.load("en_ner_bionlp13cg_md")

# Create the DocBin object
db = DocBin()

#There are some none entities in the annotations that need to be handled 
# Filter out None elements from the annotations list
filtered_annotations = [item for item in annotations if item is not None]

# Process the filtered annotations
for text, annot in tqdm(filtered_annotations):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot['entities']:
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # Label text with the ents
    db.add(doc)

# Change directory and save the DocBin object
os.chdir("./EDC_target_model")
db.to_disk("train.spacy")

  8%|▊         | 93/1145 [00:09<01:02, 16.86it/s] 

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 63%|██████▎   | 720/1145 [00:10<00:00, 455.68it/s]

Skipping entity
Skipping entity


100%|██████████| 1145/1145 [00:10<00:00, 104.21it/s]

Skipping entity
Skipping entity
Skipping entity





In [None]:
#E (Epoch):
#This column indicates the epoch number, which refers to how many #complete passes the training algorithm has made over the entire #training dataset.

In [None]:
# (Batch):
#This shows the batch number within the current epoch. Training is typically done in batches (subsets of the entire dataset), and this column tracks which batch is being processed.

In [None]:
#LOSS TOK2VEC:
#This column shows the loss value for the tok2vec component of the pipeline. The tok2vec layer is responsible for transforming the tokens into vector representations (embeddings) that can be used by the NER model.

In [None]:
#LOSS NER:
#This represents the loss associated with the Named Entity Recognition (NER) component. It indicates how well the model is learning to predict named entities. As training progresses, this value should generally decrease, indicating that the model is improving.

In [None]:
#ENTS_P (Precision for Entities):
#This shows the precision metric for the named entity predictions. Precision is the ratio of true positive named entity predictions to the total number of named entities predicted (both true positives and false positives).

In [None]:
#ENTS_F (F1-Score for Entities):
#The F1-score is a measure of a model's accuracy that considers both precision and recall. Specifically:

In [None]:
#ENTS_R (Recall for Entities):
#This shows the recall metric for the named entity predictions. Recall is the ratio of true positive named entity predictions to the total number of actual named entities in the dataset (both true positives and false negatives).

In [None]:
#SCORE:
#This is typically a shorthand for the overall performance score of the model on a validation set or during training, often referring to the F1-score, but it can sometimes represent another metric depending on the configuration of the training.

In [None]:
#table for the  ROC curve
#those are the metrics from the table that was generated by the model training

In [10]:
import pandas as pd

# Define the data variable
data = [
    ["E", "LOSS TOK2VEC", "LOSS NER", "ENTS_F", "ENTS_P", "ENTS_R", "SCORE"],
    [0, 0, 0.00, 17.10, 1.37, 1.25, 0.01],
    [0, 200, 251.28, 2177.45, 72.14, 78.96, 0.72],
    [1, 400, 203.89, 971.34, 84.62, 86.43, 0.85],
    [1, 600, 109.18, 838.74, 91.32, 92.87, 0.91],
    [2, 800, 138.42, 734.11, 93.26, 93.74, 0.93],
    [3, 1000, 219.72, 713.64, 94.70, 94.73, 0.95],
    [5, 1200, 232.07, 642.81, 97.10, 97.04, 0.97],
    [7, 1400, 291.06, 544.04, 97.61, 97.44, 0.98],
    [9, 1600, 357.20, 494.26, 98.54, 98.18, 0.99],
    [12, 1800, 364.83, 416.68, 99.16, 98.95, 0.99],
    [15, 2000, 462.39, 383.79, 99.02, 99.45, 0.99],
    [19, 2200, 631.86, 363.15, 99.43, 99.45, 0.99],
    [24, 2400, 567.40, 287.85, 99.81, 99.77, 1.00],
    [29, 2600, 658.22, 275.39, 99.80, 99.88, 1.00],
    [34, 2800, 455.98, 175.23, 99.84, 99.81, 1.00],
    [39, 3000, 811.45, 191.90, 99.94, 99.96, 1.00],
    [44, 3200, 379.70, 120.41, 99.88, 99.88, 1.00],
    [49, 3400, 681.01, 155.25, 99.84, 99.84, 1.00],
    [54, 3600, 545.93, 146.08, 99.92, 99.96, 1.00],
    [59, 3800, 503.62, 125.46, 99.94, 99.88, 1.00],
    [64, 4000, 700.58, 137.44, 99.94, 99.96, 1.00],
    [69, 4200, 482.59, 115.46, 99.71, 99.69, 1.00],
    [75, 4400, 737.91, 158.18, 99.88, 99.88, 1.00],
    [80, 4600, 615.42, 94.62, 99.90, 99.88, 1.00],
    [85, 4800, 596.78, 97.33, 99.92, 99.92, 1.00],
    [90, 5000, 715.43, 106.47, 99.81, 99.73, 1.00],
    [95, 5200, 786.16, 117.88, 99.96, 99.96, 1.00],
    [100, 5400, 858.02, 110.69, 99.96, 100.00, 1.00],
    [105, 5600, 943.74, 142.66, 99.92, 99.92, 1.00],
    [110, 5800, 515.02, 79.29, 99.90, 99.84, 1.00],
    [115, 6000, 710.78, 87.02, 99.90, 99.84, 1.00],
    [120, 6200, 794.46, 82.42, 99.92, 99.88, 1.00],
    [125, 6400, 1047.44, 104.75, 99.94, 99.92, 1.00],
    [130, 6600, 669.36, 100.62, 99.92, 99.92, 1.00],
    [135, 6800, 442.76, 63.37, 99.94, 99.92, 1.00],
]

# Create the DataFrame
df = pd.DataFrame(data[1:], columns=data[0])

# Display the DataFrame
print(df)

# Find the row with the highest F1 score (ENTS_F)
best_metrics = df.loc[df['ENTS_F'].idxmax()]




      E  LOSS TOK2VEC  LOSS NER   ENTS_F  ENTS_P  ENTS_R  SCORE
0     0             0      0.00    17.10    1.37    1.25   0.01
1     0           200    251.28  2177.45   72.14   78.96   0.72
2     1           400    203.89   971.34   84.62   86.43   0.85
3     1           600    109.18   838.74   91.32   92.87   0.91
4     2           800    138.42   734.11   93.26   93.74   0.93
5     3          1000    219.72   713.64   94.70   94.73   0.95
6     5          1200    232.07   642.81   97.10   97.04   0.97
7     7          1400    291.06   544.04   97.61   97.44   0.98
8     9          1600    357.20   494.26   98.54   98.18   0.99
9    12          1800    364.83   416.68   99.16   98.95   0.99
10   15          2000    462.39   383.79   99.02   99.45   0.99
11   19          2200    631.86   363.15   99.43   99.45   0.99
12   24          2400    567.40   287.85   99.81   99.77   1.00
13   29          2600    658.22   275.39   99.80   99.88   1.00
14   34          2800    455.98   175.23

In [11]:
print("Best metrics based on F1 score:")
print(best_metrics)

Best metrics based on F1 score:
E                  0.00
LOSS TOK2VEC     200.00
LOSS NER         251.28
ENTS_F          2177.45
ENTS_P            72.14
ENTS_R            78.96
SCORE              0.72
Name: 1, dtype: float64


In [None]:
#learning rate 0.001

In [7]:
import pandas as pd
data = pd.read_csv("metrics.csv")

Unnamed: 0,E,LOSS TOK2VEC,LOSS NER,ENTS_F,ENTS_P,ENTS_R,SCORE
0,0,0.0,17.1,1.37,1.25,1.52,0.01
0,200,251.28,2177.45,72.14,78.96,66.41,0.72
1,400,203.89,971.34,84.62,86.43,82.89,0.85
1,600,109.18,838.74,91.32,92.87,89.83,0.91
2,800,138.42,734.11,93.26,93.74,92.79,0.93


In [12]:
import matplotlib.pyplot as plt
epochs = data['E']
f1_score = data['ENTS_F']
precision = data['ENTS_P']
recall = data['ENTS_R']

# Plot F1 Score, Precision, and Recall
plt.figure(figsize=(10, 6))
plt.plot(epochs, f1_score, label='F1 Score', marker='o')
plt.plot(epochs, precision, label='Precision', marker='o')
plt.plot(epochs, recall, label='Recall', marker='o')
plt.xlabel('Epochs')
plt.ylabel('Percentage')
plt.title('F1 Score, Precision, and Recall over Epochs')
plt.legend()
plt.grid(True)
plt.show()

TypeError: list indices must be integers or slices, not str

In [None]:
from sklearn.model_selection import KFold
import spacy
from spacy.training import Example

# Load your data
data = './articles.csv'  # Replace with your actual data loading

# Create k-fold cross-validation splits
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(data):
    train_data = [data[i] for i in train_index]
    test_data = [data[i] for i in test_index]
    
    # Initialize a blank model
    nlp = spacy.blank("en")
    ner = nlp.add_pipe("ner")

    # Train the model
    # (Include your training code here)

    # Evaluate the model
    # (Include your evaluation code here)

    # Print or store performance metrics
    # (Include code to print/store metrics)
