In [1]:
# import os
# import json
# import random
# from faker import Faker
# from sklearn.model_selection import train_test_split

# # Initialize Faker for generating random data
# fake = Faker()

# # Create synthetic essay-like data
# def create_synthetic_data(num_examples=500):
#     examples = []
#     for _ in range(num_examples):
#         # Generate random entities
#         name = fake.name()
#         institution = fake.company() + " University"
#         course = fake.job()
#         student_number = fake.random_int(min=10000, max=99999)
#         course_id = fake.random_int(min=100, max=999)
        
#         # Generate random essay-like content
#         base_text = fake.paragraph(nb_sentences=5)
        
#         # Insert entities into the essay
#         insert_positions = sorted(random.sample(range(len(base_text)), 5))
#         text = (
#             base_text[:insert_positions[0]] + name + base_text[insert_positions[0]:insert_positions[1]] +
#             str(student_number) + base_text[insert_positions[1]:insert_positions[2]] + course +
#             base_text[insert_positions[2]:insert_positions[3]] + str(course_id) +
#             base_text[insert_positions[3]:insert_positions[4]] + institution +
#             base_text[insert_positions[4]:]
#         )
        
#         # Calculate entity positions
#         annotations = {
#             "entities": [
#                 (insert_positions[0], insert_positions[0] + len(name), "PERSON"),
#                 (insert_positions[1], insert_positions[1] + len(str(student_number)), "STUDENT_NUMBER"),
#                 (insert_positions[2], insert_positions[2] + len(course), "COURSE"),
#                 (insert_positions[3], insert_positions[3] + len(str(course_id)), "COURSE_ID"),
#                 (insert_positions[4], insert_positions[4] + len(institution), "ORG")
#             ]
#         }
        
#         examples.append((text, annotations))
    
#     return examples

# # Save data to a folder
# def save_data(data, folder_path):
#     if not os.path.exists(folder_path):
#         os.makedirs(folder_path)
    
#     for i, (text, annotations) in enumerate(data):
#         file_path = os.path.join(folder_path, f"example_{i}.json")
#         with open(file_path, 'w') as f:
#             json.dump({"text": text, "annotations": annotations}, f)

# # Main script
# if __name__ == "__main__":
#     # Create synthetic data
#     data = create_synthetic_data(500)
    
#     # Split data into train and test sets (80% train, 20% test)
#     train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    
#     # Define paths
#     train_folder = "education_data/train"
#     test_folder = "education_data/test"
    
#     # Save train data
#     save_data(train_data, train_folder)
    
#     # Save test data
#     save_data(test_data, test_folder)
    
#     # Print an example
#     print("Example from training data:", train_data[0])
#     print("Example from testing data:", test_data[0])


In [2]:
import os
import json

def load_data_from_folder(folder_path):
    texts = []
    annotations = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            data = json.load(file)
            texts.append(data['text'])
            annotations.append(data['annotations']['entities'])
    return texts, annotations

# Load training data
train_texts, train_annotations = load_data_from_folder('education_data/train')
# Load testing data
test_texts, test_annotations = load_data_from_folder('education_data/test')

# Print one example from training data
print("Example from training data:")
print("Text:", train_texts[0])
print("Annotations:", train_annotations[0])

# Print one example from testing data
print("\nExample from testing data:")
print("Text:", test_texts[0])
print("Annotations:", test_annotations[0])


Example from training data:
Text: Free wife range scienKevin Jonesce win energy. Build chance policy a74354pprChiropodistoach. Social bed involve establi876sh several watch expectDavidson PLC University.
Annotations: [[21, 32, 'PERSON'], [57, 62, 'STUDENT_NUMBER'], [60, 71, 'COURSE'], [92, 95, 'COURSE_ID'], [115, 138, 'ORG']]

Example from testing data:
Text: Party per small prove positive year. Themselves prove less monthOscar Short cam87069paign blood. Someone way drive mission radio mention. Drop before third agreement reducTherapist, sportse base. Law sell es358peciJohnson and Sons Universityally tell.
Annotations: [[64, 75, 'PERSON'], [68, 73, 'STUDENT_NUMBER'], [155, 172, 'COURSE'], [174, 177, 'COURSE_ID'], [178, 205, 'ORG']]


In [3]:
def filter_overlapping_entities(entities):
    # Sort entities by their start position
    entities = sorted(entities, key=lambda x: x[0])
    filtered_entities = []
    last_end = -1

    for start, end, label in entities:
        if start >= last_end:
            filtered_entities.append((start, end, label))
            last_end = end

    return filtered_entities

# Apply the filter to the annotations
train_annotations = [filter_overlapping_entities(annotations) for annotations in train_annotations]
test_annotations = [filter_overlapping_entities(annotations) for annotations in test_annotations]


In [9]:
import spacy
from spacy.training import Example

# Load a blank model
nlp = spacy.blank("en")

# Add a NER component to the pipeline
ner = nlp.add_pipe("ner")

# Add labels to the NER component
for annotations in train_annotations:
    for start, end, label in annotations:
        ner.add_label(label)

# Prepare training data
train_data = []
for text, annotations in zip(train_texts, train_annotations):
    entities = [(start, end, label) for start, end, label in annotations]
    train_data.append(Example.from_dict(nlp.make_doc(text), {"entities": entities}))

# Train the model
optimizer = nlp.begin_training()
for i in range(100):  # Number of iterations
    losses = {}
    nlp.update(train_data, sgd=optimizer, losses=losses)
    print(f"Iteration {i}, Losses: {losses}")

# Save the model
nlp.to_disk("spacy_model")


Iteration 0, Losses: {'ner': 9789.819361329079}
Iteration 1, Losses: {'ner': 9644.833057701588}
Iteration 2, Losses: {'ner': 9427.046074688435}
Iteration 3, Losses: {'ner': 9081.104561150074}
Iteration 4, Losses: {'ner': 8486.050849676132}
Iteration 5, Losses: {'ner': 7403.166162908077}
Iteration 6, Losses: {'ner': 5497.407516896725}
Iteration 7, Losses: {'ner': 2856.6247889995575}
Iteration 8, Losses: {'ner': 779.3533536363393}
Iteration 9, Losses: {'ner': 131.75973317900207}
Iteration 10, Losses: {'ner': 64.00592240209517}
Iteration 11, Losses: {'ner': 61.89801471908771}
Iteration 12, Losses: {'ner': 61.96520461478098}
Iteration 13, Losses: {'ner': 61.98256518310707}
Iteration 14, Losses: {'ner': 61.98000112959788}
Iteration 15, Losses: {'ner': 61.952348687003195}
Iteration 16, Losses: {'ner': 61.90336807759007}
Iteration 17, Losses: {'ner': 61.74535450383295}
Iteration 18, Losses: {'ner': 61.33183424724592}
Iteration 19, Losses: {'ner': 59.69262844781042}
Iteration 20, Losses: {'ner

In [10]:
import pycrfsuite

# Prepare training data for CRF
def prepare_crf_data(texts, annotations):
    data = []
    for text, entities in zip(texts, annotations):
        words = text.split()
        labels = ['O'] * len(words)
        for start, end, label in entities:
            start_word_idx = len(text[:start].split())
            end_word_idx = len(text[:end].split())
            for i in range(start_word_idx, end_word_idx):
                if i < len(labels):
                    labels[i] = label
        data.append(list(zip(words, labels)))
    return data

train_data = prepare_crf_data(train_texts, train_annotations)
test_data = prepare_crf_data(test_texts, test_annotations)

# Define feature extraction function
def extract_features(doc):
    return [word for word in doc]

# Train CRF model
trainer = pycrfsuite.Trainer()
for xseq in train_data:
    words, labels = zip(*xseq)
    trainer.append(extract_features(words), labels)
trainer.train('crf_model.crfsuite')


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 377
Seconds required: 0.003

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 11823.057520
Feature norm: 1.000000
Error norm: 3896.914189
Active features: 377
Line search trials: 1
Line search step: 0.000070
Seconds required for this iteration: 0.004

***** Iteration #2 *****
Loss: 10891.013538
Feature norm: 1.354332
Error norm: 2272.314675
Active features: 377
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.002

***** Iteration #3 *****
Loss: 10404.608985
Feature norm: 1.474227
Error norm: 2313.305789
Active features: 377
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [13]:
from spacy.training import Example
from spacy.scorer import Scorer
from sklearn.metrics import classification_report

# Load the trained model
nlp = spacy.load("spacy_model")

# Prepare test data
test_data = []
for text, annotations in zip(test_texts, test_annotations):
    entities = [(start, end, label) for start, end, label in annotations]
    test_data.append(Example.from_dict(nlp.make_doc(text), {"entities": entities}))

# Evaluate the model
scorer = Scorer()
scores = scorer.score(test_data)

# Extract evaluation metrics
precision = scores['ents_p']
recall = scores['ents_r']
f1_score = scores['ents_f']
print(f"spaCy - Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}")


spaCy - Precision: 0.0, Recall: 0.0, F1 Score: 0.0


In [16]:
from sklearn.metrics import classification_report

# Load the trained model
tagger = pycrfsuite.Tagger()
tagger.open('crf_model.crfsuite')

# Prepare test data
X_test = [extract_features([word for word, label in xseq]) for xseq in test_data]
y_test = [[label for word, label in xseq] for xseq in test_data]

# Evaluate the model
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Flatten the lists for classification report
y_test_flat = [label for seq in y_test for label in seq]
y_pred_flat = [label for seq in y_pred for label in seq]

# Print classification report
print(classification_report(y_test_flat, y_pred_flat))


TypeError: 'spacy.training.example.Example' object is not iterable