# Running the Code

`pip install spacy
python -m spacy download en_core_web_sm
pip install spacy-lookups-data
pip install label-studio`

# Generating

Add to the corpus from a news article.

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# URL of the article to scrape
url = 'https://www.hrw.org/world-report/2020/country-chapters/israel-and-palestine'

# Send a GET request to the URL and get the response
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract the text content from each <p> element and store in a JSON object
data = {"paragraphs": []}
for p in soup.find_all('p'):
    text = p.get_text().strip()
    if len(text) > 0:
        data["paragraphs"].append({"text": text})
df = pd.DataFrame(data["paragraphs"])

# Save the DataFrame as a CSV file
df.to_csv('paragraphs.csv', index=False)
# Save the DataFrame as a JSONL file
with open('paragraphs.jsonl', 'w') as f:
    for _, row in df.iterrows():
        json.dump(row.to_dict(), f)
        f.write('\n')

Add to the corpus a set of manual labels.

In [12]:
import jsonlines

# List of phrases for each label
israel_phrases = [
    'Israeli settlements in the West Bank',
    'Jerusalem as the capital of Israel',
    'Iron Dome and Israel’s missile defense',
    'Likud and the Israeli right-wing',
    'Arab-Israeli conflict and peace talks',
    'Mossad and Israel’s intelligence agency',
    'Jewish National Fund and land ownership',
    'Knesset and Israeli politics',
    'Yitzhak Rabin and the assassination',
    'Israeli-Palestinian coexistence initiatives',
    "Jerusalem",
    "West Bank",
    "Gaza",
    "Netanyahu",
    "Israeli-Palestinian conflict",
    "Jewish settlements",
    "Iron Dome",
    "Masada",
    "Yom Kippur War",
    "Six-Day War"
]

palestine_phrases = [
    'Palestinian refugees and global politics',
    'Occupation of the West Bank and East Jerusalem',
    'Gaza Strip and the Israeli blockade',
    'Palestinian Authority and its governance',
    'Hamas and its role in Palestinian politics',
    'Al-Aqsa Mosque and Temple Mount',
    'Palestinian Nationalism and the PLO',
    'Oslo Accords and the peace process',
    'BDS Movement and its impact',
    'Intifada and resistance movements',
    "West Bank",
    "Gaza",
    "Hamas",
    "Palestinian Authority",
    "Israeli-Palestinian conflict",
    "Al-Aqsa Mosque",
    "Intifada",
    "Right of Return",
    "Two-State Solution",
    "Nakba"
]

judaism_phrases = [
    'Jewish diaspora and global communities',
    'Torah and Jewish law',
    'Anti-Semitism and Jewish discrimination',
    'Jewish settlements in the West Bank',
    'Hasidic Judaism and its practices',
    'Zionism and Jewish nationalism',
    'Kabbalah and Jewish mysticism',
    'Jewish festivals and holidays',
    'Talmud and Jewish scholarship',
    'Holocaust and Jewish history',
    "Torah",
    "Talmud",
    "Rabbi",
    "Synagogue",
    "Kabbalah",
    "Hasidism",
    "Passover",
    "Yom Kippur",
    "Hanukkah",
    "Bar Mitzvah"
]

# Generate list of labeled data
labeled_data = []
for israel_text in israel_phrases:
    israel_label = [israel_text, [[0, len(israel_text), "ISRAEL"]]]
    labeled_data.append(israel_label)

for palestine_text in palestine_phrases:
    palestine_label = [palestine_text, [[0, len(palestine_text), "PALESTINE"]]]
    labeled_data.append(palestine_label)

for judaism_text in judaism_phrases:
    judaism_label = [judaism_text, [[0, len(judaism_text), "JUDAISM"]]]
    labeled_data.append(judaism_label)

# Write data to JSONL file
with jsonlines.open('phrases.jsonl', mode='w') as writer:
    for item in labeled_data:
        data = {
            'text': item[0],
            'label': item[1]
        }
        writer.write(data)


# Annotating

Open an instance of doccano to upload `paragraphs.jsonl` and `phrases.jsonl` and manually check or add labels.

Save these labels then to a file called `doccano_labeled.jsonl`.

# Splitting

Split into test and train data.

In [13]:
import random

# Set a seed for reproducability
random.seed(42)

# Read the annotated data from the Prodigy output
with open("doccano_labeled.jsonl", "r", encoding="utf-8") as f:
    annotated_data = f.readlines()

# Split the data into training and validation sets
random.shuffle(annotated_data)
split_idx = int(len(annotated_data) * 0.8)

train_data = annotated_data[:split_idx]
valid_data = annotated_data[split_idx:]

# Write the training and validation data to separate files
with open("train.jsonl", "w", encoding="utf-8") as f:
    f.writelines(train_data)

with open("valid.jsonl", "w", encoding="utf-8") as f:
    f.writelines(valid_data)


In [31]:
import json
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a function to convert the Prodigy output to spaCy's training data format
def convert_to_spacy_format(annotated_data):
    spacy_data = []

    for line in annotated_data:
        data = json.loads(line)
        text = data["text"]
        spans = {}

        for entity in data["entities"]:
            start = entity["start_offset"]
            end = entity["end_offset"]
            label = entity["label"]
            spans[label] = []
            spans[label].append((start, end))

        spacy_data.append((text, {"spans": spans}))

    return spacy_data

# Convert the training and validation data to spaCy format
with open("train.jsonl", "r", encoding="utf-8") as f:
    train_data = f.readlines()

with open("valid.jsonl", "r", encoding="utf-8") as f:
    valid_data = f.readlines()

train_spacy_data = convert_to_spacy_format(train_data)
valid_spacy_data = convert_to_spacy_format(valid_data)

# Save the spaCy data to separate files
with open("train_spacy.json", "w", encoding="utf-8") as f:
    json.dump(train_spacy_data, f)

with open("valid_spacy.json", "w", encoding="utf-8") as f:
    json.dump(valid_spacy_data, f)


# spaCy Training

In [42]:
import random
from spacy.util import minibatch, compounding
from spacy.training import Example
import spacy
from pathlib import Path

# read the annotated data
with open("train_spacy.json", "r") as f:
    TRAIN_DATA = json.load(f)

# Load the model and set up the pipeline
nlp = spacy.load('en_core_web_sm')
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

# Add the labels
ner.add_label("ISRAEL")
ner.add_label("PALESTINE")
ner.add_label("JUDAISM")

# define the output directory for the trained model
output_dir = Path("entity_model")

# Disable other pipelines in spaCy to only train NER
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*unaffected_pipes):

    # Set up the optimizer
    optimizer = nlp.begin_training()

    # Iterate over the training data
    for i in range(20):
        # Shuffle the training data
        random.shuffle(TRAIN_DATA)

        # Create batches of training data
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))

        # Initialize the losses
        losses = {}

        # Iterate over the batches
        for batch in batches:
            # Convert the batch to Examples and update the model
            examples = []
            for text, annots in batch:
                examples.append(Example.from_dict(nlp.make_doc(text), annots))
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        # Print the losses
        print(f"Losses at iteration {i}: {losses}")

        # save the trained model to the output directory
        nlp.to_disk(output_dir)

# test the trained model on some sample text
test_text = "Israeli Prime Minister Benjamin Netanyahu visited the White House today. Israel, Palestine, Judaism."
doc = nlp(test_text)
print("Spans in '%s'" % test_text)
for span in doc.spans:
    print(span.label_, span.text)

Losses at iteration 0: {'ner': 0.0}
Losses at iteration 1: {'ner': 0.0}
Losses at iteration 2: {'ner': 0.0}
Losses at iteration 3: {'ner': 0.0}
Losses at iteration 4: {'ner': 0.0}
Losses at iteration 5: {'ner': 0.0}
Losses at iteration 6: {'ner': 0.0}
Losses at iteration 7: {'ner': 0.0}
Losses at iteration 8: {'ner': 0.0}
Losses at iteration 9: {'ner': 0.0}
Losses at iteration 10: {'ner': 0.0}
Losses at iteration 11: {'ner': 0.0}
Losses at iteration 12: {'ner': 0.0}
Losses at iteration 13: {'ner': 0.0}
Losses at iteration 14: {'ner': 0.0}
Losses at iteration 15: {'ner': 0.0}
Losses at iteration 16: {'ner': 0.0}
Losses at iteration 17: {'ner': 0.0}
Losses at iteration 18: {'ner': 0.0}
Losses at iteration 19: {'ner': 0.0}
Spans in 'Israeli Prime Minister Benjamin Netanyahu visited the White House today. Israel, Palestine, Judaism.'


In [53]:
import spacy
import json

# Load the saved NER model
nlp = spacy.load('entity_model')

# Load the validation data
with open('valid_spacy.json', 'r') as f:
    validation_data = json.load(f)

true_positive_sum = 0

# Iterate over the validation data and test the model
for data in validation_data:
    text = data[0]
    gold_spans = data[1]['spans']
    doc = nlp(text)
    predicted_spans = [(span.start_char, span.end_char, span.label_) for span in doc.spans]
    
    # Compare the predicted entities to the gold-standard entities
    # and print the evaluation metrics
    true_positives = set(predicted_spans) & set(gold_spans)
    true_positive_sum += len(true_positives)
    false_positives = set(predicted_spans) - set(gold_spans)
    false_negatives = set(gold_spans) - set(predicted_spans)
    
    if(len(true_positives) == 0):
        precision = 0
        recall = 0
        f1_score = 0
    else:
        precision = len(true_positives) / (len(true_positives) + len(false_positives))
        recall = len(true_positives) / (len(true_positives) + len(false_negatives))
        f1_score = 2 * (precision * recall) / (precision + recall)
    
    print('Text: ', text)
    print('Gold entities: ', gold_spans)
    print('Predicted entities: ', predicted_spans)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F1-score: ', f1_score)
    print('---------------------------------------')

print("Total number of true positives: ",true_positive_sum)



Text:  Talmud and Jewish scholarship
Gold entities:  {'JUDAISM': [[0, 29]]}
Predicted entities:  []
Precision:  0
Recall:  0
F1-score:  0
---------------------------------------
Text:  Jerusalem
Gold entities:  {'ISRAEL': [[0, 9]]}
Predicted entities:  []
Precision:  0
Recall:  0
F1-score:  0
---------------------------------------
Text:  Yitzhak Rabin and the assassination
Gold entities:  {'ISRAEL': [[0, 35]]}
Predicted entities:  []
Precision:  0
Recall:  0
F1-score:  0
---------------------------------------
Text:  Talmud
Gold entities:  {'JUDAISM': [[0, 6]]}
Predicted entities:  []
Precision:  0
Recall:  0
F1-score:  0
---------------------------------------
Text:  Hanukkah
Gold entities:  {'JUDAISM': [[0, 8]]}
Predicted entities:  []
Precision:  0
Recall:  0
F1-score:  0
---------------------------------------
Text:  An updated version of this World Report chapter is available here >>
Gold entities:  {}
Predicted entities:  []
Precision:  0
Recall:  0
F1-score:  0
----------------