In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import en_core_web_trf

In [2]:
# Read the cleaned validation file as a headerless, tab-separated table with
# two named columns. NOTE(review): the frame is called "test" although the
# file read here is valid_cleaned.txt; the name is kept because later cells
# refer to it.
column_names = ["Sentence", "Label"]
df_test_sen_lab = pd.read_csv(
    'datasets_cleaned/valid_cleaned.txt',
    sep='\t',
    header=None,
    names=column_names,
)

# spaCy pipeline used for named-entity recognition in the cells below
nlp = spacy.load("en_core_web_trf")

In [3]:
# Lower-cased tokens that count as entity mentions when the NER pass yields
# fewer than two candidates at the PERSON stage (see find_interacting_entities).
additional_terms = {
    # generic person words and pronouns
    "person", "man", "woman",
    "i", "me", "you", "he", "she", "him", "her", "they", "them",
    "himself", "herself",
    # groups
    "police", "administration", "immigrants",
    # offices and titles
    "president", "minister", "senator", "representative", "governor",
    "mayor", "council", "secretary", "ambassador",
    "chancellor", "parliamentary",
}

In [4]:
def _merge_touching_spans(spans, source_text):
    """Merge character spans that overlap or are at most one character apart.

    Args:
        spans: iterable of ``(start, end, text)`` tuples with character offsets
            into ``source_text``.
        source_text: the string the offsets refer to.

    Returns:
        A list of ``(start, end, text)`` tuples sorted by ``start``. A span that
        was not merged keeps its original text; a merged span carries
        ``source_text[start:end]`` so its text always matches its offsets.
    """
    merged = []
    for start, end, span_text in sorted(spans, key=lambda span: span[0]):
        if merged and start <= merged[-1][1] + 1:
            prev_start, prev_end, _ = merged[-1]
            # max() keeps the end from shrinking when the new span lies inside
            # the previous one (e.g. a title token inside a longer entity).
            new_end = max(prev_end, end)
            # Slice the source instead of joining the two texts with a space:
            # joining duplicates overlapping text and replaces whatever
            # character actually separates the spans (hyphen, slash, nothing).
            merged[-1] = (prev_start, new_end, source_text[prev_start:new_end])
        else:
            merged.append((start, end, span_text))
    return merged


def find_interacting_entities(doc):
    """Pick up to two entity mentions from a parsed document.

    Entity labels are visited in priority order (PERSON, NORP, ORG, GPE) and
    the search stops as soon as two candidates are available. If fewer than two
    candidates exist right after collecting PERSON entities, every token whose
    lower-cased text is in the module-level ``additional_terms`` set is added
    as a candidate too. After each label, candidates that overlap or are at
    most one character apart are merged into a single span.

    Args:
        doc: a parsed document exposing ``ents`` (items with ``start_char``,
            ``end_char``, ``text`` and ``label_``), ``text``, and iteration
            over tokens with ``text`` and ``idx``.

    Returns:
        A list of at most two ``(start_char, end_char, text)`` tuples, sorted
        by ``start_char``.
    """
    priority_labels = ["PERSON", "NORP", "ORG", "GPE"]
    valid_entities = []

    for label in priority_labels:
        if len(valid_entities) >= 2:
            break

        valid_entities.extend(
            (ent.start_char, ent.end_char, ent.text)
            for ent in doc.ents
            if ent.label_ == label
        )

        # Fall back to the hand-picked vocabulary only at the PERSON stage.
        if label == "PERSON" and len(valid_entities) < 2:
            for token in doc:
                if token.text.lower() in additional_terms:
                    valid_entities.append(
                        (token.idx, token.idx + len(token.text), token.text)
                    )

        # Merge inside the loop: the merged count decides whether the next,
        # lower-priority label is consulted at all.
        valid_entities = _merge_touching_spans(valid_entities, doc.text)

    return valid_entities[:2]

def wrap_selected_entities(text, i_sent):
    """Bracket the selected entity mentions in ``text`` and append their count.

    The sentence is parsed with the module-level ``nlp`` pipeline, and the
    spans returned by ``find_interacting_entities`` are wrapped in ``[...]``.

    Args:
        text: the sentence to annotate.
        i_sent: sentence number; only referenced by the disabled diagnostic
            below, kept so existing callers continue to work.

    Returns:
        The annotated sentence, a tab character, and the number of wrapped
        entities, as one string.
    """
    doc = nlp(text)
    entities = find_interacting_entities(doc)

    # Rebuild the sentence from slices of the original text. A running
    # "+2 per entity" offset is only right when each entity's text is exactly
    # as long as its span; for merged entities that does not hold, so later
    # brackets would drift and the sentence content would be altered.
    pieces = []
    cursor = 0
    for start, end, _ in entities:
        pieces.append(text[cursor:start])
        pieces.append('[' + text[start:end] + ']')
        cursor = end
    pieces.append(text[cursor:])

    formatted_text = "".join(pieces) + "\t" + str(len(entities))

    #if len(entities) != 2:
        #print(f"Document {i_sent} has unexpected number of entities:\n{formatted_text}\n")

    return formatted_text

In [5]:
# Annotate every sentence; the second argument is the row's index label plus one,
# passed through as the sentence number.
df_test_sen_lab['Formatted_Sentence'] = [
    wrap_selected_entities(sentence, row_label + 1)
    for row_label, sentence in df_test_sen_lab['Sentence'].items()
]

# The annotated frame is reshaped and written to disk in the next cell.

In [6]:
# Each value in 'Formatted_Sentence' is "<annotated sentence>\t<entity count>".
# Split once, from the right, so that a tab inside the sentence itself cannot
# shift the count into the wrong column.
formatted_parts = df_test_sen_lab['Formatted_Sentence'].str.rsplit("\t", n=1, expand=True)

# Build a new frame rather than assigning columns onto a column-subset of
# df_test_sen_lab, which triggers pandas' SettingWithCopyWarning.
df_test_Form_sen_lab_sen_lab = pd.DataFrame({
    "Formatted_Sentence": formatted_parts[0],
    "Num_of_ent": formatted_parts[1].astype(int),
    "Label": df_test_sen_lab['Label'],
})

# Save the new dataframe (tab-separated, no index column, no header row)
file_path = 'datasets_new_nered/valid_new_ner.txt'
df_test_Form_sen_lab_sen_lab.to_csv(file_path, sep='\t', index=False, header=False)

In [7]:
# Show the annotated frame (Formatted_Sentence, Num_of_ent, Label) as this cell's output.
df_test_Form_sen_lab_sen_lab

Unnamed: 0,Formatted_Sentence,Num_of_ent,Label
0,[Kratz]’s claims would be a lot easier to beli...,2,0
1,[Brussels] attacker [Najim Laachraoui].,2,0
2,The [Conservatives] support is up from 32.4 pe...,2,0
3,"The meeting, put together by Mr. [Trump]’s con...",2,0
4,The president's defeat highlighted the difficu...,2,0
...,...,...,...
1456,Mr [Nicholls] said [Gales] had been responsibl...,2,1
1457,Former [Olympic] cyclist [Chris Boardman] has ...,2,3
1458,"NewsCNN invited architect [James Furzer], whos...",2,3
1459,"Lastly, there are major questions as to the su...",2,4


In [8]:
# Select the rows whose entity count is 0 or 1, i.e. sentences where fewer than
# two mentions were bracketed.
ent = [0, 1]
mask = df_test_Form_sen_lab_sen_lab['Num_of_ent'].isin(ent)
df_test_Form_sen_lab_sen_lab_0_1 = df_test_Form_sen_lab_sen_lab.loc[mask]

In [9]:
# Show the rows that ended up with zero or one bracketed entity.
df_test_Form_sen_lab_sen_lab_0_1

Unnamed: 0,Formatted_Sentence,Num_of_ent,Label
15,[Coons] has been one of the staunchest critics...,1,0
21,Ads for [Secret] deodorant nudge us to ask for...,1,0
28,"Since its acclaimed launch in February, Full F...",1,0
32,1980s icon [Debbie Gibson] took the Broadway R...,1,0
45,"At Sunday’s Golden Globes, venerable actress ...",1,0
62,"After finishing, having roared through the las...",1,0
105,Let’s start this discussion by not being silly...,1,0
153,[Marleys] used to overcoming criticism based o...,1,0
163,"The Bachelor,” appeared on the [Kimmel] show t...",1,0
202,"In 1989, [Wen] imagined phases like the fracti...",1,0


In [11]:
# Summary statistics of the numeric columns (Num_of_ent, Label) for the 0/1-entity subset.
df_test_Form_sen_lab_sen_lab_0_1.describe()

Unnamed: 0,Num_of_ent,Label
count,49.0,49.0
mean,0.959184,0.346939
std,0.199915,0.804959
min,0.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,1.0,3.0
