In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import en_core_web_trf

In [2]:
# Read the cleaned test split: a tab-separated file without a header row,
# so the two columns are named explicitly.
column_names = ["Sentence", "Label"]
df_test_sen_lab = pd.read_csv(
    'datasets_cleaned/test_cleaned.txt',
    sep='\t',
    header=None,
    names=column_names,
)

# spaCy pipeline that parses every sentence for entity recognition
nlp = spacy.load("en_core_web_trf")

In [3]:
# Lower-cased words that find_interacting_entities accepts as entity mentions
# (compared against token.text.lower()) when NER yields fewer than two PERSON entities.
additional_terms = (
    # generic person words and pronouns
    {"person", "man", "woman",
     "i", "me", "you", "he", "she", "him", "her", "they", "them",
     "himself", "herself"}
    # collective actors
    | {"police", "administration", "immigrants"}
    # offices and political roles
    | {"president", "minister", "senator", "representative", "governor",
       "mayor", "council", "secretary", "ambassador", "chancellor",
       "parliamentary"}
)

In [4]:
def _merge_adjacent_spans(text, spans):
    """Merge spans that overlap or are at most one character apart.

    Args:
        text: The full text the span offsets index into.
        spans: Iterable of ``(start_char, end_char, span_text)`` tuples.

    Returns:
        A new list of ``(start_char, end_char, span_text)`` tuples sorted by
        start offset. The text of a merged span is sliced from ``text`` so it
        always matches its offsets.
    """
    merged = []
    for start, end, span_text in sorted(spans, key=lambda s: (s[0], s[1])):
        if merged and start <= merged[-1][1] + 1:
            prev_start, prev_end, _ = merged[-1]
            # A nested span must not shrink the merged span, hence max().
            new_end = max(prev_end, end)
            # Slice from the source text instead of joining the pieces with
            # " ": joining duplicates overlapping text and replaces the real
            # separator character.
            merged[-1] = (prev_start, new_end, text[prev_start:new_end])
        else:
            merged.append((start, end, span_text))
    return merged


def find_interacting_entities(doc):
    """Select up to two entity mentions from a parsed sentence.

    Entity labels are tried in priority order (PERSON, NORP, ORG, GPE); a
    lower-priority label is only consulted while fewer than two mentions have
    been collected. If fewer than two mentions exist after the PERSON entities
    are added, every token whose lower-cased text is in ``additional_terms``
    is added as a mention as well. After each label, mentions that overlap or
    are at most one character apart are merged into one.

    Args:
        doc: Parsed document providing ``text``, ``ents`` (objects with
            ``label_``, ``start_char``, ``end_char`` and ``text``) and
            iteration over tokens that have ``text`` and ``idx``.

    Returns:
        A list of at most two ``(start_char, end_char, text)`` tuples: the
        first two collected mentions in order of position in the text.
    """
    priority_labels = ["PERSON", "NORP", "ORG", "GPE"]
    valid_entities = []

    for label in priority_labels:
        if len(valid_entities) >= 2:
            break

        valid_entities.extend(
            (ent.start_char, ent.end_char, ent.text)
            for ent in doc.ents
            if ent.label_ == label
        )

        # Fall back to pronouns / role words only when NER found too few people.
        if label == "PERSON" and len(valid_entities) < 2:
            valid_entities.extend(
                (token.idx, token.idx + len(token.text), token.text)
                for token in doc
                if token.text.lower() in additional_terms
            )

        valid_entities = _merge_adjacent_spans(doc.text, valid_entities)

    return valid_entities[:2]

def wrap_selected_entities(text, i_sent):
    """Bracket the selected entities in a sentence and append their count.

    The sentence is parsed with the module-level ``nlp`` pipeline and the
    spans chosen by ``find_interacting_entities`` are wrapped in ``[...]``.

    Args:
        text: The sentence to annotate.
        i_sent: Sentence number supplied by the caller; not used by the
            current implementation.

    Returns:
        The annotated sentence, a tab character, and the number of bracketed
        entities, as one string.
    """
    doc = nlp(text)
    entities = find_interacting_entities(doc)

    # Build the result from slices of the original text. Inserting the
    # entity's own text and assuming each insertion adds exactly two
    # characters misplaces later brackets, and rewrites the sentence,
    # whenever that text does not match text[start:end] exactly.
    pieces = []
    cursor = 0
    for start, end, _ in entities:
        pieces.append(text[cursor:start])
        pieces.append('[' + text[start:end] + ']')
        cursor = end
    pieces.append(text[cursor:])

    return ''.join(pieces) + "\t" + str(len(entities))

In [5]:
# Annotate every sentence; the second argument is the row's index label plus one.
df_test_sen_lab['Formatted_Sentence'] = [
    wrap_selected_entities(sentence, row_label + 1)
    for row_label, sentence in df_test_sen_lab['Sentence'].items()
]

In [6]:
# Every formatted sentence ends with "\t<entity count>" (appended by
# wrap_selected_entities). Split once, from the right, so a tab inside the
# sentence itself stays in the sentence instead of shifting the columns.
_sentence_and_count = df_test_sen_lab['Formatted_Sentence'].str.rsplit("\t", n=1, expand=True)

# Build a fresh frame instead of assigning new columns into a column slice of
# df_test_sen_lab, which raises SettingWithCopyWarning.
df_test_Form_sen_lab_sen_lab = pd.DataFrame({
    "Formatted_Sentence": _sentence_and_count[0],
    "Num_of_ent": _sentence_and_count[1].astype(int),
    "Label": df_test_sen_lab['Label'],
})

# Save the new dataframe (tab-separated, no index column, no header row)
file_path = 'datasets_new_nered/test_new_ner.txt'
df_test_Form_sen_lab_sen_lab.to_csv(file_path, sep='\t', index=False, header=False)

In [7]:
# Preview the annotated test set: formatted sentence, entity count, label
df_test_Form_sen_lab_sen_lab

Unnamed: 0,Formatted_Sentence,Num_of_ent,Label
0,” Breitbart News has now reached out to the [T...,2,0
1,"[Phoebe Nora Mary Prince], 15, committed suici...",2,0
2,"Notably, the department says about 30 emails t...",2,0
3,[Meacham] was recently featured by the San Die...,2,0
4,But Portland's city code prohibits persons fro...,2,0
...,...,...,...
1618,The Trump [administration] is likely to start ...,2,3
1619,"Well, we do have challenges, but were not stup...",2,3
1620,"He was also fired from a season of ""The Celebr...",2,1
1621,[The Hong Kong Monetary Authority] started sel...,2,1


In [8]:
# Keep only the rows whose entity count is 0 or 1
ent = [0, 1]
mask = df_test_Form_sen_lab_sen_lab['Num_of_ent'].isin(ent)
df_test_Form_sen_lab_sen_lab_0_1 = df_test_Form_sen_lab_sen_lab.loc[mask]

In [9]:
# Inspect the rows whose Num_of_ent is 0 or 1
df_test_Form_sen_lab_sen_lab_0_1

Unnamed: 0,Formatted_Sentence,Num_of_ent,Label
14,Mars reaches its closest approach to Earth for...,0,0
32,"On Wednesday, [Transparency International] (T...",1,0
45,A recent story by [the Washington Post] covere...,1,0
80,[The Iraq Museum of Baghdad] is to display 40 ...,1,0
98,"As a result, [Russia] will accept having milit...",1,0
...,...,...,...
1295,After major airlines jumped to suspend flights...,1,4
1326,Twickenham and the crowd is a massive factor f...,1,2
1406,[Reserve Bank of Australia Governor Glenn Stev...,1,1
1454,Cap and trade is an environmental system that ...,1,2
