In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import en_core_web_trf

In [2]:
# Load data
column_names = ["Sentence", "Label"]
df_test_sen_lab = pd.read_csv('datasets_cleaned/valid_cleaned.txt', sep='\t', header=None, names=column_names)

# Load Spacy model
nlp = spacy.load("en_core_web_trf")

In [3]:
# Additional terms for entity extraction
additional_terms = {"person", "man", "woman", "police", "administration", "immigrants",
                   "president", "minister", "senator", "representative", "governor", "mayor", "council", "secretary", "ambassador",
                   "chancellor", "parliamentary"}

In [4]:
pronouns = {"i", "me", "myself", "you", "yourself", "he", "she", "him", "her", "they", "them", "himself", "herself", "themself"}

In [5]:
def find_interacting_entities(doc):
    # Define the priority labels for entities to be extracted
    priority_labels = ["PERSON", "NORP", "ORG", "GPE"]
    valid_entities = []

    for label in priority_labels:
        # If we already have two different entities, stop searching
        if len(valid_entities) >= 2:
            if valid_entities[0][2].lower() != valid_entities[1][2].lower(): break # 2 entities cant be the same entities 

        # Extract entities matching the current priority label
        current_entities = [(ent.start_char, ent.end_char, ent.text) for ent in doc.ents if ent.label_ == label]
        valid_entities.extend(current_entities)

        # Additional check for PERSON label: include tokens from additional terms if fewer than 2 entities found
        if label == "PERSON" and len(valid_entities) < 2:
            for token in doc:
                if token.text.lower() in additional_terms:
                    valid_entities.append((token.idx, token.idx + len(token.text), token.text))

        # Additional check after the last label: include tokens from pronouns if fewer than 2 entities found
        if label == "GPE" and len(valid_entities) < 2:
            for token in doc:
                if token.text.lower() in pronouns:
                    valid_entities.append((token.idx, token.idx + len(token.text), token.text))

        # Sort entities by their start position
        valid_entities = sorted(valid_entities, key=lambda x: x[0])

        # Merge adjacent entities
        merged_entities = []
        i = 0
        while i < len(valid_entities):
            current_ent = valid_entities[i]
            j = i + 1
            while j < len(valid_entities) and valid_entities[j][0] <= current_ent[1] + 1:
                current_ent = (current_ent[0], valid_entities[j][1], current_ent[2] + " " + valid_entities[j][2])
                j += 1
            merged_entities.append(current_ent)
            i = j
        valid_entities = merged_entities

    # Return the first two entities if available
    if len(valid_entities) >= 2:
        return valid_entities[:2]
    else:
        return valid_entities



def wrap_selected_entities(text):
    doc = nlp(text)
    entities = find_interacting_entities(doc)
    
    formatted_text = text
    offset = 0
    for ent in entities:
        start, end, ent_text = ent
        start += offset
        end += offset
        formatted_text = formatted_text[:start] + '[' + ent_text + ']' + formatted_text[end:]
        offset += 2  # 2 characters for the added brackets

    # Construct the final formatted text with entity details
    entity_count = len(entities)
    if entity_count == 0:
        formatted_text = formatted_text + "\t" + "0" + "\t" + "None" + "\t" + "None"
    elif entity_count == 1:
        formatted_text = formatted_text + "\t" + "1" + "\t" + entities[0][2] + "\t" + "None"
    else:
        formatted_text = formatted_text + "\t" + "2" + "\t" + entities[0][2] + "\t" + entities[1][2]
    
    return formatted_text

In [6]:
# Apply the function to the 'Sentence' column with sentence numbering
df_test_sen_lab['Formatted_Sentence'] = df_test_sen_lab.apply(lambda row: wrap_selected_entities(row['Sentence']), axis=1)

In [7]:
# Select the relevant columns
df_test_Form_sen_lab_sen_lab = df_test_sen_lab[['Formatted_Sentence', 'Label']]

# Split the 'Formatted_Sentence' column into multiple columns
split_columns = df_test_Form_sen_lab_sen_lab['Formatted_Sentence'].str.split("\t", expand=True)

# Assign the new columns to the DataFrame and rename them
df_test_Form_sen_lab_sen_lab["Formatted_Sentence"] = split_columns[0]
df_test_Form_sen_lab_sen_lab["Num_of_ent"] = split_columns[1].astype(int)
df_test_Form_sen_lab_sen_lab["Ent_1"] = split_columns[2]
df_test_Form_sen_lab_sen_lab["Ent_2"] = split_columns[3]

# Select and reorder the final columns
df_test_Form_sen_lab_sen_lab = df_test_Form_sen_lab_sen_lab[["Formatted_Sentence", "Ent_1", "Ent_2", "Num_of_ent", "Label"]]

# Save the new dataframe
file_path = 'datasets_new_nered/valid_new_ner.txt'
df_test_Form_sen_lab_sen_lab.to_csv(file_path, sep='\t', index=False, header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_Form_sen_lab_sen_lab["Formatted_Sentence"] = split_columns[0]


In [8]:
df_test_Form_sen_lab_sen_lab

Unnamed: 0,Formatted_Sentence,Ent_1,Ent_2,Num_of_ent,Label
0,[Kratz]’s claims would be a lot easier to beli...,Kratz,Avery,2,0
1,[Brussels] attacker [Najim Laachraoui].,Brussels,Najim Laachraoui,2,0
2,The [Conservatives] support is up from 32.4 pe...,Conservatives,Liberals,2,0
3,"The meeting, put together by Mr. [Trump]’s con...",Trump,Paul Teller,2,0
4,The president's defeat highlighted the difficu...,Mubarak,Morsi,2,0
...,...,...,...,...,...
1456,Mr [Nicholls] said [Gales] had been responsibl...,Nicholls,Gales,2,1
1457,Former [Olympic] cyclist [Chris Boardman] has ...,Olympic,Chris Boardman,2,3
1458,"NewsCNN invited architect [James Furzer], whos...",James Furzer,Dean Harvey,2,3
1459,"Lastly, there are major questions as to the su...",Chinas,Farr,2,4


In [9]:
# take only those rows, where Num_of_ent = 2
ent = [2]
mask = df_test_Form_sen_lab_sen_lab['Num_of_ent'].isin(ent)
df_test_Form_sen_lab_sen_lab_2 = df_test_Form_sen_lab_sen_lab[mask]
df_test_Form_sen_lab_sen_lab_2 = df_test_Form_sen_lab_sen_lab_2.drop(columns=['Num_of_ent'])

In [10]:
file_path = 'datasets_new_nered_2_ents/valid.txt'
df_test_Form_sen_lab_sen_lab_2.to_csv(file_path, sep='\t', index=False, header=None)