In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import en_core_web_trf

In [2]:
# Read the cleaned test split: tab-separated, no header row,
# parsed into the columns listed in `column_names`.
column_names = ["Sentence", "Label"]
df_test_sen_lab = pd.read_csv(
    'datasets_cleaned/test_cleaned.txt',
    sep='\t',
    header=None,
    names=column_names,
)

# Instantiate the spaCy pipeline that supplies the named entities used below.
nlp = spacy.load("en_core_web_trf")

In [3]:
# Lower-cased tokens that count as entity mentions even when the NER model
# does not tag them (matched against `token.text.lower()`).
additional_terms = {
    # generic person words and pronouns
    "person", "man", "woman",
    "i", "me", "you", "he", "she", "him", "her", "they", "them",
    "himself", "herself",
    # collective actors
    "police", "administration", "immigrants",
    # political titles and bodies
    "president", "minister", "senator", "representative", "governor",
    "mayor", "council", "secretary", "ambassador", "chancellor",
    "parliamentary",
}

In [4]:
def _merge_spans(spans):
    """Sort (start, end) character spans and fuse those that overlap or lie at most one character apart."""
    merged = []
    for start, end in sorted(spans):
        if merged and start <= merged[-1][1] + 1:
            # max() keeps the outer boundary when a span is nested inside the
            # previous one (e.g. a title token inside a longer entity).
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def find_interacting_entities(doc, extra_terms=None):
    """Pick up to two entity mentions from a parsed document.

    Entity labels are consulted in priority order (PERSON, NORP, ORG, GPE);
    the search stops as soon as at least two mentions have been collected.
    If the PERSON pass yields fewer than two mentions, every token whose
    lower-cased text is in ``extra_terms`` is added as a mention as well.
    After each pass, mentions that overlap or are separated by at most one
    character are merged into a single span.

    Args:
        doc: Parsed document. Must expose ``text``, ``ents`` (items with
            ``start_char``, ``end_char`` and ``label_``) and be iterable over
            tokens with ``text`` and ``idx``.
        extra_terms: Optional collection of lower-cased fallback terms.
            Defaults to the module-level ``additional_terms``.

    Returns:
        A list of at most two ``(start_char, end_char, text)`` tuples ordered
        by position, where ``text`` is exactly ``doc.text[start_char:end_char]``.
    """
    if extra_terms is None:
        extra_terms = additional_terms

    priority_labels = ("PERSON", "NORP", "ORG", "GPE")
    spans = []

    for label in priority_labels:
        if len(spans) >= 2:
            break
        spans.extend(
            (ent.start_char, ent.end_char)
            for ent in doc.ents
            if ent.label_ == label
        )
        if label == "PERSON" and len(spans) < 2:
            spans.extend(
                (token.idx, token.idx + len(token.text))
                for token in doc
                if token.text.lower() in extra_terms
            )
        spans = _merge_spans(spans)

    # Slice the text from the document so a merged mention always matches the
    # characters it covers (no duplicated words, original separators kept).
    source = doc.text
    return [(start, end, source[start:end]) for start, end in spans[:2]]

def wrap_selected_entities(text, i_sent):
    """Bracket the selected entity mentions in a sentence and append metadata.

    The sentence is parsed with the module-level ``nlp`` pipeline and the
    mentions come from ``find_interacting_entities``. Each selected span is
    wrapped in square brackets, then three tab-separated fields are appended:
    the number of mentions found and the two mention texts, with ``"None"``
    standing in for a missing mention.

    Args:
        text: The raw sentence.
        i_sent: Sentence number. Not used by the function body; kept so
            existing callers keep working.

    Returns:
        ``"<bracketed sentence>\\t<count>\\t<entity 1>\\t<entity 2>"``.
    """
    doc = nlp(text)
    entities = find_interacting_entities(doc)

    # Rebuild the sentence from slices of the original text. Bracketing the
    # slice itself (rather than the reported entity text) leaves the sentence
    # intact even if the reported text differs in length from the span.
    pieces = []
    cursor = 0
    for start, end, _ in entities:
        pieces.append(text[cursor:start])
        pieces.append('[' + text[start:end] + ']')
        cursor = end
    pieces.append(text[cursor:])
    formatted_text = "".join(pieces)

    # Pad the entity list to exactly two fields with the "None" placeholder.
    entity_texts = [ent_text for _, _, ent_text in entities]
    entity_texts += ["None"] * (2 - len(entity_texts))

    return "\t".join([formatted_text, str(len(entities)), entity_texts[0], entity_texts[1]])

In [5]:
# Annotate every sentence; the second argument is the 1-based sentence number
# derived from the row's index label.
df_test_sen_lab['Formatted_Sentence'] = [
    wrap_selected_entities(sentence, row_label + 1)
    for row_label, sentence in df_test_sen_lab['Sentence'].items()
]

# The annotated frame is split into columns and written to disk in a later cell.

In [6]:
# Inspect the raw tab-separated annotation string of the first sentence.
print(df_test_sen_lab.loc[0, 'Formatted_Sentence'])

” Breitbart News has now reached out to the [Trump] campaign twice to ask if he would be comfortable with a [Ivanka] going into a bathroom at Target in which there was a man wearing a dress, but has yet to receive a response.	2	Trump	Ivanka


In [7]:
# Split "<sentence>\t<count>\t<entity 1>\t<entity 2>" once. Splitting from the
# right with n=3 keeps the three trailing metadata fields correct even if the
# sentence itself contains a tab character.
annotation_parts = df_test_sen_lab['Formatted_Sentence'].str.rsplit("\t", n=3, expand=True)

# Build a fresh frame instead of assigning into a slice of df_test_sen_lab,
# which would raise SettingWithCopyWarning.
df_test_Form_sen_lab_sen_lab = pd.DataFrame({
    "Formatted_Sentence": annotation_parts[0],
    "Ent_1": annotation_parts[2],
    "Ent_2": annotation_parts[3],
    "Num_of_ent": annotation_parts[1].astype(int),
    "Label": df_test_sen_lab["Label"],
})

# Save the new dataframe
file_path = 'datasets_new_nered/test_new_ner.txt'
df_test_Form_sen_lab_sen_lab.to_csv(file_path, sep='\t', index=False, header=None)

In [8]:
# Display the annotated frame (bracketed sentence, both entities, entity count, label).
df_test_Form_sen_lab_sen_lab

Unnamed: 0,Formatted_Sentence,Ent_1,Ent_2,Num_of_ent,Label
0,” Breitbart News has now reached out to the [T...,Trump,Ivanka,2,0
1,"[Phoebe Nora Mary Prince], 15, committed suici...",Phoebe Nora Mary Prince,Phoebe Prince,2,0
2,"Notably, the department says about 30 emails t...",U. S.,Benghazi,2,0
3,[Meacham] was recently featured by the San Die...,Meacham,Trump,2,0
4,But Portland's city code prohibits persons fro...,Root,Ashton,2,0
...,...,...,...,...,...
1618,The Trump [administration] is likely to start ...,administration,Obama,2,3
1619,"Well, we do have challenges, but were not stup...",Trump,president,2,3
1620,"He was also fired from a season of ""The Celebr...","""Hulk Hogan",Hogan,2,1
1621,[The Hong Kong Monetary Authority] started sel...,The Hong Kong Monetary Authority,U.S.,2,1


In [9]:
# Keep only the sentences in which fewer than two entities were found.
ent = [0, 1]
mask = df_test_Form_sen_lab_sen_lab['Num_of_ent'].isin(ent)
df_test_Form_sen_lab_sen_lab_0_1 = df_test_Form_sen_lab_sen_lab.loc[mask]

In [10]:
# Display the rows whose entity count is 0 or 1.
df_test_Form_sen_lab_sen_lab_0_1

Unnamed: 0,Formatted_Sentence,Ent_1,Ent_2,Num_of_ent,Label
14,Mars reaches its closest approach to Earth for...,,,0,0
32,"On Wednesday, [Transparency International] (T...",Transparency International,,1,0
45,A recent story by [the Washington Post] covere...,the Washington Post,,1,0
80,[The Iraq Museum of Baghdad] is to display 40 ...,The Iraq Museum of Baghdad,,1,0
98,"As a result, [Russia] will accept having milit...",Russia,,1,0
...,...,...,...,...,...
1295,After major airlines jumped to suspend flights...,U.S.,,1,4
1326,Twickenham and the crowd is a massive factor f...,Robshaw,,1,2
1406,[Reserve Bank of Australia Governor Glenn Stev...,Reserve Bank of Australia Governor Glenn Stevens,,1,1
1454,Cap and trade is an environmental system that ...,President Al Gore,,1,2


**В скольких случаях моя разметка отличается от их разметки?**

In [11]:
# Show this notebook's annotations again as the reference point for the comparison below.
df_test_Form_sen_lab_sen_lab

Unnamed: 0,Formatted_Sentence,Ent_1,Ent_2,Num_of_ent,Label
0,” Breitbart News has now reached out to the [T...,Trump,Ivanka,2,0
1,"[Phoebe Nora Mary Prince], 15, committed suici...",Phoebe Nora Mary Prince,Phoebe Prince,2,0
2,"Notably, the department says about 30 emails t...",U. S.,Benghazi,2,0
3,[Meacham] was recently featured by the San Die...,Meacham,Trump,2,0
4,But Portland's city code prohibits persons fro...,Root,Ashton,2,0
...,...,...,...,...,...
1618,The Trump [administration] is likely to start ...,administration,Obama,2,3
1619,"Well, we do have challenges, but were not stup...",Trump,president,2,3
1620,"He was also fired from a season of ""The Celebr...","""Hulk Hogan",Hogan,2,1
1621,[The Hong Kong Monetary Authority] started sel...,The Hong Kong Monetary Authority,U.S.,2,1


In [12]:
# Read the other annotation of the test split: tab-separated, no header row,
# parsed into sentence, two entity columns and label.
column_names = ["Sentence", "Ent_1", "Ent_2", "Label"]
df_test_other_ner = pd.read_csv(
    'dataset/test.txt',
    sep='\t',
    header=None,
    names=column_names,
)

In [13]:
# Display the annotations loaded from 'dataset/test.txt'.
df_test_other_ner

Unnamed: 0,Sentence,Ent_1,Ent_2,Label
0,” [Breitbart New]s has now reached out to the ...,Breitbart New,Trump,0
1,"Phoebe Nora Mary Prince, 15, committed suicide...",Phoebe Prince,Mass.,0
2,"[Notably], the department says about 30 emails...",Notably,Benghazi,0
3,[Meacham] was recently featured by the San Die...,Meacham,Trump,0
4,But [Portland]'s city code prohibits persons f...,Portland,OregonLive.com,0
...,...,...,...,...
1618,The [Trump] administration is likely to start ...,Trump,Obama’,3
1619,"Well, we do have challenges, but were not stup...",McClatchy-Marist,Trump,3
1620,"He was also fired from a season of ""The Celebr...",Hogan,Trump,1
1621,[The Hong Kong Monetary Authority] started sel...,The Hong Kong Monetary Authority,U.S.,1


In [14]:
# Reload the cleaned sentences into a separate frame that carries only the
# original two columns.
column_names = ["Sentence", "Label"]
df_test_clean = pd.read_csv(
    'datasets_cleaned/test_cleaned.txt',
    sep='\t',
    header=None,
    names=column_names,
)

In [15]:
# Display the cleaned sentences and their labels.
df_test_clean

Unnamed: 0,Sentence,Label
0,” Breitbart News has now reached out to the Tr...,0
1,"Phoebe Nora Mary Prince, 15, committed suicide...",0
2,"Notably, the department says about 30 emails t...",0
3,Meacham was recently featured by the San Diego...,0
4,But Portland's city code prohibits persons fro...,0
...,...,...
1618,The Trump administration is likely to start di...,3
1619,"Well, we do have challenges, but were not stup...",3
1620,"He was also fired from a season of ""The Celebr...",1
1621,The Hong Kong Monetary Authority started selli...,1


In [16]:
# The three frames are combined row by row on their index, so they must cover
# the same rows; fail loudly instead of silently padding with NaN.
assert df_test_clean.index.equals(df_test_Form_sen_lab_sen_lab.index), \
    "clean sentences and own annotations have different indexes"
assert df_test_clean.index.equals(df_test_other_ner.index), \
    "clean sentences and reference annotations have different indexes"

# Side-by-side view of both annotations for every sentence.
# NOTE: the key "sentense_clean" keeps its original spelling because the
# column name is written to the CSV files produced later in the notebook.
df_comparing = pd.DataFrame({
    "sentense_clean": df_test_clean["Sentence"],
    "my_ent_1": df_test_Form_sen_lab_sen_lab["Ent_1"],
    "their_ent_1": df_test_other_ner["Ent_1"],
    "my_ent_2": df_test_Form_sen_lab_sen_lab["Ent_2"],
    "their_ent_2": df_test_other_ner["Ent_2"],
    "label": df_test_clean["Label"]  # Assuming label is the same across DataFrames
})

# Keep only rows where at least one entity differs. The comparison is
# position-sensitive: entity 1 is compared with entity 1, entity 2 with entity 2.
entity_mismatch = (
    (df_comparing["my_ent_1"] != df_comparing["their_ent_1"])
    | (df_comparing["my_ent_2"] != df_comparing["their_ent_2"])
)
df_comparing_wrong = df_comparing[entity_mismatch]

In [17]:
# Display the side-by-side comparison for all sentences.
df_comparing

Unnamed: 0,sentense_clean,my_ent_1,their_ent_1,my_ent_2,their_ent_2,label
0,” Breitbart News has now reached out to the Tr...,Trump,Breitbart New,Ivanka,Trump,0
1,"Phoebe Nora Mary Prince, 15, committed suicide...",Phoebe Nora Mary Prince,Phoebe Prince,Phoebe Prince,Mass.,0
2,"Notably, the department says about 30 emails t...",U. S.,Notably,Benghazi,Benghazi,0
3,Meacham was recently featured by the San Diego...,Meacham,Meacham,Trump,Trump,0
4,But Portland's city code prohibits persons fro...,Root,Portland,Ashton,OregonLive.com,0
...,...,...,...,...,...,...
1618,The Trump administration is likely to start di...,administration,Trump,Obama,Obama’,3
1619,"Well, we do have challenges, but were not stup...",Trump,McClatchy-Marist,president,Trump,3
1620,"He was also fired from a season of ""The Celebr...","""Hulk Hogan",Hogan,Hogan,Trump,1
1621,The Hong Kong Monetary Authority started selli...,The Hong Kong Monetary Authority,The Hong Kong Monetary Authority,U.S.,U.S.,1


In [18]:
# Display only the sentences where the two annotations disagree.
df_comparing_wrong

Unnamed: 0,sentense_clean,my_ent_1,their_ent_1,my_ent_2,their_ent_2,label
0,” Breitbart News has now reached out to the Tr...,Trump,Breitbart New,Ivanka,Trump,0
1,"Phoebe Nora Mary Prince, 15, committed suicide...",Phoebe Nora Mary Prince,Phoebe Prince,Phoebe Prince,Mass.,0
2,"Notably, the department says about 30 emails t...",U. S.,Notably,Benghazi,Benghazi,0
4,But Portland's city code prohibits persons fro...,Root,Portland,Ashton,OregonLive.com,0
5,FORMER Jacksonville tackle Tony Boselli thinks...,Tony Boselli,Jacksonville,they,AFC,0
...,...,...,...,...,...,...
1617,They cited a debate within the Obama administr...,They,the United State,administration,Syrian,3
1618,The Trump administration is likely to start di...,administration,Trump,Obama,Obama’,3
1619,"Well, we do have challenges, but were not stup...",Trump,McClatchy-Marist,president,Trump,3
1620,"He was also fired from a season of ""The Celebr...","""Hulk Hogan",Hogan,Hogan,Trump,1


In [19]:
# Write the disagreeing rows to CSV (runs unconditionally; the index is not written).
df_comparing_wrong.to_csv('df_comparing_wrong.csv', index=False)

**Возьмём 200 случайных записей и рассмотрим их вручную**

In [20]:
# Take a reproducible random sample of up to 200 disagreeing sentences.
# Capping at the frame length avoids the ValueError that sample() raises when
# asked for more rows than exist (without replacement).
sample_size = min(200, len(df_comparing_wrong))
df_sampled = df_comparing_wrong.sample(n=sample_size, random_state=42)

# Save the sampled DataFrame to a CSV file
df_sampled.to_csv('df_comparing_wrong_sampled.csv', index=False)