In [7]:
from frame_semantic_transformer import FrameSemanticTransformer 
import pickle
import ssl
import spacy
import pandas as pd
import nltk
import pickle
import logging
logger = logging.getLogger()
from spacy import displacy

In [2]:
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [3]:
nlp = spacy.load("en_core_web_sm")

In [39]:
## Spacy NLP Pipeline   

with open('../data/custom-lexicon-physical-consequences-war.txt', 'rb') as f:
    consequences_lexicon = f.read().decode('utf-8').split('\n')
    consequences_lexicon = [x.lower() for x in consequences_lexicon]

with open('../data/custom-lexicon-action-verbs.txt', 'rb') as f:
    action_verbs_lexicon = f.read().decode('utf-8').split('\n')
    action_verbs_lexicon = [x.lower() for x in action_verbs_lexicon]

def process_text(text: str, lexicon: list):
    doc = nlp(text)

    ## Extracting the entities
    for ent in doc.ents: 
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

    ## Extracting the subject and object of the sentence using dependency parsing
    # Find the main verb and its subject and objects
    for token in doc:
        if token.pos_ == 'VERB' and token.lemma_ in lexicon:
            print(token.text, token.lemma_, token.pos_, token.dep_)
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                action = token.head  # The main action verb
                subject = token  # The subject performing the action
                print(f"Action: {action.lemma_}, Subject: {subject.text}")

                # Explore children of the main verb
                for child in action.children:
                    # Look for direct objects or prepositions that might have objects
                    if child.dep_ in ["dobj", "prep"]:
                        if child.dep_ == "prep" and child.n_rights > 0:  # Handling prepositional phrases
                            for subchild in child.children:
                                if subchild.dep_ == "pobj" and subchild.ent_type_ == "CARDINAL":
                                    # Check if the parent of the number is a noun related to people
                                    for grandchild in subchild.children:
                                        if grandchild.dep_ == "compound":
                                            print(f"Number affected: {subchild.text}, Affected group: {grandchild.text} {subchild.head.text}")
                        elif child.dep_ == "dobj":
                            # Check for numeric modifiers within the direct object
                            affected_entity = child.text
                            for subchild in child.children:
                                if subchild.dep_ == "nummod":
                                    print(f"Number affected by {action.lemma_}: {subchild.text} {affected_entity}")
    
    displacy.render(doc, style="dep")

In [40]:
article_text = "The airstrikes have killed at least 11,025 Palestinians, including 4,506 children, and wounded more than 27,000 others so far, according to the Palestinian Ministry of Health in Ramallah, which draws its figures from sources in Hamas-run Gaza."
process_text(article_text, action_verbs_lexicon)

at least 11,025 27 42 CARDINAL
Palestinians 43 55 NORP
4,506 67 72 CARDINAL
more than 27,000 95 111 CARDINAL
the Palestinian Ministry of Health 140 174 ORG
Ramallah 178 186 GPE
Hamas 228 233 ORG
Gaza 238 242 GPE
killed kill VERB ROOT
wounded wound VERB conj


In [42]:
article_text

'The airstrikes have killed at least 11,025 Palestinians, including 4,506 children, and wounded more than 27,000 others so far, according to the Palestinian Ministry of Health in Ramallah, which draws its figures from sources in Hamas-run Gaza.'

In [41]:
frame_transformer = FrameSemanticTransformer()
frame_transformer.detect_frames(article_text)

DetectFramesResult(sentence='The airstrikes have killed at least 11,025 Palestinians, including 4,506 children, and wounded more than 27,000 others so far, according to the Palestinian Ministry of Health in Ramallah, which draws its figures from sources in Hamas-run Gaza.', trigger_locations=[4, 20, 57, 73, 87, 127, 137, 217], frames=[FrameResult(name='Attack', trigger_location=4, frame_elements=[]), FrameResult(name='Killing', trigger_location=20, frame_elements=[FrameElementResult(name='Cause', text='The airstrikes'), FrameElementResult(name='Victim', text='at least 11,025 Palestinians, including 4,506 children')]), FrameResult(name='Inclusion', trigger_location=57, frame_elements=[FrameElementResult(name='Total', text='11,025 Palestinians'), FrameElementResult(name='Part', text='4,506 children')]), FrameResult(name='People_by_age', trigger_location=73, frame_elements=[FrameElementResult(name='Person', text='children')]), FrameResult(name='Cause_harm', trigger_location=87, frame_elem

In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
df_ME = pd.read_csv('../data/raw/filtered_data/MiddleEast.csv')
df_UK = pd.read_csv('../data/raw/filtered_data/UK.csv')
df_US = pd.read_csv('../data/raw/filtered_data/US.csv') 

print(df_ME.shape)
print(df_UK.shape)
print(df_US.shape)

(6369, 7)
(9116, 7)
(9982, 7)
