In [None]:
from frame_semantic_transformer import FrameSemanticTransformer
import pickle
import logging
logger = logging.getLogger()
import spacy
import pandas as pd

In [None]:
# Load the spaCy "en_core_web_sm" pipeline once at notebook level;
# process_article() calls this object to split articles into sentences.
nlp = spacy.load("en_core_web_sm")

In [None]:
import pandas as pd
import pickle

def process_article(article_text: str):
    """Split an article into sentences and drop the blank ones.

    Args:
        article_text: Raw article body to segment with the module-level
            spaCy pipeline ``nlp``.

    Returns:
        A list of sentence strings in document order, each with leading and
        trailing whitespace removed; sentences that are empty after
        stripping are omitted.
    """
    cleaned_sentences = []
    for span in nlp(article_text).sents:
        stripped = span.text.strip()
        if stripped:
            cleaned_sentences.append(stripped)
    return cleaned_sentences

def get_frame_semantics_docs(filename: str, region: str, batch_size: int = 32):
    """Extract frame semantics for every article in a CSV and pickle the results.

    Reads ``filename`` with pandas, splits each row's ``maintext`` into
    sentences via ``process_article``, runs the sentences through
    ``FrameSemanticTransformer.detect_frames_bulk`` in chunks of
    ``batch_size``, and persists the accumulated output with ``save_results``
    (a checkpoint every 100 successfully processed articles, plus a final
    save at the end).

    Args:
        filename: Path of the CSV to read. Each row is accessed by the
            ``title`` and ``maintext`` columns.
        region: Region label forwarded to ``save_results``.
        batch_size: Number of sentences passed to the transformer per call;
            also given to the ``FrameSemanticTransformer`` constructor.
            Defaults to 32, the value that used to be hard-coded.

    Returns:
        None. Results are written to disk by ``save_results``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    df = pd.read_csv(filename)
    print(df.shape)
    frame_transformer = FrameSemanticTransformer(batch_size=batch_size)
    results = {"index": [], "frame_semantics": [], 'title': []}

    for index, row in df.iterrows():
        try:
            headline = row['title']
            article_text = row['maintext']

            # Rows without a usable article body are skipped explicitly rather
            # than being fed to spaCy and surfacing as a generic error.
            # NOTE(review): presumably these are NaN values from empty CSV cells.
            if not isinstance(article_text, str):
                logger.warning(f"Skipping index {index}: 'maintext' is not a string")
                continue

            sentences = process_article(article_text)
            print(len(sentences))

            doc_semantics = []
            for start in range(0, len(sentences), batch_size):
                batch = sentences[start:start + batch_size]
                batch_semantics = frame_transformer.detect_frames_bulk(batch)
                # Full result objects are large; keep them out of the cell
                # output and only emit them when debug logging is enabled.
                logger.debug("Frame semantics for index %s: %s", index, batch_semantics)
                doc_semantics.extend(batch_semantics)
        except Exception as e:
            # Best effort: one bad article must not abort the whole run.
            # logger.exception also records the traceback.
            logger.exception(f"Error processing index {index}: {e}")
            continue

        results['title'].append(headline)
        results['index'].append(index)
        results['frame_semantics'].append(doc_semantics)

        # Checkpoint every 100 stored articles. A failed checkpoint is reported
        # as what it is (not as an article-processing error) and the run goes
        # on; the final save below will try again.
        if len(results['index']) % 100 == 0:
            try:
                save_results(results, region)
            except Exception as e:
                logger.exception(f"Error saving checkpoint for {region} at index {index}: {e}")

    # Save remaining results at the end of processing
    if results['index']:
        save_results(results, region)

def save_results(results, region):
    """Pickle ``results`` to the per-region output file, replacing it atomically.

    The output directory is created when it does not exist. The data is first
    written to a sibling ``.tmp`` file and then moved into place with
    ``os.replace``, so an interruption while writing cannot leave a truncated
    pickle where a previous complete one existed.

    Args:
        results: Picklable object to store.
        region: Region label interpolated into the output file name.

    Raises:
        OSError: If the directory or file cannot be written.
        pickle.PicklingError: If ``results`` cannot be pickled.
    """
    import os  # local import so the block does not depend on a new file-level import

    filename = f"../data/processed/doc_semantics/frame_semantics_{region}.pickle"
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, 'wb') as f:
            pickle.dump(results, f)
        os.replace(tmp_filename, filename)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up a partial file left behind by a failed write.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
    print(f"Saved results to {filename}")

# Driver: run the frame-semantics extraction once per region. Each region is
# read from its own CSV under ../data/raw/filtered_data/ and the region label
# is passed along so save_results() can name the output file.
if __name__ == "__main__":
    regions = ["UK", "US", "MiddleEast"]
    for region in regions:
        get_frame_semantics_docs(f"../data/raw/filtered_data/{region}.csv", region)

In [None]:
# Load the pickled results for the UK region (the path matches the pattern
# save_results() writes). pickle.load can execute arbitrary code, so only
# load files produced by this pipeline.
with open("../data/processed/doc_semantics/frame_semantics_UK.pickle", "rb") as f:
    frame_semantics = pickle.load(f)

In [None]:
# Display element 0 of entry 3 in the stored 'frame_semantics' list
# (presumably the first sentence's result for that document — confirm).
frame_semantics['frame_semantics'][3][0]

In [None]:
# Print the first stored result of every document. Each element of
# frame_semantics['frame_semantics'] is one document's result list, and a
# document whose article yielded no sentences is stored as an empty list,
# so skip those instead of raising IndexError.
for doc_semantics in frame_semantics['frame_semantics']:
    if doc_semantics:
        print(doc_semantics[0])

In [20]:
# Load the raw article table of a single region for manual inspection.
import pandas as pd
regions = ["UK", "US", "MiddleEast"]
region = regions[2]  # index 2 selects "MiddleEast"
df = pd.read_csv(f"../data/raw/filtered_data/{region}.csv")

In [26]:
# Pull the 'maintext' column out of the DataFrame as an array so it can be
# scanned by position.
text = df['maintext'].values

In [37]:
# Print the position of every article whose body contains a known sentence.
# Entries that are not strings are skipped (presumably NaN for missing
# article bodies — confirm).
for position, article in enumerate(text):
    if isinstance(article, str) and "They then lost contact with the team and with Hind" in article:
        print(position)

1712


In [39]:
# Load the pickled results for the MiddleEast region (the path matches the
# pattern save_results() writes). pickle.load can execute arbitrary code, so
# only load files produced by this pipeline.
import pickle
with open("../data/processed/doc_semantics/frame_semantics_MiddleEast.pickle", "rb") as f:
    frame_semantics = pickle.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [47]:
# Display the stored results at list position 1299. This position is not
# necessarily DataFrame row 1299: get_frame_semantics_docs() skips rows that
# fail, and records the original row label in frame_semantics['index'].
(frame_semantics['frame_semantics'][1299])

[DetectFramesResult(sentence='Occupied Jerusalem, SANA- Dozens of Palestinians were martyred and others were injured as a result of the continued Israeli aggression on the war-ravaged Gaza Strip for the 131st day in a row.', trigger_locations=[0, 67, 79, 179], frames=[FrameResult(name='Residence', trigger_location=0, frame_elements=[FrameElementResult(name='Location', text='Jerusalem')]), FrameResult(name='Increment', trigger_location=67, frame_elements=[FrameElementResult(name='Class', text='others')]), FrameResult(name='Cause_harm', trigger_location=79, frame_elements=[FrameElementResult(name='Victim', text='others'), FrameElementResult(name='Explanation', text='as a result of the continued Israeli aggression on the war-ravaged Gaza Strip')]), FrameResult(name='Calendric_unit', trigger_location=179, frame_elements=[FrameElementResult(name='Count', text='131st'), FrameElementResult(name='Unit', text='day')])]),
 DetectFramesResult(sentence='Wafa Agency reported that during the past ho

In [56]:
# Load a different results pickle for the UK region.
# NOTE(review): this path (processed/frame_semantics/..._UK_main.pickle) is
# not the one save_results() in this notebook writes
# (processed/doc_semantics/frame_semantics_{region}.pickle) — confirm its origin.
import pickle
with open("../data/processed/frame_semantics/frame_semantics_UK_main.pickle", "rb") as f: 
    data = pickle.load(f)

In [57]:
# Number of entries recorded under the 'index' key of the loaded results.
len(data['index'])

2001