In [2]:
import json
from datetime import datetime, timedelta
import pandas as pd
import glob
import os
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


## Setting Directory

In [8]:
# Project root. Set the NEWS_COVERAGE_DIR environment variable to run this notebook
# on another machine; the default is the original local path, so existing runs are
# unchanged. The relative "data/mst/..." paths used below resolve against this directory.
os.chdir(os.environ.get("NEWS_COVERAGE_DIR", "/Users/alexlin/Desktop/R For Substack/news_coverage/"))

In [9]:
# Display the current working directory to confirm the chdir above took effect.
os.getcwd()

'/Users/alexlin/Desktop/R For Substack/news_coverage'

## Importing Data

In [10]:
# NYT articles table, read from the CSV under data/mst/.
nyt_mst_path = "data/mst/00_nyt_articles.csv"
nyt_articles = pd.read_csv(nyt_mst_path)

# Filtered ACLED events table, read from the CSV under data/mst/.
acled_mst_path = "data/mst/00b_acled_events_filter.csv"
acled_events_filter = pd.read_csv(acled_mst_path)

## Importing NYT embeddings

In [11]:
# Load the previously saved NYT embeddings from their .npy file.
nyt_embeddings = np.load("data/mst/sentence_embeddings/01_nyt_embeddings.npy")

In [13]:
# Number of entries in the loaded embeddings (presumably one per NYT article — TODO confirm).
len(nyt_embeddings)

11818

## Grouping ACLED events

In [14]:
# Preview the first two filtered ACLED events to check the available columns.
acled_events_filter.head(2)

Unnamed: 0.1,Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,actor2,interaction,region,country,fatalities,notes,tags,event_index,cross_join_key
0,58,2024-03-31,2024,Political violence,Battles,Armed clash,Murle Ethnic Militia (South Sudan),Anyuak Ethnic Militia (South Sudan),Identity militia-Identity militia,Eastern Africa,South Sudan,10,"On 31 March 2024, suspected members of Murle a...",,0,1
1,62,2024-03-31,2024,Political violence,Violence against civilians,Attack,Rapid Support Forces,Civilians (Sudan),Rebel group-Civilians,Northern Africa,Sudan,9,"Around 31 March 2024 (as reported), RSF attack...",,1,1


In [84]:
# Collapse the filtered events to one row per (country, actor1, actor2) combination:
# fatalities are summed and the free-text notes are concatenated into a single string.
# NOTE(review): groupby drops rows whose key is NaN by default, so events with a
# missing actor2 would not appear in the result — confirm that is intended.
acled_events_grouped = (
    acled_events_filter
    .groupby(['country', 'actor1', 'actor2'])
    .agg({
        'fatalities': 'sum',
        # Skip missing notes and coerce to str so a single NaN cannot break the join.
        'notes': lambda x: " ".join(x.dropna().astype(str)),
    })
    .reset_index()
)

# Remove date phrases of the form "On 31 March 2024, " from the notes column.
# Raw string: \d, \s and \w are regex escapes, not Python string escapes (the
# non-raw form triggers an invalid-escape warning on current Python versions).
# NOTE(review): notes phrased differently, e.g. "Around 31 March 2024 (as reported), ",
# do not match this pattern and keep their date.
date_pattern = r"On \d{1,2}\s+\w*\s+\d{4}, "
acled_events_grouped['notes'] = acled_events_grouped['notes'].str.replace(date_pattern, '', regex=True)

# Explicit row counter for each grouped combination (copy of the 0..n-1 index).
acled_events_grouped['index'] = acled_events_grouped.index

In [85]:
# Preview the first ten grouped rows to check the aggregation and the cleaned notes.
acled_events_grouped.head(10)

Unnamed: 0,country,actor1,actor2,fatalities,notes,index
0,Afghanistan,Islamic State Khorasan Province (ISKP),Military Forces of Afghanistan (2021-),22,21-43 Taliban members and civilians were kille...,0
1,Afghanistan,Military Forces of Pakistan (2022-),Civilians (Afghanistan),6,"six civilians, three women and three children,...",1
2,Afghanistan,National Resistance Front,Military Forces of Afghanistan (2021-),9,five Taliban members were killed and two were ...,2
3,Afghanistan,Unidentified Armed Group (Afghanistan),Civilians (Afghanistan),9,Explosive remnants of war: at least nine child...,3
4,Angola,FLEC-FAC: Front for the Liberation of the Encl...,Military Forces of Angola (1975-),10,FLEC-FAC attacked a patrol of the Angolan Arme...,4
5,Benin,Military Forces of Benin (2016-),JNIM: Group for Support of Islam and Muslims,13,Beninese military clashed with suspected JNIM ...,5
6,Brazil,Military Forces of Brazil (2023-) Military Police,Unidentified Gang (Brazil),6,"in Londrina (Parana), six drug trafficking sus...",6
7,Brazil,Unidentified Gang and/or Police Militia,Unidentified Gang (Brazil),6,"in Alto Garcas (Mato Grosso), unknown armed in...",7
8,Burkina Faso,Islamic State Sahel Province (ISSP),Civilians (Burkina Faso),75,IS Sahel militants attacked a Catholic church ...,8
9,Burkina Faso,Islamic State Sahel Province (ISSP),Military Forces of Burkina Faso (2022-),8,IS Sahel militants clashed with soldiers at ea...,9


## Creating sentence embeddings for grouped event notes:

In [81]:
# Sentence-embedding model ('all-MiniLM-L6-v2') used below to encode the grouped event notes.
model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_and_cache_embeddings(df, text_column, model, cache_path, reset = False):
    """
    Return embeddings for a text column, using an on-disk .npy cache when possible.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to embed.
    text_column : str
        Name of the column in ``df`` whose values are encoded; missing values
        are replaced with the empty string before encoding.
    model : object
        Anything exposing ``encode(texts, batch_size=..., show_progress_bar=...,
        convert_to_numpy=...)``, e.g. the SentenceTransformer created in this notebook.
    cache_path : str
        Path of the ``.npy`` file used as the cache.
    reset : bool, default False
        If True, ignore any existing cache, recompute, and overwrite it.

    Returns
    -------
    numpy.ndarray
        One embedding per row of ``df``.

    Notes
    -----
    The cache is only used when it exists and holds exactly ``len(df)`` rows.
    A missing or mismatched cache triggers a recompute instead of raising or
    silently returning embeddings that no longer line up with ``df``.
    """
    if not reset and os.path.exists(cache_path):
        embeddings = np.load(cache_path)
        if len(embeddings) == len(df):
            print("Loaded embeddings from cache.")
            return embeddings
        # The frame changed since the cache was written (e.g. a cell was re-run
        # out of order), so the cached rows can no longer be matched to df.
        print("Cached embeddings do not match the data length; recomputing.")

    texts = df[text_column].fillna("").tolist()
    embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)

    # np.save does not create missing parent directories.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    np.save(cache_path, embeddings)
    return embeddings

In [82]:
# reset=True: always re-encode the grouped notes and overwrite the cached .npy file.
grouped_event_embeddings = compute_and_cache_embeddings(
    acled_events_grouped,
    "notes",
    model,
    "data/mst/sentence_embeddings/02_events_grouped_embeddings.npy",
    reset=True,
)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

## Cross joining news and events