In [73]:
import json
from datetime import datetime, timedelta
import pandas as pd
import glob
import os
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Setting directory 

In [5]:
os.chdir("../..")

## Importing and Cleaning NYT Archive JSON:

In [6]:
articles = []

filepath = f"data/raw/02_nyt_jan_mar_2024.json"

In [7]:
with open(filepath, 'r') as f:
    data = json.load(f)
    for doc in data:
        articles.append({
            "nyt_title": doc.get("headline", {}).get("main", ""),
            "nyt_date": doc.get("pub_date", "")[:10],  # just the date
            "nyt_abstract": doc.get("abstract", ""),
            "nyt_url": doc.get("web_url", "")
        })

In [60]:
nyt_articles = pd.DataFrame(articles)
nyt_articles['nyt_date'] = pd.to_datetime(nyt_articles['nyt_date'])

# indexing the dataframe:
nyt_articles['news_index'] = nyt_articles.index

In [88]:
print(max(nyt_articles['news_index']))

11817


## Importing ACLED Event Data

In [61]:
acled_path = f"data/raw/02_acled_event_jan_mar_24.csv"
acled_events = pd.read_csv(acled_path)

In [98]:
acled_events['event_date'] = pd.to_datetime(acled_events['event_date'])

# selecting just the relevant columns
acled_events_select = acled_events[["event_date", "year", "disorder_type",
                                   "event_type", "sub_event_type", "actor1", "actor2", 
                                   "interaction", "region", "country", "fatalities",
                                   "notes", "tags"]]

### Filtering on scale/impact

In [109]:
acled_events_filter = acled_events_select[acled_events_select["fatalities"] > 25]

# indexing the dataframe
acled_events_filter['event_index'] = acled_events_filter.reset_index().index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acled_events_filter['event_index'] = acled_events_filter.reset_index().index


## Compute Sentence Embeddings for Event Descriptions and News Abstracts

In [64]:
def compute_and_cache_embeddings(df, text_column, model, cache_path):
    """
    Computes and saves embeddings for a text column.
    """
    texts = df[text_column].fillna("").tolist()
    embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    np.save(cache_path, embeddings)
    return embeddings

In [66]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute or load cached embeddings for NYT
try:
    nyt_embeddings = np.load(f"data/mst/sentence_embeddings/01_nyt_embeddings.npy")
    print("Loaded NYT embeddings from cache.")
except FileNotFoundError:
    nyt_embeddings = compute_and_cache_embeddings(nyt_articles, "nyt_abstract", model, 
                                                  f"data/mst/sentence_embeddings/01_nyt_embeddings.npy")


Loaded NYT embeddings from cache.


In [43]:
# Compute or load cached embeddings for ACLED events

try:
    event_embeddings = np.load(f"data/mst/sentence_embeddings/02_event_embeddings.npy")
    print("Loaded event embeddings from cache.")
except FileNotFoundError:
    event_embeddings = compute_and_cache_embeddings(acled_events_filter, "notes", model, 
                                                    f"data/mst/sentence_embeddings/event_embeddings.npy")

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [96]:
len(acled_events_filter)

284

### Computing cosine similarities for news and event embeddings

In [97]:
cossim_news_events = cosine_similarity(nyt_embeddings, event_embeddings)
len(cossim_news_events)

11818

## Joining ACLED and NYT 

In [68]:
# getting row counts 
len(acled_events_filter)
# len(event_embeddings)

284

### First, cross join on date, and then filter to where the dates are sufficiently similar

In [110]:
# defining func

# def cross_join_on_date_proximity(df_1, df_2, date_tolerance_days=10):
#     """Cross-join rows where dates are within ±date_tolerance_days."""
#     # nyt_df["nyt_date"] = pd.to_datetime(nyt_df["nyt_date"])
    
#     results = []
#     for _, df_2_row in df_2.iterrows():
#         event_date = df_2_row["event_date"]
#         start = event_date - timedelta(days=date_tolerance_days)
#         end = event_date + timedelta(days=date_tolerance_days)
#         matched_articles = df_1[(df_1["nyt_date"] >= start) & (df_1["nyt_date"] <= end)]
        
#         for _, df_1_row in matched_articles.iterrows():
#             joined = {**df_2_row.to_dict(), **df_1_row.to_dict()}
#             results.append(joined)
    
#     return pd.DataFrame(results)


def cross_join_on_date_proximity(events_df, news_df, date_tolerance_days=1):
    
    events_df['event_date'] = pd.to_datetime(events_df['event_date'])
    # print(news_df.head())
    news_df['nyt_date'] = pd.to_datetime(news_df['nyt_date'])
    
    events_df['cross_join_key'] = 1
    news_df['cross_join_key'] = 1
    
    # cross join
    merged = pd.merge(events_df, news_df, on = 'cross_join_key').drop('cross_join_key', axis = 1)
    merged['date_diff'] = (merged['nyt_date'] - merged['event_date']).dt.days
    # print(merged.head())
    
    # filter
    filtered = merged[(merged['date_diff'] < date_tolerance_days) & (merged['date_diff'] >=0)].copy()
    
    # print(merged.dtypes)
    
    return(filtered)

In [111]:
merged_df = cross_join_on_date_proximity(acled_events_filter, nyt_articles, date_tolerance_days = 5)

# dropping large differences in dates
# merged_df['event_date'] = pd.to_datetime(merged_df['event_date'])
# merged_df['nyt_date'] = pd.to_datetime(merged_df['nyt_date'])
# diff = (merged_df['event_date'] - merged_df['nyt_date']).abs().dt.days
# merged[diff <= 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_df['event_date'] = pd.to_datetime(events_df['event_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_df['cross_join_key'] = 1


In [114]:
# checking out cross join
merged_df.head(2)

Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,actor2,interaction,region,country,fatalities,notes,tags,event_index,nyt_title,nyt_date,nyt_abstract,nyt_url,news_index,date_diff
11774,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,0,"Chance Perdomo, Star of ‘Chilling Adventures o...",2024-03-31,"Mr. Perdomo, who died in a motorcycle accident...",https://www.nytimes.com/2024/03/30/arts/televi...,11774,0
11775,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,0,A Close-Up View of the Baltimore Bridge Collapse,2024-03-31,"From roughly 100 yards away, the site of one o...",https://www.nytimes.com/2024/03/30/us/view-shi...,11775,0


In [113]:
#nrows
len(merged_df)

183469

### Looping through each row of merged dataframe and grabbing cos sims

In [121]:
merged_df_with_similarities = merged_df.assign(cosine_sim = lambda x: cossim_news_events[x['news_index'], x['event_index']])

In [126]:
# checking out merged df after 

merged_df_with_similarities.head(2)
# print(cossim_news_events[11775, 0])

Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,actor2,interaction,region,country,...,notes,tags,event_index,nyt_title,nyt_date,nyt_abstract,nyt_url,news_index,date_diff,cosine_sim
11774,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,...,"On 31 March 2024, near Maung Mei Shaung villag...",,0,"Chance Perdomo, Star of ‘Chilling Adventures o...",2024-03-31,"Mr. Perdomo, who died in a motorcycle accident...",https://www.nytimes.com/2024/03/30/arts/televi...,11774,0,0.01421
11775,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,...,"On 31 March 2024, near Maung Mei Shaung villag...",,0,A Close-Up View of the Baltimore Bridge Collapse,2024-03-31,"From roughly 100 yards away, the site of one o...",https://www.nytimes.com/2024/03/30/us/view-shi...,11775,0,0.166382


## Exportin 

In [128]:
merged_df_with_similarities.to_csv(f"data/mst/01_event_news_similarity_build.csv", index = False)

# Scratch

In [119]:
cosine_scores = []

for i, row in merged_df.head(10).iterrows():
    news_index = row['news_index']
    
    event_index = row['event_index']
    
    # print(f"matching news item {news_index} with event_item {event_index}")
    
    cosine_scores.append(cossim_news_events[news_index, event_index])
    

In [81]:

X = [[0, 0, 0], [1, 1, 1], [1,2,3], [1,2,3]]
Y = [[1, 0, 0], [1, 1, 0], [1,2,3], [1,2,5]]
test_cos = cosine_similarity(X, Y)

test_cos[3,3]

0.9759000729485332

### Creating sentence embeddings for NYT abstracts and event descriptions, and then compute cosines

In [38]:
def embed_and_match(df, news_col='nyt_abstract', event_col='notes'):
    """
    Add a column indicating whether the abstract and notes likely refer to the same event.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame with abstract and notes columns.
        abstract_col (str): Name of the NYT abstract column.
        notes_col (str): Name of the event notes column.
        match_score (numeric): Cosine similarity of news/notes sentence embeddings

    Returns:
        pd.DataFrame: With added 'match_score' col.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode both text columns
    abstracts = df[news_col].fillna("").tolist()
    notes = df[event_col].fillna("").tolist()

    emb_abstracts = model.encode(abstracts, convert_to_tensor=True)
    emb_notes = model.encode(notes, convert_to_tensor=True)

    # Compute cosine similarities
    similarities = util.cos_sim(emb_abstracts, emb_notes).diagonal()

    df['match_score'] = similarities.cpu().numpy()

    return df


In [20]:
# applying the fn

merged_df_sim = embed_and_match(merged_df.head(100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['match_score'] = similarities.cpu().numpy()


In [22]:
merged_df_sim.head(10)

Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,actor2,interaction,region,country,fatalities,notes,tags,nyt_title,nyt_date,nyt_abstract,nyt_url,date_diff,match_score
0,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,PGA Tour and Saudi-Backed LIV Extend Deadline ...,2024-01-01,The tentative deal for the men’s golf circuits...,https://www.nytimes.com/2023/12/31/business/de...,-90,0.031312
1,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,Something to Whistle,2024-01-01,Harry Zheng makes his New York Times debut.,https://www.nytimes.com/2023/12/31/crosswords/...,-90,-0.045963
2,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,U.S. Helicopters Sink 3 Houthi Boats in Red Se...,2024-01-01,Iranian-backed Houthi gunmen from Yemen had fi...,https://www.nytimes.com/2023/12/31/world/middl...,-90,0.245386
3,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,"In Times Square, Hundreds of Thousands Ring In...",2024-01-01,New Year’s celebrations took place as proteste...,https://www.nytimes.com/2023/12/31/nyregion/ti...,-90,0.110684
4,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,"Quotation of the Day: In a Jewish-Arab School,...",2024-01-01,"Quotation of the Day for Monday, January 1, 2024.",https://www.nytimes.com/2023/12/31/pageoneplus...,-90,0.115128
5,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,"Half of Gazans Are at Risk of Starving, U.N. W...",2024-01-01,More than 90 percent of Palestinians in the te...,https://www.nytimes.com/2024/01/01/world/middl...,-90,0.036945
6,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,How to Start the New Year? Keep the Sea Goddes...,2024-01-01,Followers of Afro-Brazilian religions have bee...,https://www.nytimes.com/2024/01/01/world/ameri...,-90,-0.005576
7,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,What’s Sapping Your Energy?,2024-01-01,Your results will help you figure out how to h...,https://www.nytimes.com/interactive/2024/01/01...,-90,0.028096
8,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,Day 1: A 5-Minute Trick for More Energy,2024-01-01,Today’s challenge involves a little daytime rest.,https://www.nytimes.com/2024/01/01/well/mind/e...,-90,0.136549
9,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,Monday Briefing,2024-01-01,Friction between the U.S. and Israel.,https://www.nytimes.com/2024/01/01/briefing/is...,-90,0.038894


In [45]:
nyt_embeddings[1]

array([-7.69370748e-03, -4.57484238e-02, -5.64977992e-03, -2.02745814e-02,
       -4.23186049e-02,  2.65126433e-02,  9.64816380e-03, -9.83617548e-03,
        7.22916797e-03, -3.89896631e-02,  4.58503654e-03,  6.54589906e-02,
       -8.20511952e-02,  1.24800600e-01,  1.70991626e-02,  5.32262102e-02,
        6.18778206e-02, -3.82934846e-02,  5.59193008e-02, -7.00050592e-02,
        2.94650029e-02, -4.53616716e-02,  7.30262548e-02, -3.31542082e-02,
        1.38780251e-02, -9.75425988e-02,  3.04665081e-02, -1.75592601e-02,
        7.41885453e-02, -1.94415972e-02, -6.93988577e-02,  2.94309156e-03,
       -3.19849029e-02, -2.70380042e-02, -2.09607743e-02, -1.33955455e-03,
        8.22129548e-02,  6.84462190e-02,  4.37879041e-02,  3.82863395e-02,
        9.13516730e-02, -6.63274666e-03, -5.57915866e-02,  9.30275768e-02,
       -2.84316260e-02, -2.94858981e-02,  3.26012745e-02,  8.18612576e-02,
       -1.40785528e-02,  3.61860730e-02, -8.80929269e-03,  8.55615595e-04,
       -3.83882001e-02, -