In [94]:
import json
from datetime import datetime, timedelta
import pandas as pd
import glob
import os
from sentence_transformers import SentenceTransformer, util

## Setting directory 

In [4]:
os.chdir("../..")

## Importing and Cleaning NYT Archive JSON:

In [5]:
articles = []

filepath = f"data/raw/02_nyt_jan_mar_2024.json"

In [77]:
with open(filepath, 'r') as f:
    data = json.load(f)
    for doc in data:
        articles.append({
            "nyt_title": doc.get("headline", {}).get("main", ""),
            "nyt_date": doc.get("pub_date", "")[:10],  # just the date
            "nyt_abstract": doc.get("abstract", ""),
            "nyt_url": doc.get("web_url", "")
        })

In [78]:
nyt_articles = pd.DataFrame(articles)
nyt_articles['nyt_date'] = pd.to_datetime(nyt_articles['nyt_date'])

## Importing ACLED Event Data

In [61]:
acled_path = f"data/raw/02_acled_event_jan_mar_24.csv"
acled_events = pd.read_csv(acled_path)

In [62]:
acled_events['event_date'] = pd.to_datetime(acled_events['event_date'])

# selecting just the relevant columns
acled_events_select = acled_events[["event_date", "year", "disorder_type",
                                   "event_type", "sub_event_type", "actor1", "actor2", 
                                   "interaction", "region", "country", "fatalities",
                                   "notes", "tags"]]

### Filtering on scale/impact

In [63]:
acled_events_filter = acled_events_select[acled_events_select["fatalities"] > 25]

## Joining ACLED and NYT 

In [64]:
# getting row counts 
len(acled_events_filter)
# len(nyt_articles)

284

### First, cross join on date, and then filter to where the dates are sufficiently similar

In [85]:
# defining func

# def cross_join_on_date_proximity(df_1, df_2, date_tolerance_days=10):
#     """Cross-join rows where dates are within ±date_tolerance_days."""
#     # nyt_df["nyt_date"] = pd.to_datetime(nyt_df["nyt_date"])
    
#     results = []
#     for _, df_2_row in df_2.iterrows():
#         event_date = df_2_row["event_date"]
#         start = event_date - timedelta(days=date_tolerance_days)
#         end = event_date + timedelta(days=date_tolerance_days)
#         matched_articles = df_1[(df_1["nyt_date"] >= start) & (df_1["nyt_date"] <= end)]
        
#         for _, df_1_row in matched_articles.iterrows():
#             joined = {**df_2_row.to_dict(), **df_1_row.to_dict()}
#             results.append(joined)
    
#     return pd.DataFrame(results)


def cross_join_on_date_proximity(events_df, news_df, date_tolerance_days=1):
    
    events_df['event_date'] = pd.to_datetime(events_df['event_date'])
    # print(news_df.head())
    news_df['nyt_date'] = pd.to_datetime(news_df['nyt_date'])
    
    events_df['cross_join_key'] = 1
    news_df['cross_join_key'] = 1
    
    # cross join
    merged = pd.merge(events_df, news_df, on = 'cross_join_key').drop('cross_join_key', axis = 1)
    merged['date_diff'] = (merged['nyt_date'] - merged['event_date']).dt.days
    # print(merged.head())
    
    # filter
    filtered = merged[merged['date_diff'] < date_tolerance_days].copy()
    
    # print(merged.dtypes)
    
    return(filtered)

In [86]:
merged_df = cross_join_on_date_proximity(acled_events_filter, nyt_articles, 2)

# dropping large differences in dates
# merged_df['event_date'] = pd.to_datetime(merged_df['event_date'])
# merged_df['nyt_date'] = pd.to_datetime(merged_df['nyt_date'])
# diff = (merged_df['event_date'] - merged_df['nyt_date']).abs().dt.days
# merged[diff <= 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_df['event_date'] = pd.to_datetime(events_df['event_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_df['cross_join_key'] = 1


In [88]:
# checking out cross join
merged_df.head()

Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,actor2,interaction,region,country,fatalities,notes,tags,nyt_title,nyt_date,nyt_abstract,nyt_url,date_diff
11705,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,Paradoxical Line of Amazement,2024-03-30,Blake Slonecker presents an erudite themeless ...,https://www.nytimes.com/2024/03/29/crosswords/...,1
11706,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,Quotation of the Day: A Percussive Poet Explor...,2024-03-30,"Quotation of the Day for Saturday, March 30, 2...",https://www.nytimes.com/2024/03/30/pageoneplus...,1
11707,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,"Corrections: March 30, 2024",2024-03-30,Corrections that appeared in print on Saturday...,https://www.nytimes.com/2024/03/30/pageoneplus...,1
11708,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,"A 500-Year Old Chapel, 438 Solar Panels and an...",2024-03-30,King’s College Chapel in Cambridge is one of s...,https://www.nytimes.com/2024/03/30/world/europ...,1
11709,2024-03-31,2024,Political violence,Battles,Armed clash,KNU/KNLA: Karen National Union/Karen National ...,Military Forces of Myanmar (2021-),State forces-Rebel group,Southeast Asia,Myanmar,40,"On 31 March 2024, near Maung Mei Shaung villag...",,"A Stork, a Fisherman and Their Unlikely Bond E...",2024-03-30,"Thirteen years ago, a stork landed on a fisher...",https://www.nytimes.com/2024/03/30/world/europ...,1


In [89]:
#nrows
len(merged_df)

215742

### Creating sentence embeddings for NYT abstracts and event descriptions, and then compute cosines

In [96]:
def embed_and_match(df, news_col='nyt_abstract', event_col='notes'):
    """
    Add a column indicating whether the abstract and notes likely refer to the same event.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame with abstract and notes columns.
        abstract_col (str): Name of the NYT abstract column.
        notes_col (str): Name of the event notes column.
        match_score (numeric): Cosine similarity of news/notes sentence embeddings

    Returns:
        pd.DataFrame: With added 'match_score' col.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode both text columns
    abstracts = df[news_col].fillna("").tolist()
    notes = df[event_col].fillna("").tolist()

    emb_abstracts = model.encode(abstracts, convert_to_tensor=True)
    emb_notes = model.encode(notes, convert_to_tensor=True)

    # Compute cosine similarities
    similarities = util.cos_sim(emb_abstracts, emb_notes).diagonal()

    df['match_score'] = similarities.cpu().numpy()

    return df


In [None]:
# applying the fn

merged_df_sim = embed_and_match(merged_df)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]