# Model 1: Day-level Aggregation

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
from datetime import datetime, timedelta
import tqdm

Loose Definitions:

State 0: 0 fatalities, only demonstrations for disorder type, and no civilians targeting

State 1: Political violence events and no fatalities, or demonstrations, or civilians targeted but and under 3 fatalities

State 2: Political violence events and between 1 and 5 fatalities with civilians targeted, or no civilians targeted and at least 3 fatalities

State 3: Over 5 fatalities

Features: 

'event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp', 'population_best'

Ones we'll use (after already having honed in on a day and a story): 'disorder_type', 'civilian_targeting', 'latitude', 'longitude', 'geo_precision', 'fatalities'

Initially I was going to use 'admin1', but given the number of administrative regions, this might be too complex to aggregate over days, I'll have to think about it more...

TODO? Include 'notes'?

In [3]:
df = pd.read_csv("2023-11-01-2024-10-31.csv")
list_all_admin = []
list_all_admin.extend(df['admin1']); list_all_admin.extend(df['admin2']); list_all_admin.extend(df['admin3'])
list_all_admin = list(set(list_all_admin))

admin_encoding = { admin : i for i, admin in enumerate(list_all_admin) }

In [99]:
def encode_disorder_type(disorder_type):
    disorder_type = disorder_type.lower()
    if 'political violence' in disorder_type:
        return 4 / 7
    if 'strategic developments' in disorder_type:
        return 2 / 7
    return 1 / 7

def embed_day(day_df):
    # aggregate over all events that day, taking the highest severity
    disorders = sum([ encode_disorder_type(type) for type in set(day_df['disorder_type']) ])

    civilian_targeting = 0
    for dude in df['interaction']:
        if 'civilian' in dude.lower():
            civilian_targeting = 1
            break
    
    fatalities = sum(day_df['fatalities']) / 200
    return [ disorders, civilian_targeting, fatalities ]

def embed_story(story_df):
    date_format = '%Y-%m-%d'
    date_objects = None
    try:
        date_objects = [ datetime.strptime(date, date_format) for date in story_df['event_date'] ]
    except Exception:
        date_format = '%d %B %Y'
        date_objects = [ datetime.strptime(date, date_format) for date in story_df['event_date'] ]
    first_date = min(date_objects)
    last_date = max(date_objects)
    current_date = first_date

    embedding = []
    while current_date <= last_date:
        day_df = story_df[story_df['event_date'] == current_date.strftime(date_format)]
        embedding.append(embed_day(day_df))
        current_date += timedelta(days=1)
    return torch.tensor(embedding)

In [87]:
columns_to_check = ['actor1', 'assoc_actor_1', 'actor2', 'assoc_actor_2']
df_placeholder = df[columns_to_check].fillna("MISSING")
duplicate_groups = df_placeholder.groupby(columns_to_check).indices
sorted_stories = [ (len(value), key, index) for index, (key, value) in enumerate(duplicate_groups.items()) ]
sorted_stories = sorted(sorted_stories, reverse=True)
sorted_stories

[(15680,
  ('Military Forces of Russia (2000-)', 'MISSING', 'MISSING', 'MISSING'),
  9652),
 (14296,
  ('Military Forces of Israel (2022-)', 'MISSING', 'MISSING', 'MISSING'),
  8361),
 (14082,
  ('Military Forces of Russia (2000-)',
   'MISSING',
   'Military Forces of Ukraine (2019-)',
   'MISSING'),
  9660),
 (7703,
  ('Military Forces of Russia (2000-) Air Force',
   'MISSING',
   'MISSING',
   'MISSING'),
  9769),
 (5216,
  ('Military Forces of Israel (2022-)',
   'MISSING',
   'Civilians (Palestine)',
   'MISSING'),
  8244),
 (4793,
  ('Protesters (Yemen)',
   'Government of Yemen (2017-) Houthi',
   'MISSING',
   'MISSING'),
  38396),
 (4661,
  ('Military Forces of Turkey (2016-)',
   'MISSING',
   'PKK: Kurdistan Workers Party',
   'MISSING'),
  10518),
 (3538,
  ('Military Forces of Ukraine (2019-) Air Force',
   'Military Forces of Ukraine (2019-)',
   'MISSING',
   'MISSING'),
  10752),
 (3242,
  ('Military Forces of Myanmar (2021-)',
   'MISSING',
   'Civilians (Myanmar)',
 

In [6]:
def get_story(i):
    temp = sorted_stories[i][1]
    # print(temp)
    return df.iloc[duplicate_groups[temp].tolist()]

In [7]:
get_story(1)

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,tags,timestamp,population_best
132,LBN28153,31 October 2024,2024,1,Political violence,Explosions/Remote violence,Air/drone strike,Military Forces of Israel (2022-),,External/Other forces,...,33.4869,35.5188,1,National News Agency Lebanon; Telegram,New media-National,"On 31 October 2024, Israeli warplanes carried ...",0,,1730764851,3200.0
134,LBN28155,31 October 2024,2024,1,Political violence,Explosions/Remote violence,Air/drone strike,Military Forces of Israel (2022-),,External/Other forces,...,33.3642,35.4322,1,National News Agency Lebanon; Telegram,New media-National,"On 31 October 2024, Israeli warplanes carried ...",0,,1730764851,10893.0
135,LBN28156,31 October 2024,2024,1,Political violence,Explosions/Remote violence,Shelling/artillery/missile attack,Military Forces of Israel (2022-),,External/Other forces,...,33.3347,35.5709,2,National News Agency Lebanon,National,"On 31 October 2024, the Israeli military fired...",0,,1730764851,5663.0
136,LBN28157,31 October 2024,2024,1,Political violence,Explosions/Remote violence,Shelling/artillery/missile attack,Military Forces of Israel (2022-),,External/Other forces,...,33.3215,35.5632,2,National News Agency Lebanon; Telegram,New media-National,"On 31 October 2024, the Israeli military fired...",0,,1730764851,1007.0
137,LBN28158,31 October 2024,2024,1,Political violence,Explosions/Remote violence,Air/drone strike,Military Forces of Israel (2022-),,External/Other forces,...,33.3477,35.4183,2,National News Agency Lebanon,National,"On 31 October 2024, Israeli warplanes carried ...",0,,1730764851,3915.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372484,LBN15692,01 November 2023,2023,1,Political violence,Explosions/Remote violence,Air/drone strike,Military Forces of Israel (2022-),,External/Other forces,...,33.1108,35.3103,1,El Nashra,National,"On 1 November 2023, Israeli fighter jets struc...",0,,1727907797,1394.0
372485,LBN15805,01 November 2023,2023,1,Political violence,Explosions/Remote violence,Shelling/artillery/missile attack,Military Forces of Israel (2022-),,External/Other forces,...,33.1400,35.5147,1,An-Nahar,National,"On 1 November 2023, Israeli military forces sh...",0,,1727907797,1634.0
372486,LBN15806,01 November 2023,2023,1,Political violence,Explosions/Remote violence,Shelling/artillery/missile attack,Military Forces of Israel (2022-),,External/Other forces,...,33.1108,35.3103,1,AlManar TV; Twitter,New media-National,"On 1 November 2023, Israeli military forces sh...",0,,1727907797,1394.0
372487,LBN15807,01 November 2023,2023,1,Political violence,Explosions/Remote violence,Shelling/artillery/missile attack,Military Forces of Israel (2022-),,External/Other forces,...,33.1064,35.2324,2,National News Agency Lebanon,National,"On 1 November 2023, Israeli military forces sh...",0,,1727907797,1847.0


In [8]:
test_sequence = embed_story(get_story(0))
test_sequence.shape

torch.Size([366, 3])

In [46]:
import math

def create_sinusoidal_positional_encoding(max_seq_len, d_model):
    position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)  # [max_seq_len, 1]
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # [d_model // 2]

    positional_encoding = torch.zeros(max_seq_len, d_model)
    positional_encoding[:, 0::2] = torch.sin(position * div_term)  # Even indices
    positional_encoding[:, 1::2] = torch.cos(position * div_term)  # Odd indices

    return positional_encoding

In [109]:
class DayLevelTransformerModel(nn.Module):
    def __init__(self, d_model, num_heads=3, num_layers=6, max_seq_len=512):
        super(DayLevelTransformerModel, self).__init__()
        
        self.in_dim = 3
        self.d_model = d_model
        self.max_seq_len = max_seq_len
        

        self.positional_encoding = create_sinusoidal_positional_encoding(max_seq_len, self.d_model)

        self.fc_in = nn.Linear(self.in_dim, self.d_model)

 
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=num_layers)
        self.transformer_decoder_layer = nn.TransformerDecoderLayer(d_model=self.d_model, nhead=num_heads)
        self.transformer_decoder = nn.TransformerDecoder(self.transformer_decoder_layer, num_layers=num_layers)


        self.fc_out = nn.Linear(self.d_model, self.in_dim) 

    def forward(self, x, y):

        x = self.fc_in(x) + self.positional_encoding
        y = self.fc_in(y) + self.positional_encoding
        
        memory = self.transformer_encoder(x) 
        output = self.transformer_decoder(y, memory)
        

        final = self.fc_out(output)
        return final

    def generate(self, x, max_length=20):
      
        generated = x 
        
        for _ in range(max_length):
            output = self.forward(generated, generated)  
            next_token = output[-1, :, :].argmax(dim=-1)  
            
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=0)  
            
        return generated

In [17]:
sequences = [
    embed_story(get_story(i)).float() for i in tqdm.tqdm(range(1000))
]

100%|██████████| 1000/1000 [00:55<00:00, 18.16it/s]


In [100]:
train_sequences = []

for i in tqdm.tqdm(range(1000)):
    story = get_story(i)
    temp = story[pd.to_datetime(story['event_date'], format='%d %B %Y') < pd.to_datetime('01 October 2024', format='%d %B %Y')]
    if len(temp) > 0:
        train_sequences.append(embed_story(temp))

100%|██████████| 1000/1000 [00:44<00:00, 22.51it/s]


In [101]:
train_sequences[9].sum(axis=0)

tensor([1.8457e+02, 3.3500e+02, 3.0000e-02])

In [110]:
seq_len = 10 
batch_size = 4

max_len = max(seq.size(0) for seq in sequences)
model = DayLevelTransformerModel(12, max_seq_len=max_len-1)

padded_sequences = []
for seq in sequences:
    padded_seq = F.pad(seq, (0, 0, 0, max_len - seq.size(0))) 
    padded_sequences.append(padded_seq)

stacked = torch.stack(padded_sequences)

batches = [ stacked[i*batch_size:(i+1)*batch_size] for i in range(len(stacked) // batch_size) ]


criterion = nn.MSELoss()  
optimizer = optim.Adam(model.parameters())

model.train()
for epoch in range(10): 
    losses = []
    for batch in tqdm.tqdm(batches):
        optimizer.zero_grad()
        
        input_sequence = batch[:, :-1]
        target_sequence = batch[:, 1:]

        logits = model(input_sequence, target_sequence)
        
        loss = criterion(logits, target_sequence)
        losses.append(loss.item())
        
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch+1}, Loss: {sum(losses)/len(losses)}")

 10%|█         | 25/250 [00:14<02:06,  1.78it/s]