In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [2]:
# Location of the cleaned dataset, relative to this notebook.
path = '../cleaned_gtd.csv'

# The file is read as ISO-8859-1 (Latin-1) text rather than pandas' default UTF-8.
data_cleaned = pd.read_csv(
    filepath_or_buffer=path,
    encoding='ISO-8859-1',
)

### Sort by perpetrator group and attack date before splitting

In [3]:
# Order the rows by group name and, within each group, by attack date.
# The date is assembled from the year/month/day columns only to serve as a
# sort key; it is removed again so the column set is unchanged.
data_cleaned = (
    data_cleaned
    .assign(
        attack_date=pd.to_datetime({
            'year': data_cleaned['iyear'],
            'month': data_cleaned['imonth'],
            'day': data_cleaned['iday'],
        })
    )
    .sort_values(by=['gname', 'attack_date'])
    .drop(columns=['attack_date'])
)
data_cleaned.columns

Index(['iyear', 'imonth', 'iday', 'extended', 'country', 'region', 'provstate',
       'city', 'latitude', 'longitude', 'specificity', 'vicinity', 'multiple',
       'success', 'suicide', 'attacktype1', 'targtype1', 'target1', 'natlty1',
       'gname', 'individual', 'weaptype1', 'nkill', 'property', 'ishostkid'],
      dtype='object')

In [4]:
def handle_leakage(df, train_frac=0.7, random_state=None):
    """Split ``df`` into train and test sets group by group.

    Rows are grouped on the ``gname`` column. Within every group the first
    ``train_frac`` share of rows (by position, rounded down) goes to the
    training set and the remaining rows go to the test set. The split is
    purely positional: the function does not sort, so the caller must pass
    the rows of each group in the order the split should respect (e.g.
    chronological).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a ``gname`` column.
    train_frac : float, default 0.7
        Fraction of each group's rows assigned to the training set.
    random_state : int, numpy RandomState/Generator or None, default None
        Seed forwarded to ``DataFrame.sample`` for the final row shuffle.
        ``None`` keeps the shuffle non-deterministic; pass a value to get
        reproducible output.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Shuffled train and test frames. Original index labels are kept.

    Side effects
    ------------
    Prints the number of training rows.
    """
    train_frames = []
    test_frames = []

    for _, group_data in df.groupby('gname'):
        # int() truncates, so a group too small to reach one training row
        # ends up entirely in the test set.
        split_point = int(len(group_data) * train_frac)
        train_frames.append(group_data.iloc[:split_point])
        test_frames.append(group_data.iloc[split_point:])

    if train_frames:
        # Concatenate all the group-specific splits into the final frames.
        train_df = pd.concat(train_frames)
        test_df = pd.concat(test_frames)
    else:
        # pd.concat raises on an empty list; an empty input simply yields
        # two empty frames with the same columns.
        train_df = df.iloc[:0].copy()
        test_df = df.iloc[:0].copy()

    # Shuffle each frame separately; sample(frac=1) returns every row in
    # random order and keeps the index labels attached to their rows.
    train_df = train_df.sample(frac=1, random_state=random_state)
    test_df = test_df.sample(frac=1, random_state=random_state)

    print(len(train_df))

    return train_df, test_df

In [5]:
sample_sizes = [100, 200, 300, 478]

# The 30 most frequent groups do not depend on the sample size, so they are
# selected once instead of on every iteration.
top_30_classes = data_cleaned['gname'].value_counts().head(30).index
top_30_all = data_cleaned[data_cleaned['gname'].isin(top_30_classes)]

for sample_size in sample_sizes:
    # Draw `sample_size` attacks from each of the top 30 groups.
    top_30_df = top_30_all.groupby('gname').sample(n=sample_size, random_state=42)

    # groupby(...).sample returns each group's rows in random draw order, which
    # discards the chronological ordering of the source frame. handle_leakage
    # splits purely by row position, so the date order has to be restored here
    # for "first 70%" to mean "earliest 70%".
    top_30_df = top_30_df.sort_values(by=['gname', 'iyear', 'imonth', 'iday'])

    features = top_30_df.drop(columns=['gname'])
    labels = top_30_df['gname']

    # Integer-encode every object-typed feature column; codes are assigned in
    # order of first appearance within this sample.
    for col in features.select_dtypes(include='object').columns:
        features[col] = pd.factorize(features[col])[0]

    top_30_encoded = pd.concat([features, labels], axis = 1)

    # Per-group train/test split (first part of each group -> train).
    train, test = handle_leakage(top_30_encoded)
    print(train.columns)

    # Sanity check: the two printed totals should match.
    combined = pd.concat([train, test])
    print('Combined ', len(combined))
    print('train + test ', len(train) + len(test))

    # Save to CSV. to_csv does not create missing directories, so the three
    # output folders must already exist.
    combined.to_csv(f'engineered_dfs/df_top30_{sample_size}.csv')
    train.to_csv(f'traindata/train{sample_size}.csv')
    test.to_csv(f'testdata/test{sample_size}.csv')

2100
Index(['iyear', 'imonth', 'iday', 'extended', 'country', 'region', 'provstate',
       'city', 'latitude', 'longitude', 'specificity', 'vicinity', 'multiple',
       'success', 'suicide', 'attacktype1', 'targtype1', 'target1', 'natlty1',
       'individual', 'weaptype1', 'nkill', 'property', 'ishostkid', 'gname'],
      dtype='object')
Combined  3000
train + test  3000
4200
Index(['iyear', 'imonth', 'iday', 'extended', 'country', 'region', 'provstate',
       'city', 'latitude', 'longitude', 'specificity', 'vicinity', 'multiple',
       'success', 'suicide', 'attacktype1', 'targtype1', 'target1', 'natlty1',
       'individual', 'weaptype1', 'nkill', 'property', 'ishostkid', 'gname'],
      dtype='object')
Combined  6000
train + test  6000
6300
Index(['iyear', 'imonth', 'iday', 'extended', 'country', 'region', 'provstate',
       'city', 'latitude', 'longitude', 'specificity', 'vicinity', 'multiple',
       'success', 'suicide', 'attacktype1', 'targtype1', 'target1', 'natlty1',
   