In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

### The GTD dataset is too large for GitHub: download it separately and place it in this folder as `gtd.csv`.

In [None]:
# Read the raw GTD export with the ISO-8859-1 encoding and show its dimensions.
data_path = 'gtd.csv'
data = pd.read_csv(
    data_path,
    encoding='ISO-8859-1',
)
data.shape

(181691, 135)

In [3]:
# Keep only rows with a named group and a fully specified date
# (year, month and day are all non-zero).
data = data.loc[
    data['gname'].ne('Unknown')
    & data['iyear'].ne(0)
    & data['imonth'].ne(0)
    & data['iday'].ne(0)
]

data.shape

(98343, 135)

In [4]:
# Discard rows whose primary weapon type is recorded as 'Unknown'.
data = data.loc[data['weaptype1_txt'].ne('Unknown')]
data.shape

(87500, 135)

In [5]:
# Missing-value count for each of the four weapon-type text columns.
for weapon_slot in range(1, 5):
    print(data[f'weaptype{weapon_slot}_txt'].isnull().sum())

0
77923
85982
87438


Weapon types 2–4 are missing for most rows, so we should drop these features.

In [6]:
#pd.set_option('display.max_rows', None)

#data.isnull().sum()


In [7]:
# Drop every column that has fewer non-NaN values than the threshold below.
threshold = len(data) * 0.9  # Keep columns with at least 90% non-NaN values
data_cleaned = data.dropna(axis=1, thresh=threshold)
data_cleaned.shape

(87500, 46)

In [8]:
#data_cleaned.isnull().sum()

In [9]:
# Columns removed before modelling: the crit*/doubtterr flags, the event id,
# the *_txt label columns, the INT_* flags and guncertain1.
# NOTE(review): the *_txt columns are presumably text duplicates of the numeric
# code columns that are kept — verify against the GTD codebook.
columns_to_remove = [
    'crit1', 'crit2', 'crit3', 'doubtterr', 'eventid',
    'attacktype1_txt', 'targtype1_txt', 'targsubtype1_txt',
    'natlty1_txt', 'weaptype1_txt', 'weapsubtype1_txt',
    'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY',
    'country_txt', 'region_txt', 'guncertain1',
]
data_cleaned = data_cleaned.drop(columns=columns_to_remove)

In [10]:
# Show (rows, columns) of the frame after the column drop.
print(data_cleaned.shape)

(87500, 28)


In [11]:
# Lift the display row limit (a global pandas option) so every column is listed.
pd.set_option('display.max_rows', None)

# Number of missing values per remaining column.
data_cleaned.isnull().sum()

iyear              0
imonth             0
iday               0
extended           0
country            0
region             0
provstate        157
city              75
latitude        2621
longitude       2621
specificity        3
vicinity           0
multiple           1
success            0
suicide            0
attacktype1        0
targtype1          0
targsubtype1    3771
target1          356
natlty1          549
gname              0
individual         0
weaptype1          0
weapsubtype1    3341
nkill           5271
property           0
ishostkid        142
dbsource           0
dtype: int64

In [12]:
# Remove every row that lacks a value in any of these location/target fields.
required_fields = [
    'latitude', 'longitude', 'provstate', 'city',
    'multiple', 'target1', 'natlty1', 'ishostkid',
]
data_cleaned = data_cleaned.dropna(subset=required_fields)


In [13]:
# Same global display option as set earlier; repeated so this cell also works on its own.
pd.set_option('display.max_rows', None)

# Re-check the missing-value counts after dropping incomplete rows.
data_cleaned.isnull().sum()

iyear              0
imonth             0
iday               0
extended           0
country            0
region             0
provstate          0
city               0
latitude           0
longitude          0
specificity        0
vicinity           0
multiple           0
success            0
suicide            0
attacktype1        0
targtype1          0
targsubtype1    3188
target1            0
natlty1            0
gname              0
individual         0
weaptype1          0
weapsubtype1    3169
nkill           4892
property           0
ishostkid          0
dbsource           0
dtype: int64

In [14]:
# targsubtype1 and weapsubtype1 still contain missing values at this point;
# they are dropped together with dbsource.
data_cleaned = data_cleaned.drop(
    columns=[
        'targsubtype1',
        'weapsubtype1',
        'dbsource',
    ]
)

In [15]:
# Missing values per column after removing the sub-type columns.
data_cleaned.isnull().sum()

iyear             0
imonth            0
iday              0
extended          0
country           0
region            0
provstate         0
city              0
latitude          0
longitude         0
specificity       0
vicinity          0
multiple          0
success           0
suicide           0
attacktype1       0
targtype1         0
target1           0
natlty1           0
gname             0
individual        0
weaptype1         0
nkill          4892
property          0
ishostkid         0
dtype: int64

In [16]:
# Fill missing fatality counts with the column median.
# NOTE(review): the median is taken over all rows, i.e. before any
# train/test split is made further down.
nkill_median = data_cleaned['nkill'].median()
data_cleaned = data_cleaned.assign(nkill=data_cleaned['nkill'].fillna(nkill_median))


In [17]:
# Final (rows, columns) of the cleaned frame.
data_cleaned.shape

(83654, 25)

In [18]:
# Persist the cleaned frame next to the notebook, without the index column.
data_cleaned.to_csv(
    'cleaned_gtd.csv',
    index=False,
)


# Partition 1

In [19]:
# Rank groups by number of attacks and keep the five most active ones.
# NOTE(review): the ranking uses years <= 1980 while the partition itself is
# limited to <= 1979 — confirm that the extra year in the ranking is intended.
ranking_mask = data_cleaned['iyear'] <= 1980
groups = (
    data_cleaned[ranking_mask]
    .groupby(['iyear', 'gname'])
    .size()
    .reset_index(name='count')
)
attacks_per_group = groups.groupby('gname')['count'].sum()
top5 = attacks_per_group.nlargest(5).index
partition1 = data_cleaned[data_cleaned['iyear'] <= 1979]

In [20]:
# Restrict to the top-ranked groups, then remove two of them by name.
excluded_groups = ['Palestinians', 'Basque Fatherland and Freedom (ETA)']
in_top5 = partition1['gname'].isin(top5)
is_excluded = partition1['gname'].isin(excluded_groups)
partition1 = partition1[in_top5 & ~is_excluded]

partition1['gname'].value_counts()

gname
Irish Republican Army (IRA)     987
Protestant extremists           198
Ulster Volunteer Force (UVF)    167
Name: count, dtype: int64

In [21]:
# Training rows for partition 1: attacks up to and including 1975.
# .copy() gives trainp1 its own data: its text columns are overwritten in place
# later on, and writing into a slice of partition1 triggers pandas'
# SettingWithCopy warning (hidden here by the global warnings filter).
trainp1 = partition1[(partition1['iyear'] <= 1975)].copy()
len(trainp1)

783

In [22]:
# Test rows for partition 1: attacks after 1975.
testp1 = partition1.loc[partition1['iyear'].gt(1975)]
len(testp1)

569

In [23]:
# Reduce the share of the dominant group (IRA) in the partition-1 test set.
# NOTE: the variable names mention ETA, but only IRA rows are selected here.
eta_ira_df = testp1[testp1['gname'].isin(['Irish Republican Army (IRA)'])]
non_eta_ira_df = testp1[~testp1['gname'].isin(['Irish Republican Army (IRA)'])]

# Fraction of every multi-row IRA stratum that is kept.
downscale_factor = 0.35  # Adjust as needed

# Stratified sampling: within each (weapon type, attack type, city) stratum keep
# `downscale_factor` of the rows; single-row strata are kept unchanged, so the
# overall kept share ends up above `downscale_factor`.
# The groups are iterated explicitly instead of using groupby(...).apply(...),
# whose handling of the grouping columns is deprecated in recent pandas.
sampled_eta_ira_df = pd.concat(
    [
        stratum.sample(frac=downscale_factor, random_state=42) if len(stratum) > 1 else stratum
        for _, stratum in eta_ira_df.groupby(['weaptype1', 'attacktype1', 'city'])
    ],
    ignore_index=True,
)

# Combine the reduced IRA rows with the untouched rows of the other groups.
downscaled_test_df = pd.concat([non_eta_ira_df, sampled_eta_ira_df], ignore_index=True)

# Verify the new class distribution and size.
print(downscaled_test_df['gname'].value_counts(normalize=True))  # Check proportion
print(len(downscaled_test_df))


gname
Irish Republican Army (IRA)     0.713018
Protestant extremists           0.153846
Ulster Volunteer Force (UVF)    0.133136
Name: proportion, dtype: float64
338


In [24]:
# Share of the downscaled test set in train + downscaled test combined.
print(len(downscaled_test_df) / (len(trainp1) + len(downscaled_test_df)))

0.3015165031222123


In [25]:
# Per-group row counts: downscaled test set (printed) and training set (cell output).
#testp1 = downscaled_test_df
print(downscaled_test_df['gname'].value_counts())
trainp1['gname'].value_counts()

gname
Irish Republican Army (IRA)     241
Protestant extremists            52
Ulster Volunteer Force (UVF)     45
Name: count, dtype: int64


gname
Irish Republican Army (IRA)     515
Protestant extremists           146
Ulster Volunteer Force (UVF)    122
Name: count, dtype: int64

In [26]:
# From here on the downscaled frame is the partition-1 test set.
testp1 = downscaled_test_df

In [27]:
def fit_factorize(df, text_features):
    """Integer-encode the given columns and remember each column's categories.

    Works on a copy, so the frame passed in is left untouched (the previous
    version overwrote the caller's columns in place, which also wrote into
    slices of a parent frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    text_features : iterable of str
        Names of the columns to encode with ``pd.factorize``.

    Returns
    -------
    tuple
        ``(encoded, mappings)`` where ``encoded`` is a copy of ``df`` with the
        listed columns replaced by integer codes (missing values become -1, as
        produced by ``pd.factorize``) and ``mappings`` maps each column name to
        the uniques returned by ``pd.factorize``; a value's code is its
        position in that sequence.
    """
    encoded = df.copy()
    mappings = {}
    for col in text_features:
        codes, uniques = pd.factorize(encoded[col])
        encoded[col] = codes
        mappings[col] = uniques
    return encoded, mappings

def apply_factorize(df, mappings):
    """Encode columns of ``df`` with category orders learned by ``fit_factorize``.

    Works on a copy, so the frame passed in is left untouched, and looks all
    values up in one vectorised call instead of one ``get_loc`` per element.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; must contain every column named in ``mappings``.
    mappings : dict
        Column name -> sequence of unique values, where a value's position is
        its integer code.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each mapped column holds the position of its
        value in the corresponding uniques, or -1 for values not present there.
    """
    encoded = df.copy()
    for col, uniques in mappings.items():
        # get_indexer returns -1 for every value that is absent from `uniques`.
        encoded[col] = pd.Index(uniques).get_indexer(encoded[col])
    return encoded


In [28]:
# Every object-dtype column of the training set is integer-encoded; the
# mapping learned on the training set is then reused for the test set.
# Despite its name, gname_mappings holds one entry per encoded column.
text_features = trainp1.select_dtypes(include=['object']).columns

(trainp1, gname_mappings) = fit_factorize(df=trainp1, text_features=text_features)
testp1 = apply_factorize(df=testp1, mappings=gname_mappings)

In [29]:
# Class counts of the encoded training labels.
trainp1['gname'].value_counts()

gname
0    515
2    146
1    122
Name: count, dtype: int64

In [30]:
# Class counts of the encoded test labels.
testp1['gname'].value_counts()

gname
0    241
2     52
1     45
Name: count, dtype: int64

In [31]:
# Write the partition-1 train and test sets without the index column.
for frame, csv_path in (
    (trainp1, '../Codes/CleanPartitions/trainp1.csv'),
    (testp1, '../Codes/CleanPartitions/testp1.csv'),
):
    frame.to_csv(csv_path, index=False)

# Partition 2

In [32]:
# Partition 2 covers attacks from 1980 through 1995 (both inclusive).
# .copy() gives the partition its own data, so adding the attack_date column
# does not write into a slice of data_cleaned (SettingWithCopy).
partition2 = data_cleaned[(data_cleaned['iyear'] >= 1980) & (data_cleaned['iyear'] <= 1995)].copy()
partition2['attack_date'] = pd.to_datetime({'year': partition2['iyear'], 'month': partition2['imonth'], 'day': partition2['iday']})

# Rank the groups by their total number of attacks inside the partition.
groups = partition2.groupby(['iyear', 'gname']).size().reset_index(name='count')
# NOTE: despite the variable name, only the 4 most active groups are kept here.
top5 = groups.groupby('gname')['count'].sum().nlargest(4).index
top5

Index(['Shining Path (SL)', 'Farabundo Marti National Liberation Front (FMLN)',
       'Irish Republican Army (IRA)', 'Basque Fatherland and Freedom (ETA)'],
      dtype='object', name='gname')

In [33]:
# Keep only the attacks attributed to the selected groups.
partition2 = partition2.loc[partition2['gname'].isin(top5)]

In [34]:
# Row position that marks 70% of the partition.
trainsizep2 = int(0.7 * partition2.shape[0])
trainsizep2

6645

In [35]:
# Attack date of the row sitting at the 70% position.
# NOTE(review): partition2 is not explicitly sorted by attack_date here —
# confirm that its row order is chronological.
print(partition2['attack_date'].iloc[trainsizep2])

1989-09-05 00:00:00


In [36]:
# Cut-off date = attack date of the row at the 70% position. It is computed
# rather than hard-coded so it stays correct if the data changes (with the
# current data it is 1989-09-05).
traindatep2 = partition2.iloc[trainsizep2]['attack_date']
# Rows on or before the cut-off train the model, later rows test it. The
# copies keep the later in-place encoding from writing into slices of partition2.
trainp2 = partition2[partition2['attack_date'] <= traindatep2].copy()
testp2 = partition2[partition2['attack_date'] > traindatep2].copy()

In [37]:
# Fractions of partition 2 that ended up in the train and test sets.
n_rows_p2 = len(partition2)
print('Train %: ', len(trainp2) / n_rows_p2)
print('Test %: ', len(testp2) / n_rows_p2)

Train %:  0.7001263956182853
Test %:  0.2998736043817148


In [38]:
# Encode every object-dtype column; the mapping learned on the training set
# is reused for the test set.
text_features = trainp2.select_dtypes(include=['object']).columns

(trainp2, gname_mappings) = fit_factorize(df=trainp2, text_features=text_features)
testp2 = apply_factorize(df=testp2, mappings=gname_mappings)

In [39]:
# Write the partition-2 train and test sets without the index column.
for frame, csv_path in (
    (trainp2, '../Codes/CleanPartitions/trainp2.csv'),
    (testp2, '../Codes/CleanPartitions/testp2.csv'),
):
    frame.to_csv(csv_path, index=False)

# Partition 3

In [40]:
# Partition 3 covers attacks from 1996 through 2010 (both inclusive).
# .copy() gives the partition its own data, so adding the attack_date column
# does not write into a slice of data_cleaned (SettingWithCopy).
partition3 = data_cleaned[(data_cleaned['iyear'] >= 1996) & (data_cleaned['iyear'] <= 2010)].copy()
partition3['attack_date'] = pd.to_datetime({'year': partition3['iyear'], 'month': partition3['imonth'], 'day': partition3['iday']})

# Rank the groups by their total number of attacks inside the partition
# and keep the five most active ones.
groups = partition3.groupby(['iyear', 'gname']).size().reset_index(name='count')
top5 = groups.groupby('gname')['count'].sum().nlargest(5).index
top5

Index(['Taliban', 'Communist Party of India - Maoist (CPI-Maoist)',
       'Revolutionary Armed Forces of Colombia (FARC)',
       'Liberation Tigers of Tamil Eelam (LTTE)',
       'Tehrik-i-Taliban Pakistan (TTP)'],
      dtype='object', name='gname')

In [41]:
# Restrict to the top-ranked groups, then remove three of them by name.
excluded_groups_p3 = [
    'Communist Party of India - Maoist (CPI-Maoist)',
    'Taliban',
    'Tehrik-i-Taliban Pakistan (TTP)',
]
in_top5_p3 = partition3['gname'].isin(top5)
is_excluded_p3 = partition3['gname'].isin(excluded_groups_p3)
partition3 = partition3[in_top5_p3 & ~is_excluded_p3]

In [42]:
# Row position that marks 70% of the partition, and the attack date found there.
# NOTE(review): partition3 is not explicitly sorted by attack_date here —
# confirm that its row order is chronological.
trainsizep3 = int(0.7 * partition3.shape[0])
print(partition3['attack_date'].iloc[trainsizep3])

2007-05-12 00:00:00


In [43]:
# Cut-off date = attack date of the row at the 70% position. It is computed
# rather than hard-coded so it stays correct if the data changes (with the
# current data it is 2007-05-12).
traindatep3 = partition3.iloc[trainsizep3]['attack_date']
# Rows on or before the cut-off train the model, later rows test it. The
# copies keep the later in-place encoding from writing into slices of partition3.
trainp3 = partition3[partition3['attack_date'] <= traindatep3].copy()
testp3 = partition3[partition3['attack_date'] > traindatep3].copy()

In [44]:
# Fractions of partition 3 that ended up in the train and test sets.
n_rows_p3 = len(partition3)
print('Train %: ', len(trainp3) / n_rows_p3)
print('Test %: ', len(testp3) / n_rows_p3)

Train %:  0.7012557832121613
Test %:  0.29874421678783875


In [45]:
# Encode every object-dtype column; the mapping learned on the training set
# is reused for the test set.
text_features = trainp3.select_dtypes(include=['object']).columns

(trainp3, gname_mappings) = fit_factorize(df=trainp3, text_features=text_features)
testp3 = apply_factorize(df=testp3, mappings=gname_mappings)

In [46]:
# Write the partition-3 train and test sets without the index column.
for frame, csv_path in (
    (trainp3, '../Codes/CleanPartitions/trainp3.csv'),
    (testp3, '../Codes/CleanPartitions/testp3.csv'),
):
    frame.to_csv(csv_path, index=False)

# Partition 4

In [47]:
# Partition 4 covers attacks after 2010.
# .copy() gives the partition its own data, so adding the attack_date column
# does not write into a slice of data_cleaned (SettingWithCopy).
partition4 = data_cleaned[data_cleaned['iyear'] > 2010].copy()
# Build the date from the partition's own columns (as partitions 2 and 3 do)
# instead of from all of data_cleaned and relying on index alignment.
partition4['attack_date'] = pd.to_datetime({'year': partition4['iyear'], 'month': partition4['imonth'], 'day': partition4['iday']})
groups = partition4.groupby(['iyear', 'gname']).size().reset_index(name='count')
top5 = groups.groupby('gname')['count'].sum().nlargest(5).index
# Removes the second-ranked group by position, leaving four groups.
# NOTE(review): positional removal silently targets a different group if the
# ranking changes — consider excluding the group by name.
top5 = top5.delete(1)
top5

Index(['Taliban', 'Al-Shabaab', 'Boko Haram', 'New People's Army (NPA)'], dtype='object', name='gname')

In [48]:
# Keep only the attacks attributed to the selected groups.
partition4 = partition4.loc[partition4['gname'].isin(top5)]

In [49]:
def downscaling(dataframe, group, downscale_factor,
                strata=('weaptype1', 'attacktype1', 'city'), random_state=42):
    """Reduce the number of rows of one group by stratified random sampling.

    Rows whose ``gname`` equals ``group`` are split into strata (one per
    combination of the ``strata`` columns). From every stratum with more than
    one row a fraction ``downscale_factor`` is sampled; single-row strata are
    kept unchanged, so the kept share of the group is usually somewhat above
    ``downscale_factor``. Rows of all other groups are returned untouched.

    The strata are iterated explicitly rather than through
    ``groupby(...).apply(...)``, whose handling of the grouping columns is
    deprecated in recent pandas, so the strata columns are always preserved.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``gname`` column and every column listed in ``strata``.
    group : str
        Value of ``gname`` whose rows are downscaled.
    downscale_factor : float
        Fraction passed to ``DataFrame.sample(frac=...)`` for each stratum.
    strata : sequence of str, optional
        Columns that define the strata (default: weapon type, attack type, city).
    random_state : int, optional
        Seed used for every stratum, making the result reproducible.

    Returns
    -------
    pandas.DataFrame
        The other groups' rows followed by the sampled rows of ``group``, with
        a fresh 0..n-1 index. ``dataframe`` itself is not modified.

    Notes
    -----
    With pandas' default ``groupby`` settings, rows of ``group`` that have a
    missing value in one of the strata columns fall into no stratum and are
    therefore not returned.
    """
    is_target = dataframe['gname'].isin([group])
    target_rows = dataframe[is_target]
    other_rows = dataframe[~is_target]

    sampled_parts = [
        stratum.sample(frac=downscale_factor, random_state=random_state)
        if len(stratum) > 1 else stratum
        for _, stratum in target_rows.groupby(list(strata))
    ]
    if sampled_parts:
        sampled_rows = pd.concat(sampled_parts, ignore_index=True)
    else:
        # No rows for this group: nothing to sample, keep an empty frame
        # with the right columns so the final concat still works.
        sampled_rows = target_rows.iloc[0:0]

    # Other groups first, then the reduced target group.
    return pd.concat([other_rows, sampled_rows], ignore_index=True)

# Downscale three of the groups one after another, each with its own factor.
downscaled_df = downscaling(dataframe=partition4, group='Taliban', downscale_factor=0.15)
downscaled_df_2 = downscaling(dataframe=downscaled_df, group='Boko Haram', downscale_factor=0.55)
downscaled_df_3 = downscaling(dataframe=downscaled_df_2, group='Al-Shabaab', downscale_factor=0.5)

# Resulting class proportions and total row count.
class_shares_p4 = downscaled_df_3['gname'].value_counts(normalize=True)
print(class_shares_p4)  # Check proportion
#print(len(downscaled_df_3[downscaled_df_3['gname']==group]))
#total = counts(downscaled_df_3, top5)
print('Total after downscaling: ', len(downscaled_df_3))

gname
Taliban                    0.314552
Al-Shabaab                 0.240647
Boko Haram                 0.239426
New People's Army (NPA)    0.205375
Name: proportion, dtype: float64
Total after downscaling:  6549


In [50]:
# From here on the downscaled frame is partition 4.
partition4 = downscaled_df_3

In [51]:
# Order the partition chronologically, then read the attack date at the
# 70% row position; that date is the train/test cut-off.
partition4 = (
    partition4
    .assign(attack_date=pd.to_datetime(partition4['attack_date']))
    .sort_values(by='attack_date')
)
trainsizep4 = int(0.7 * partition4.shape[0])
traindatep4 = partition4['attack_date'].iloc[trainsizep4]
print(traindatep4)

2016-01-30 00:00:00


In [52]:
# Split on the computed cut-off (traindatep4) instead of a literal copied from
# the printed output, so the split follows the data (currently 2016-01-30).
# The copies keep the later in-place encoding from writing into slices of partition4.
trainp4 = partition4[partition4['attack_date'] <= traindatep4].copy()
testp4 = partition4[partition4['attack_date'] > traindatep4].copy()

In [53]:
# Fractions of partition 4 that ended up in the train and test sets.
n_rows_p4 = len(partition4)
print(len(trainp4) / n_rows_p4)
print(len(testp4) / n_rows_p4)

0.7001068865475645
0.29989311345243547


In [54]:
# Encode every object-dtype column; the mapping learned on the training set
# is reused for the test set.
text_features = trainp4.select_dtypes(include=['object']).columns

(trainp4, gname_mappings) = fit_factorize(df=trainp4, text_features=text_features)
testp4 = apply_factorize(df=testp4, mappings=gname_mappings)

In [55]:
# Write the partition-4 train and test sets without the index column.
for frame, csv_path in (
    (trainp4, '../Codes/CleanPartitions/trainp4.csv'),
    (testp4, '../Codes/CleanPartitions/testp4.csv'),
):
    frame.to_csv(csv_path, index=False)