In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gtd/globalterrorismdb_0718dist.csv


In [2]:
# Importing other libraries 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas chained-assignment and deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Load the Global Terrorism Database CSV, decoding it as ISO-8859-1 (Latin-1).
# low_memory=False parses the file in one pass so column dtypes are not inferred
# chunk by chunk.
data = pd.read_csv(
    '/kaggle/input/gtd/globalterrorismdb_0718dist.csv',
    encoding='ISO-8859-1',
    low_memory=False,
)

In [4]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


In [5]:
# Row/column counts, dtype breakdown and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Columns: 135 entries, eventid to related
dtypes: float64(55), int64(22), object(58)
memory usage: 187.1+ MB


In [6]:
# (rows, columns) of the raw frame.
data.shape

(181691, 135)

### Missing Values

In [7]:
# Missing-value count per column, most incomplete columns first.
data.isnull().sum().sort_values(ascending = False)

gsubname3           181671
weapsubtype4_txt    181621
weapsubtype4        181621
weaptype4           181618
weaptype4_txt       181618
                     ...  
suicide                  0
success                  0
crit3                    0
property                 0
eventid                  0
Length: 135, dtype: int64

In [8]:
# There are many null and missing values in our dataset, but we cannot fill
# these values by assumption or reasoning because this is real-world data.

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns,
# transposed so that each column becomes a row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
eventid,181691.0,2.002705e+11,1.325957e+09,1.970000e+11,1.991021e+11,2.009022e+11,2.014081e+11,2.017123e+11
iyear,181691.0,2.002639e+03,1.325943e+01,1.970000e+03,1.991000e+03,2.009000e+03,2.014000e+03,2.017000e+03
imonth,181691.0,6.467277e+00,3.388303e+00,0.000000e+00,4.000000e+00,6.000000e+00,9.000000e+00,1.200000e+01
iday,181691.0,1.550564e+01,8.814045e+00,0.000000e+00,8.000000e+00,1.500000e+01,2.300000e+01,3.100000e+01
extended,181691.0,4.534622e-02,2.080629e-01,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00
...,...,...,...,...,...,...,...,...
nreleased,10400.0,-2.901827e+01,6.572012e+01,-9.900000e+01,-9.900000e+01,0.000000e+00,1.000000e+00,2.769000e+03
INT_LOG,181691.0,-4.543731e+00,4.543547e+00,-9.000000e+00,-9.000000e+00,-9.000000e+00,0.000000e+00,1.000000e+00
INT_IDEO,181691.0,-4.464398e+00,4.637152e+00,-9.000000e+00,-9.000000e+00,-9.000000e+00,0.000000e+00,1.000000e+00
INT_MISC,181691.0,9.000996e-02,5.684573e-01,-9.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00


Since the dataset has a lot of columns, we will check the significance of each one and drop all the columns that are not significant for our analysis.
 0   year            
 1   month           
 2   day             
 3   country         181691 non-null  object 
 4   region          181691 non-null  object 
 5   state           181270 non-null  object 
 6   location        55495 non-null   object 
 7   extended        181691 non-null  int64  
 8   success         181691 non-null  int64  
 9   suicide         181691 non-null  int64  
 10  attack_type    
 11  target_type     
 12  target_subtype  
 13  target          
 14  killed          
 15  wounded        
 16  weapon_type     
 17  motive         
 18  summary         
 19  group          
 20  city             
 21  latitude        
 22  longitude       


Essential Columns (20-25 Most Impactful)
1. Incident Basics
iyear (Year of attack)

imonth, iday (Month, Day)

country_txt (Country name)

region_txt (Region name)

latitude, longitude (Location coordinates)

2. Attack Characteristics
attacktype1_txt (Attack type: Bombing, Armed Assault, etc.)

targtype1_txt (Target type: Military, Civilians, Government, etc.)

weaptype1_txt (Weapon used: Explosives, Firearms, etc.)

suicide (1 = Suicide attack)

success (1 = Successful attack)

3. Perpetrator & Group Info
gname (Terrorist group responsible)

nperps (Number of perpetrators, if known)

claimed (1 = Group claimed responsibility)

4. Casualties & Impact
nkill (Total fatalities)

nwound (Total injured)

ishostkid (1 = Hostage/kidnapping involved)

property (1 = Property damage occurred)

5. Additional Context
INT_LOG (International logistics involved)

INT_IDEO (International ideological ties)

propextent (Scale of property damage: Minor, Major, Catastrophic)



In [10]:
# Keep only the 16 columns used in the rest of the notebook.
# An explicit keep-list is easier to audit than enumerating every column to discard,
# and selecting into a new frame (instead of dropping in place) means re-running this
# cell on the already-reduced frame does not raise a KeyError.
# The order follows the raw file, so downstream output is unchanged.
COLUMNS_TO_KEEP = [
    'iyear', 'imonth', 'extended',
    'country_txt', 'region_txt',
    'success', 'suicide',
    'attacktype1_txt', 'targtype1_txt',
    'gname', 'claimed', 'weaptype1_txt',
    'nkill', 'nwound',
    'property', 'ishostkid',
]
data = data[COLUMNS_TO_KEEP].copy()

In [11]:
# dtypes of the columns that remain after the column selection.
data.dtypes

iyear                int64
imonth               int64
extended             int64
country_txt         object
region_txt          object
success              int64
suicide              int64
attacktype1_txt     object
targtype1_txt       object
gname               object
claimed            float64
weaptype1_txt       object
nkill              float64
nwound             float64
property             int64
ishostkid          float64
dtype: object

In [12]:
# (rows, columns) after the column selection; the row count should be unchanged.
data.shape

(181691, 16)

In [13]:
# Missing-value count for each retained column.
data.isnull().sum()

iyear                  0
imonth                 0
extended               0
country_txt            0
region_txt             0
success                0
suicide                0
attacktype1_txt        0
targtype1_txt          0
gname                  0
claimed            66120
weaptype1_txt          0
nkill              10313
nwound             16311
property               0
ishostkid            178
dtype: int64

### Data Preprocessing 

##### Column Definitions

iyear : Year in which the incident occurred.
imonth : Month in which the incident occurred (1–12, 0 if unknown)
iday : Day of the month when the incident occurred (1–31, 0 if unknown)

extended column :- I am keeping this column because it tells us whether a terrorist incident lasted more than 24 hours.
The extended column is a binary flag that identifies whether a terrorist event continued beyond a single day. For example,
a hijacking or hostage situation that lasts several days would be marked as 1. This helps in analyzing the duration and complexity of attacks.
It may help predict gname, especially for groups that are known for long-duration or complex operations.

country column :- Most terrorist groups operate regionally. Terrorist groups usually have geographically concentrated areas of operation, based on:
Ethnic, political, or religious presence
Historical conflicts
Border accessibility
example: Taliban - Afghanistan, Pakistan

region: - It indicates the broad geographical region where the terrorist incident occurred, using a numeric code.
It is accompanied by region_txt, which is the human-readable version.
Why region Is Important When Predicting gname
Most terror groups are active in specific regions.
ISIS → Middle East
Al-Shabaab → Sub-Saharan Africa
Keep both country and region initially; after modeling we can identify whether
region plays any role or not.

success: - In the Global Terrorism Database (GTD), the success column indicates whether the terrorist attack achieved its intended outcome
how its useful?
Some groups like Al-Qaeda or ISIS may have higher success rates due to better planning or suicide missions, 
while newer or fragmented groups may fail more often.
Value	Meaning
1	Attack was successful
0	Attack was unsuccessful

suicide: In the Global Terrorism Database (GTD), the suicide column indicates whether the attack was a suicide attack
— meaning the perpetrator(s) intended to die as part of carrying out the attack
1	Yes – It was a suicide attack
0	No – It was not a suicide attack
why its important?
Some groups are known for suicide attacks (e.g., ISIS, Taliban)
Indicates willingness to sacrifice members
Models can learn which groups use suicide tactics regularly

attacktype1:  the column attacktype1 indicates the primary method used to carry out the terrorist attack.
why its important?
Different groups prefer different methods
Some groups specialize in suicide bombings, others in kidnappings
Taliban → Bombing/Explosion, Armed Assault

targtype1: the target-related columns describe who or what was attacked — essentially, the primary victim or target category
why its important?
Some groups focus on military, others on civilians
Each group has preferred targets

#claimed: the claimed column indicates whether the terrorist group publicly claimed responsibility for the attack.
#1	Yes – The attack was claimed
#0	No – The attack was not claimed
#-9	Unknown – It’s not known if it was claimed
#why its important?
#Some groups regularly claim attacks (e.g., ISIS, Taliban)
#others avoid claiming responsibility
#sometimes multiple groups claim the same attack

#weaptype1 
#the column weaptype1 refers to the primary weapon type used in a terrorist attack.
#ex: 	Biological,	Biological
#why it's important
#Group tactics differ by weapon

#nkill ; this column stands for Number of People Killed in a terrorist incident
#zero means No one was killed,A positive number means confirmed fatalities and NaN means Unknown or not reported by sources
#why this column is important for us?
#Different groups have different attack styles.
#Some groups tend to carry out high-fatality attacks (e.g., suicide bombings), while others may target infrastructure with minimal loss of life.
#nkill helps capture that pattern

#nwound : Number of People Wounded in a terrorist attack
#why its important?
#help identify severity of attacks along with nkill.
#Group behavior, some groups wound more than kill

#property: Indicates whether the incident resulted in property damage
#1 = Yes,0 = No,-9 = Unknown
#why its important: Some groups target infrastructure (e.g., oil pipelines, government buildings), while others may focus on casualties.
#helps in identifying Which groups pose the highest economic threat

#ishostkid: Indicates whether hostages were taken or a kidnapping occurred during the incident
#0 = no hostage, 1 = yes,-9 = unknown
#Hostage-taking is a distinct tactic, often used to:
#Gain leverage (ransom, political demands)
#Intimidate the population

In [14]:
# Give the retained GTD columns descriptive, analysis-friendly names.
# `gname` is deliberately left as is.
column_renames = {
    'iyear': 'year',
    'imonth': 'month',
    'extended': 'multi_day_attack',
    'country_txt': 'country',
    'region_txt': 'region',
    'success': 'attack_successful',
    'suicide': 'suicide_attack',
    'attacktype1_txt': 'attack_type',
    'targtype1_txt': 'primary_target_type',
    'claimed': 'responsibility_claimed',
    'weaptype1_txt': 'weapon_type',
    'nkill': 'num_killed',
    'nwound': 'num_wounded',
    'property': 'property_damage',
    'ishostkid': 'hostage_or_kidnapping',
}
data = data.rename(columns=column_renames)

In [15]:
# Confirm the new column names.
data.columns

Index(['year', 'month', 'multi_day_attack', 'country', 'region',
       'attack_successful', 'suicide_attack', 'attack_type',
       'primary_target_type', 'gname', 'responsibility_claimed', 'weapon_type',
       'num_killed', 'num_wounded', 'property_damage',
       'hostage_or_kidnapping'],
      dtype='object')

### Dealing with Target Column : Group Name

In [16]:
# Drop incidents whose perpetrator group is recorded as 'Unknown'.
# `.copy()` makes the filtered result an independent frame, so the column
# assignments in later cells write to `data` itself rather than to a slice of the
# unfiltered frame (which pandas flags with SettingWithCopyWarning).
data = data[data['gname'] != 'Unknown'].copy()

In [17]:
# Number of incidents attributed to each group, most frequent first.
group_counts = data['gname'].value_counts()

In [18]:
# The counts are sorted in descending order, so the first 20 index entries are the
# 20 most frequent group names.
top20_groups = group_counts.iloc[:20].index
print(top20_groups)

Index(['Taliban', 'Islamic State of Iraq and the Levant (ISIL)',
       'Shining Path (SL)', 'Farabundo Marti National Liberation Front (FMLN)',
       'Al-Shabaab', 'New People's Army (NPA)', 'Irish Republican Army (IRA)',
       'Revolutionary Armed Forces of Colombia (FARC)', 'Boko Haram',
       'Kurdistan Workers' Party (PKK)', 'Basque Fatherland and Freedom (ETA)',
       'Communist Party of India - Maoist (CPI-Maoist)', 'Maoists',
       'Liberation Tigers of Tamil Eelam (LTTE)',
       'National Liberation Army of Colombia (ELN)',
       'Tehrik-i-Taliban Pakistan (TTP)', 'Palestinians',
       'Houthi extremists (Ansar Allah)',
       'Al-Qaida in the Arabian Peninsula (AQAP)',
       'Nicaraguan Democratic Force (FDN)'],
      dtype='object', name='gname')


In [19]:
# Collapse every group that is not in top20_groups into a single 'other' class;
# the top-20 names are kept unchanged.
is_top_group = data['gname'].isin(top20_groups)
data['terrorist_group_mod'] = data['gname'].mask(~is_top_group, 'other')

In [20]:
# Encoder for the target labels; kept under its own name (`le`) so the
# label <-> code mapping can be inspected afterwards.
le = LabelEncoder()

In [21]:
# Fit the encoder on the collapsed group labels and store the resulting integer codes.
data['terrorist_group_encoded'] = le.fit_transform(data['terrorist_group_mod'])

In [22]:
# The integer-encoded target replaces both the raw and the collapsed group-name columns.
data = data.drop(columns=['gname', 'terrorist_group_mod'])

In [23]:
# Distinct target codes present after encoding.
data['terrorist_group_encoded'].unique()

array([20, 13, 15,  7,  2, 12, 16, 10, 17,  5, 11, 14,  9, 18,  0,  4,  6,
       19,  1,  3,  8])

In [24]:
# Show which integer code the target encoder assigned to each group label.
label_mapping = {
    group: code
    for group, code in zip(le.classes_, le.transform(le.classes_))
}
print(label_mapping)

{'Al-Qaida in the Arabian Peninsula (AQAP)': 0, 'Al-Shabaab': 1, 'Basque Fatherland and Freedom (ETA)': 2, 'Boko Haram': 3, 'Communist Party of India - Maoist (CPI-Maoist)': 4, 'Farabundo Marti National Liberation Front (FMLN)': 5, 'Houthi extremists (Ansar Allah)': 6, 'Irish Republican Army (IRA)': 7, 'Islamic State of Iraq and the Levant (ISIL)': 8, "Kurdistan Workers' Party (PKK)": 9, 'Liberation Tigers of Tamil Eelam (LTTE)': 10, 'Maoists': 11, 'National Liberation Army of Colombia (ELN)': 12, "New People's Army (NPA)": 13, 'Nicaraguan Democratic Force (FDN)': 14, 'Palestinians': 15, 'Revolutionary Armed Forces of Colombia (FARC)': 16, 'Shining Path (SL)': 17, 'Taliban': 18, 'Tehrik-i-Taliban Pakistan (TTP)': 19, 'other': 20}


### Missing Values Treatment

In [25]:
# Missing-value count per column after the 'Unknown' groups were removed.
data.isnull().sum()

year                           0
month                          0
multi_day_attack               0
country                        0
region                         0
attack_successful              0
suicide_attack                 0
attack_type                    0
primary_target_type            0
responsibility_claimed     42971
weapon_type                    0
num_killed                  7068
num_wounded                11819
property_damage                0
hostage_or_kidnapping        145
terrorist_group_encoded        0
dtype: int64

In [26]:
# responsibility_claimed: distinct values before imputation (NaN included).
data['responsibility_claimed'].unique()

array([nan,  0.,  1., -9.])

We will impute null values with -9, the code for "unknown". This combines NaN and -9 into one single "unknown" category.

In [27]:
# Fold missing values into the existing -9 ("unknown") code; other columns are untouched.
data = data.fillna({'responsibility_claimed': -9})

In [28]:
# After imputation: NaN should no longer appear among the distinct values.
data['responsibility_claimed'].unique()

array([-9.,  0.,  1.])

In [29]:
# Remaining missing values in responsibility_claimed.
data['responsibility_claimed'].isnull().sum()

0

In [30]:
# Number of rows with no recorded fatality count.
data['num_killed'].isnull().sum()

7068

In [31]:
# Share of all rows (in percent) taken by each num_killed value. Rows where
# num_killed is NaN count in the denominator but get no entry of their own.
data['num_killed'].value_counts()/data.shape[0]*100

num_killed
0.0       45.155648
1.0       17.940733
2.0        7.239988
3.0        4.604232
4.0        3.196878
            ...    
94.0       0.001011
136.0      0.001011
1180.0     0.001011
228.0      0.001011
311.0      0.001011
Name: count, Length: 200, dtype: float64

In [32]:
# Replace missing fatality counts with 0; other columns are untouched.
# NOTE(review): 0 is also the genuine "nobody killed" value, so unknown counts become
# indistinguishable from confirmed zeros — confirm this is acceptable for the model.
data = data.fillna({'num_killed': 0})

In [33]:
# After imputation: remaining missing values in num_killed.
data['num_killed'].isnull().sum()

0

**We fill missing values in the num_killed (number of people killed; `nkill` in the raw data) column with 0, assuming that if the value is missing,
it likely means no deaths were reported. This approach preserves the dataset size and avoids dropping important rows,
while maintaining a realistic assumption about the absence of data.**

#### Handling Missing values in num_wounded

In [34]:
# Number of rows with no recorded count of wounded people.
data['num_wounded'].isnull().sum()

11819

In [35]:
# Share of all rows (in percent) taken by each num_wounded value. Rows where
# num_wounded is NaN count in the denominator but get no entry of their own.
data['num_wounded'].value_counts()/data.shape[0]*100

num_wounded
0.0      57.706579
1.0       7.475558
2.0       4.793295
3.0       3.434470
4.0       2.294028
           ...    
650.0     0.001011
155.0     0.001011
542.0     0.001011
163.0     0.001011
127.0     0.001011
Name: count, Length: 224, dtype: float64

In [36]:
# Replace missing wounded counts with 0; other columns are untouched.
# NOTE(review): 0 is also the genuine "nobody wounded" value, so unknown counts become
# indistinguishable from confirmed zeros — confirm this is acceptable for the model.
data = data.fillna({'num_wounded': 0})

In [37]:
# After imputation: remaining missing values in num_wounded.
data['num_wounded'].isnull().sum()

0

We filled missing values in the num_wounded (number of people wounded) column with 0,
assuming that if the value is missing, it likely means no injuries were reported.
This approach preserves the dataset size and avoids dropping important rows,
while maintaining a realistic assumption about the absence of data.

#### Handling Missing Values in Hostage_or_Kidnapping

In [38]:
# hostage_or_kidnapping: distinct values before imputation (NaN included).
data['hostage_or_kidnapping'].unique()
# Nulls will be imputed with -9, the code for "unknown".

array([ 0.,  1., nan, -9.])

In [39]:
# Fold missing values into the existing -9 ("unknown") code; other columns are untouched.
data = data.fillna({'hostage_or_kidnapping': -9})

In [40]:
# After imputation: NaN should no longer appear among the distinct values.
data['hostage_or_kidnapping'].unique()

array([ 0.,  1., -9.])

We merged the null values with -9, which means unknown,
so the two are now a single '-9' (unknown) category.

#### Data After Missing Values Treatment

In [41]:
# Re-check the whole frame for any remaining missing values.
data.isnull().sum()
# All missing values have been treated.

year                       0
month                      0
multi_day_attack           0
country                    0
region                     0
attack_successful          0
suicide_attack             0
attack_type                0
primary_target_type        0
responsibility_claimed     0
weapon_type                0
num_killed                 0
num_wounded                0
property_damage            0
hostage_or_kidnapping      0
terrorist_group_encoded    0
dtype: int64

### Feature Encoding

In [42]:
#columns which are already encoded in our datasets are
#1.multi_day_attack
#2.attack_successful 
#3.suicide_attack
#4.responsibility_claimed
#5.property_damage
#6.hostage_or_kidnapping

In [43]:
# month: distinct values present in the column.
data['month'].unique()
# 1-12 are January to December;
# 0 means the month is unknown.

array([ 7,  0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12])

In [44]:
# country: distinct country names.
data['country'].unique()
# This column will be label-encoded.

array(['Dominican Republic', 'Mexico', 'United States', 'Uruguay',
       'East Germany (GDR)', 'Ethiopia', 'Venezuela', 'Philippines',
       'West Germany (FRG)', 'Switzerland', 'Italy', 'Spain', 'Guatemala',
       'Brazil', 'Argentina', 'Lebanon', 'Japan', 'Ireland', 'Jordan',
       'Turkey', 'Paraguay', 'Iran', 'United Kingdom', 'Bolivia',
       'Greece', 'Nicaragua', 'Netherlands', 'Belgium', 'Canada',
       'Australia', 'Zambia', 'Sweden', 'South Yemen', 'Poland', 'Egypt',
       'Kuwait', 'Austria', 'Czechoslovakia', 'India', 'Israel', 'France',
       'Brunei', 'Zaire', "People's Republic of the Congo", 'Cambodia',
       'Portugal', 'Algeria', 'Colombia', 'Thailand', 'Haiti', 'Sudan',
       'Cyprus', 'Myanmar', 'Afghanistan', 'Yugoslavia', 'Singapore',
       'Pakistan', 'Jamaica', 'Chad', 'El Salvador', 'Andorra', 'Syria',
       'South Korea', 'Peru', 'United Arab Emirates', 'Kenya', 'Somalia',
       'Tanzania', 'Sri Lanka', 'Malaysia', 'Namibia', 'Morocco',
       'Ni

In [45]:
# Instantiate a LabelEncoder and bind it to the shared name `encode`.
# NOTE(review): later cells rebind or refit `encode`, so anything fitted through
# this name is not kept.
encode = LabelEncoder()

In [46]:
# Integer-encode the country names with a dedicated encoder. Holding it under its
# own name (rather than the shared `encode`, which later cells rebind) keeps the
# country <-> code mapping available for inverse_transform or for encoding new data.
country_encoder = LabelEncoder()
data['encoded_country'] = country_encoder.fit_transform(data['country'])

In [47]:
# Distinct integer codes assigned to the countries.
data['encoded_country'].unique()

array([ 43, 108, 177, 178,  44,  52, 182, 130, 185, 162,  80, 154,  65,
        19,   5,  92,  83,  78,  84, 171, 127,  76, 176,  16,  63, 119,
       115,  14,  26,   7, 190, 161, 152, 131,  47,  88,   8,  38,  73,
        79,  56,  20, 189, 128,  24, 132,   2,  31, 167,  68, 158,  36,
       112,   0, 188, 145, 124,  82,  28,  48,   3, 163, 150, 129, 175,
        86, 148, 166, 155, 102, 113, 110, 121,  33, 184,  12,  29, 125,
       169, 149,  77,   4, 107,  41,  74, 153, 135,  40, 186,  67,  46,
       173, 123, 111,  11,  93,  69, 139,  10, 170, 140, 117,  95,  64,
       191, 106, 181,  27, 143,  42, 133,  21,  97,  57,  53, 116, 136,
       160,  18, 120, 114, 159,  72, 103, 126,  94, 104,  61, 138, 144,
        35,  34,  25, 187,  60, 164,  54, 100,  23,   9,  71, 168, 109,
         6,  81,  30,  17, 101, 165,  91,  96,  90,  51,  70, 137,  32,
        85,  62,  66, 146, 134, 122,  50,  37, 156,  58, 180, 118, 105,
        59,  13,  99,  98,  39,  87, 147,  89, 157, 174,  75, 17

In [48]:
# region: distinct region names.
data['region'].unique()
# This column will be label-encoded as well.

array(['Central America & Caribbean', 'North America', 'South America',
       'Eastern Europe', 'Sub-Saharan Africa', 'Southeast Asia',
       'Western Europe', 'Middle East & North Africa', 'East Asia',
       'Australasia & Oceania', 'South Asia', 'Central Asia'],
      dtype=object)

In [49]:
# Rebind `encode` to a new, unfitted LabelEncoder; whatever the previous instance
# learned is no longer reachable through this name.
encode = LabelEncoder()

In [50]:
# Integer-encode the region names with a dedicated encoder. The shared `encode`
# object is refitted on other columns in later cells, which would discard the
# region <-> code mapping; a separate instance keeps it recoverable.
region_encoder = LabelEncoder()
data['encoded_region'] = region_encoder.fit_transform(data['region'])

In [51]:
# Distinct integer codes assigned to the regions.
data['encoded_region'].unique()

array([ 1,  6,  7,  4, 10,  9, 11,  5,  3,  0,  8,  2])

In [52]:
# attack_type: distinct attack categories (note the explicit 'Unknown' category).
data['attack_type'].unique()

array(['Assassination', 'Hostage Taking (Kidnapping)', 'Armed Assault',
       'Facility/Infrastructure Attack', 'Bombing/Explosion', 'Unknown',
       'Hijacking', 'Unarmed Assault',
       'Hostage Taking (Barricade Incident)'], dtype=object)

In [53]:
# Integer-encode the attack types with a dedicated encoder instead of refitting the
# shared `encode` object, so this column's label <-> code mapping stays recoverable
# and no previously fitted encoder is silently overwritten.
attack_type_encoder = LabelEncoder()
data['encoded_attack_type'] = attack_type_encoder.fit_transform(data['attack_type'])

In [54]:
# Distinct integer codes assigned to the attack types.
data['encoded_attack_type'].unique()

array([1, 6, 0, 3, 2, 8, 4, 7, 5])

In [55]:
# primary_target_type: distinct target categories.
data['primary_target_type'].unique()

array(['Private Citizens & Property', 'Government (Diplomatic)', 'Police',
       'Military', 'Government (General)', 'Business',
       'Educational Institution', 'Violent Political Party',
       'Religious Figures/Institutions', 'Unknown', 'Transportation',
       'Utilities', 'Airports & Aircraft', 'Journalists & Media',
       'Telecommunication', 'Food or Water Supply', 'NGO',
       'Terrorists/Non-State Militia', 'Other', 'Tourists', 'Maritime',
       'Abortion Related'], dtype=object)

In [56]:
# Integer-encode the target types with a dedicated encoder instead of refitting the
# shared `encode` object, so this column's label <-> code mapping stays recoverable
# and no previously fitted encoder is silently overwritten.
target_type_encoder = LabelEncoder()
data['encoded_primary_target_type'] = target_type_encoder.fit_transform(data['primary_target_type'])

In [57]:
# Distinct integer codes assigned to the target types.
data['encoded_primary_target_type'].unique()

array([13,  5, 12,  9,  6,  2,  3, 21, 14, 19, 18, 20,  1,  7, 15,  4, 10,
       16, 11, 17,  8,  0])

In [58]:
# weapon_type: distinct weapon categories.
data['weapon_type'].unique()

array(['Unknown', 'Firearms', 'Incendiary', 'Explosives', 'Chemical',
       'Melee', 'Sabotage Equipment', 'Fake Weapons', 'Radiological',
       'Other',
       'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)',
       'Biological'], dtype=object)

In [59]:
# Fit a fresh encoder on the weapon types, then apply it to the same column.
encode = LabelEncoder().fit(data['weapon_type'])
data['encoded_weapon_type'] = encode.transform(data['weapon_type'])

In [60]:
# Distinct integer codes assigned to the weapon types.
data['encoded_weapon_type'].unique()

array([10,  4,  5,  2,  1,  6,  9,  3,  8,  7, 11,  0])

### New Feature Making 

In [61]:
# Bucket the fatality counts. Intervals are right-closed, so the -1 lower edge puts
# exact zeros in their own '0' bucket and the last bucket holds everything above 100.
bins = [-1, 0, 5, 10, 20, 50, 100, np.inf]
labels = ['0', '1-5', '6-10', '11-20', '21-50', '51-100', '100+']
data['kill_range'] = pd.cut(data['num_killed'], bins, right=True, labels=labels)

In [62]:
# Buckets that actually occur, plus the ordered category definition.
data['kill_range'].unique()

['1-5', '0', '6-10', '21-50', '11-20', '51-100', '100+']
Categories (7, object): ['0' < '1-5' < '6-10' < '11-20' < '21-50' < '51-100' < '100+']

The num_killed column was binned into categorical ranges using pd.cut().
Custom bins were defined to group the number of people killed into
intervals like '1–5', '6–10', up to '100+'. 
This transformation simplifies the numeric feature into meaningful categories, 
making it easier to analyze casualty severity levels 
and potentially improve model interpretability.

In [63]:
# num_wounded: frequency of each wounded count, most common first.
data['num_wounded'].value_counts()

num_wounded
0.0      68896
1.0       7394
2.0       4741
3.0       3397
4.0       2269
         ...  
650.0        1
155.0        1
542.0        1
163.0        1
127.0        1
Name: count, Length: 224, dtype: int64

In [64]:
# Bucket the wounded counts with the same edges used for fatalities. Intervals are
# right-closed, so exact zeros fall in the '0' bucket and values above 100 in '100+'.
bins = [-1, 0, 5, 10, 20, 50, 100, np.inf]
labels = ['0', '1-5', '6-10', '11-20', '21-50', '51-100', '100+']
data['wounded_range'] = pd.cut(data['num_wounded'], bins, right=True, labels=labels)

In [65]:
# Buckets that actually occur, plus the ordered category definition.
data['wounded_range'].unique()

['0', '1-5', '6-10', '11-20', '21-50', '100+', '51-100']
Categories (7, object): ['0' < '1-5' < '6-10' < '11-20' < '21-50' < '51-100' < '100+']

The num_wounded column was binned into categorical ranges using pd.cut(). 
Custom bins were defined to group the number of people wounded into 
intervals like '1–5', '6–10', up to '100+'. 
This transformation simplifies the numeric feature into meaningful 
categories, making it easier to analyze casualty severity levels and
potentially improve model interpretability.

In [66]:
#Let's make a new column named casualty by combining num_killed and num_wounded

In [67]:
# Total casualties per incident: killed plus wounded, added row by row.
data['casualty'] = data['num_killed'].add(data['num_wounded'])

In [68]:
# Show the two inputs next to the derived total to eyeball the sum.
data[['num_killed','num_wounded','casualty']]

Unnamed: 0,num_killed,num_wounded,casualty
0,1.0,0.0,1.0
1,0.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
8,0.0,0.0,0.0
...,...,...,...
181683,6.0,0.0,6.0
181684,0.0,0.0,0.0
181686,1.0,2.0,3.0
181687,2.0,7.0,9.0


In [69]:
# The derived column should contain no missing values.
data['casualty'].isnull().sum()

0

In [70]:
# Column inventory after encoding and feature creation.
data.columns

Index(['year', 'month', 'multi_day_attack', 'country', 'region',
       'attack_successful', 'suicide_attack', 'attack_type',
       'primary_target_type', 'responsibility_claimed', 'weapon_type',
       'num_killed', 'num_wounded', 'property_damage', 'hostage_or_kidnapping',
       'terrorist_group_encoded', 'encoded_country', 'encoded_region',
       'encoded_attack_type', 'encoded_primary_target_type',
       'encoded_weapon_type', 'kill_range', 'wounded_range', 'casualty'],
      dtype='object')

In [71]:
# Remove the original text columns now that encoded versions exist.
# errors='ignore' skips any column that is already gone.
columns_to_drop = ['country', 'region', 'attack_type', 'primary_target_type', 'weapon_type']
data = data.drop(columns=columns_to_drop, errors='ignore')


In [72]:
# (rows, columns) after filtering, encoding and dropping the raw text columns.
data.shape

(98909, 19)

In [73]:
# Column list, non-null counts and dtypes at this stage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98909 entries, 0 to 181688
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   year                         98909 non-null  int64   
 1   month                        98909 non-null  int64   
 2   multi_day_attack             98909 non-null  int64   
 3   attack_successful            98909 non-null  int64   
 4   suicide_attack               98909 non-null  int64   
 5   responsibility_claimed       98909 non-null  float64 
 6   num_killed                   98909 non-null  float64 
 7   num_wounded                  98909 non-null  float64 
 8   property_damage              98909 non-null  int64   
 9   hostage_or_kidnapping        98909 non-null  float64 
 10  terrorist_group_encoded      98909 non-null  int64   
 11  encoded_country              98909 non-null  int64   
 12  encoded_region               98909 non-null  int64   
 13  encod

In [74]:
# kill_range holds string bucket labels; translate them to ordinal integers 0..6,
# numbered in increasing order of severity.
label_map = {
    bucket: rank
    for rank, bucket in enumerate(['0', '1-5', '6-10', '11-20', '21-50', '51-100', '100+'])
}

data['kill_range_encoded'] = data['kill_range'].map(label_map)

In [75]:
# wounded_range gets the same treatment: each bucket label is mapped to its
# ordinal position 0..6.
label_map = dict(zip(['0', '1-5', '6-10', '11-20', '21-50', '51-100', '100+'], range(7)))

data['wounded_range_encoded'] = data['wounded_range'].map(label_map)

The kill_range and wounded_range categorical columns were converted to numeric
format using a custom mapping (label_map).
Each bin label (e.g., '1-5', '6-10', etc.) was assigned an integer
from 0 to 6 in increasing order of severity.
This ordinal encoding makes the data suitable for machine learning models
that require numeric input while preserving the natural order of the categories.

In [76]:
# The string-labelled range columns are redundant once their encoded versions exist.
data = data.drop(columns=['kill_range', 'wounded_range'])

In [77]:
# Column list and dtypes after the intermediate range columns were dropped.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98909 entries, 0 to 181688
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   year                         98909 non-null  int64   
 1   month                        98909 non-null  int64   
 2   multi_day_attack             98909 non-null  int64   
 3   attack_successful            98909 non-null  int64   
 4   suicide_attack               98909 non-null  int64   
 5   responsibility_claimed       98909 non-null  float64 
 6   num_killed                   98909 non-null  float64 
 7   num_wounded                  98909 non-null  float64 
 8   property_damage              98909 non-null  int64   
 9   hostage_or_kidnapping        98909 non-null  float64 
 10  terrorist_group_encoded      98909 non-null  int64   
 11  encoded_country              98909 non-null  int64   
 12  encoded_region               98909 non-null  int64   
 13  encod

In [78]:
# Cast the columns that are still float or category to plain integers.
int_columns = [
    'responsibility_claimed',
    'num_killed',
    'num_wounded',
    'hostage_or_kidnapping',
    'casualty',
    'kill_range_encoded',
    'wounded_range_encoded',
]
for column in int_columns:
    data[column] = data[column].astype(int)

Converted several columns to integer type (int) to ensure consistency in data types 
and compatibility with machine learning models. 
This step helps prevent errors during modeling and reduces memory usage 
by eliminating unnecessary float representations for whole numbers.

In [79]:
# Preview the fully numeric frame.
data.head()

Unnamed: 0,year,month,multi_day_attack,attack_successful,suicide_attack,responsibility_claimed,num_killed,num_wounded,property_damage,hostage_or_kidnapping,terrorist_group_encoded,encoded_country,encoded_region,encoded_attack_type,encoded_primary_target_type,encoded_weapon_type,casualty,kill_range_encoded,wounded_range_encoded
0,1970,7,0,1,0,-9,1,0,0,0,20,43,1,1,13,10,1,1,0
1,1970,0,0,1,0,-9,0,0,0,1,20,108,6,6,5,10,0,0,0
5,1970,1,0,1,0,0,0,0,1,0,20,177,6,0,12,4,0,0,0
6,1970,1,0,0,0,-9,0,0,0,0,20,178,7,1,12,4,0,0,0
8,1970,1,0,1,0,1,0,0,1,0,20,177,6,3,9,5,0,0,0
