In [1]:
#Install and Import Dependencies
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
#Load the Dataset
#Install and Import Dependencies
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [3]:
#Select Relevant Columns
#Decide which columns matter for your classification goal. For predicting success (1/0), you might include:

#iyear: year
#country_txt: country name
#region_txt: region name
#attacktype1_txt: type of attack (bombing, hijacking, etc.)
#targtype1_txt: primary target type
#weaptype1_txt: primary weapon type
#nkill, nwound: fatalities, wounded
#success: (target variable for classification)
# Adjust file path as needed
df = pd.read_csv("globalterrorismdb_0718dist.csv", encoding='latin-1', low_memory=False)

# Quickly explore columns
print(df.columns)
print(df.head(3))


Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
       'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
      dtype='object', length=135)
        eventid  iyear  imonth  iday approxdate  extended resolution  country  \
0  197000000001   1970       7     2        NaN         0        NaN       58   
1  197000000002   1970       0     0        NaN         0        NaN      130   
2  197001000001   1970       1     0        NaN         0        NaN      160   

          country_txt  region  ... addnotes scite1 scite2  scite3  dbsource  \
0  Dominican Republic       2  ...      NaN    NaN    NaN     NaN      PGIS   
1              Mexico       1  ...      NaN    NaN    NaN     NaN      PGIS   
2         Philippines       5  ...      NaN    NaN    NaN     NaN      PGIS   

   INT_LOG  INT_IDEO INT_MISC INT_ANY  related  
0        0

In [4]:
columns_needed = [
    "iyear", "country_txt", "region_txt", "attacktype1_txt",
    "targtype1_txt", "weaptype1_txt", "nkill", "nwound", "success"
]

df = df[columns_needed].copy()


In [5]:
#Handle Missing Values
#Replace NaN in numeric columns (nkill, nwound) with 0:



df["nkill"] = df["nkill"].fillna(0)
df["nwound"] = df["nwound"].fillna(0)


In [6]:
#Drop rows where the target (success) is missing (if any)
df.dropna(subset=["success"], inplace=True)


In [7]:
#Examine Class Distribution
print(df["success"].value_counts())


success
1    161632
0     20059
Name: count, dtype: int64


In [8]:
# Define Features (X) and Target (y)

y = df["success"].astype(int)  # Our classification target
X = df.drop("success", axis=1)


In [9]:
#The GTD typically stores descriptive text for columns like country_txt. Machine learning models often need numeric inputs. We can use LabelEncoder or One-Hot Encoding
cat_cols = ["country_txt", "region_txt", "attacktype1_txt", "targtype1_txt", "weaptype1_txt"]
for col in cat_cols:
    X[col] = X[col].astype(str)        # ensure string type
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])


In [10]:

#Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)


In [11]:
#Choose and Train a Model
#Here, we demonstrate a Random Forest classifier with Grid Search for basic hyperparameter tuning
rf = RandomForestClassifier(random_state=42)

# A small parameter grid; expand or refine as needed
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10],
    "min_samples_split": [2, 5]
}

grid_search = GridSearchCV(
    rf, 
    param_grid,
    cv=3,               # 3-fold cross validation
    scoring="accuracy", # or 'f1', 'roc_auc', etc.
    n_jobs=-1           # use all CPU cores
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best Params:", grid_search.best_params_)


Best Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


In [12]:
# Evaluate the Model

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

#Accuracy: percentage of correct predictions.
#Classification Report: includes precision, recall, F1-score per class.
#If the dataset is imbalanced, focus on F1-score or precision/recall instead of accuracy.


Test Accuracy: 0.9309006852142326

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.54      0.63      3978
           1       0.94      0.98      0.96     32361

    accuracy                           0.93     36339
   macro avg       0.85      0.76      0.80     36339
weighted avg       0.92      0.93      0.93     36339



In [13]:

#If successful attacks far outnumber unsuccessful (or vice versa), you might:

#Use class_weight:



rf = RandomForestClassifier(class_weight="balanced", random_state=42)


In [14]:

#For tasks like predicting total casualties (nkill + nwound) or property damage (propvalue):

#Replace RandomForestClassifier with RandomForestRegressor (or XGBoost/LightGBM regressors).
#Change GridSearchCV scoring to neg_mean_squared_error or r2

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring="neg_mean_squared_error"
)
# ...
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("RMSE:", rmse)
print("R^2:", r2)


RMSE: 0.2628674852197726
R^2: 0.29118488108067064
