In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --------------------------------------------------------------------
# 1. Load the dataset
# --------------------------------------------------------------------
# Replace with your actual file path as needed
df = pd.read_csv("globalterrorismdb_0718dist.csv", encoding='latin-1', low_memory=False)

# --------------------------------------------------------------------
# 2. Basic Data Cleaning
# --------------------------------------------------------------------
# Choose the columns you want for modeling. 
# Adjust this list based on your target and the features you find relevant.
columns_needed = [
    "iyear",            # Year
    "country_txt",      # Country (categorical)
    "region_txt",       # Region (categorical)
    "attacktype1_txt",  # Attack type (categorical)
    "targtype1_txt",    # Target type (categorical)
    "weaptype1_txt",    # Weapon type (categorical)
    "nkill",            # Number of people killed
    "nwound",           # Number of people wounded
    "success"           # Our target variable (1=successful, 0=unsuccessful)
]

# Keep only those columns (some might not exist in older versions of the dataset—adjust if needed)
df = df[columns_needed].copy()

# Example numeric cleaning: fill missing values with 0
# (In practice, consider more nuanced imputation strategies)
df["nkill"] = df["nkill"].fillna(0)
df["nwound"] = df["nwound"].fillna(0)


# Drop rows where 'success' is NaN (if any exist)
df.dropna(subset=["success"], inplace=True)

# --------------------------------------------------------------------
# 3. Feature Engineering and Encoding
# --------------------------------------------------------------------
# Separate target (y) and features (X)
y = df["success"].astype(int)  # ensure it's int
X = df.drop(["success"], axis=1)

# Identify categorical columns to encode
cat_cols = ["country_txt", "region_txt", "attacktype1_txt", "targtype1_txt", "weaptype1_txt"]
# Numeric columns
num_cols = ["iyear", "nkill", "nwound"]

# Apply label encoding to the categorical columns
encoder_map = {}
for col in cat_cols:
    encoder = LabelEncoder()
    # Convert column to string before label-encoding to avoid errors
    X[col] = encoder.fit_transform(X[col].astype(str))
    # Store encoder if you need to transform new data later
    encoder_map[col] = encoder

# At this point, X has only numeric columns.
# You could also do one-hot encoding (pd.get_dummies) for better performance on tree-based models.

# --------------------------------------------------------------------
# 4. Train/Test Split
# --------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,      # 20% for testing
    random_state=42     # for reproducibility
)

# --------------------------------------------------------------------
# 5. Model Selection and Hyperparameter Tuning
# --------------------------------------------------------------------
# We'll use a Random Forest Classifier as an example
rf = RandomForestClassifier(random_state=42)

# A small parameter grid for demonstration. Feel free to expand it.
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5]
}

# GridSearchCV will try each combination using cross-validation
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,              # 3-fold cross-validation
    scoring="accuracy",# You can switch to f1, recall, etc., if data is imbalanced
    n_jobs=-1          # Use all CPU cores
)

# Train the models
grid_search.fit(X_train, y_train)

# Retrieve the best model
best_model = grid_search.best_estimator_

# --------------------------------------------------------------------
# 6. Evaluation
# --------------------------------------------------------------------
# Predict on the test set
y_pred = best_model.predict(X_test)

# Check best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Classification report (precision, recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --------------------------------------------------------------------
# 7. Feature Importance (Optional)
# --------------------------------------------------------------------
# Feature importances can reveal which features contributed most to the model
if hasattr(best_model, "feature_importances_"):
    importances = best_model.feature_importances_
    feature_names = X.columns
    
    importance_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)
    
    print("\nFeature Importances:")
    print(importance_df)

# --------------------------------------------------------------------
# 8. Next Steps
# --------------------------------------------------------------------
# - Consider deeper data cleaning/imputation.
# - Handle class imbalance if "success" is skewed.
# - Try advanced techniques (XGBoost, LightGBM), or time-based splits 
#   if predicting future attacks based on older data.
# - Incorporate domain knowledge for more relevant features.


Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Test Accuracy: 0.9309006852142326

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.54      0.63      3978
           1       0.94      0.98      0.96     32361

    accuracy                           0.93     36339
   macro avg       0.85      0.76      0.80     36339
weighted avg       0.92      0.93      0.93     36339


Feature Importances:
           Feature  Importance
4    targtype1_txt    0.208826
0            iyear    0.197534
3  attacktype1_txt    0.190224
1      country_txt    0.122186
6            nkill    0.102945
7           nwound    0.078312
2       region_txt    0.054458
5    weaptype1_txt    0.045516


In [4]:
from sklearn.metrics import f1_score

# Suppose you have ground truth labels and model predictions:
# y_test = [...]
# y_pred = [...]

# Calculate the F1-score for a binary classification
f1 = f1_score(y_test, y_pred)  # Default: pos_label=1

print("F1-score:", f1)


F1-score: 0.9618985478658028


In [5]:
from sklearn.metrics import f1_score

# y_test = [...]
# y_pred = [...]

# macro average – treats all classes equally
f1_macro = f1_score(y_test, y_pred, average='macro')

# weighted average – weights each class by its support (i.e., number of samples)
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print("F1-score (macro):", f1_macro)
print("F1-score (weighted):", f1_weighted)


F1-score (macro): 0.79563562079637
F1-score (weighted): 0.9254972201527559


In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# --------------------------------------------------------------------
# 1. Load the dataset
# --------------------------------------------------------------------
df = pd.read_csv("globalterrorismdb_0718dist.csv", encoding='latin-1', low_memory=False)

# --------------------------------------------------------------------
# 2. Create 'casualties' target variable
# --------------------------------------------------------------------
# Fill missing values with 0 first, then sum
df["nkill"] = df["nkill"].fillna(0)
df["nwound"] = df["nwound"].fillna(0)

df['casualties'] = df['nkill'] + df['nwound']

# We'll drop rows where casualties is NaN just in case, but we already filled with 0
df.dropna(subset=['casualties'], inplace=True)

# --------------------------------------------------------------------
# 3. Select Features (X) and Target (y)
# --------------------------------------------------------------------
# For demonstration, let's use a few columns:
#   - iyear (numeric)
#   - attacktype1_txt (categorical)
#   - weaptype1_txt (categorical)
#   - targtype1_txt (categorical)
#   - region_txt (categorical)
#   - city (categorical) - optional, can be large cardinality

columns_needed = [
    'iyear',
    'attacktype1_txt',
    'weaptype1_txt',
    'targtype1_txt',
    'region_txt'
]

# Make sure these columns exist in the dataset
df = df[columns_needed + ['casualties']].copy()

# Define target
y = df['casualties']
X = df.drop('casualties', axis=1)

# --------------------------------------------------------------------
# 4. Convert Categorical Columns to Numeric
# --------------------------------------------------------------------
cat_cols = ['attacktype1_txt', 'weaptype1_txt', 'targtype1_txt', 'region_txt']
for col in cat_cols:
    X[col] = X[col].astype(str)
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])



# --------------------------------------------------------------------
# 5. Train/Test Split
# --------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# --------------------------------------------------------------------
# 6. Model Definition & Hyperparameter Tuning
# --------------------------------------------------------------------
rfr = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# --------------------------------------------------------------------
# 7. Evaluation
# --------------------------------------------------------------------
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Best Hyperparameters:", grid_search.best_params_)
print("Test RMSE:", rmse)
print("Test R^2 Score:", r2)


Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Test RMSE: 33.15702063861292
Test R^2 Score: 0.6144554569296967


In [9]:
print(df.columns.tolist())


['iyear', 'attacktype1_txt', 'weaptype1_txt', 'targtype1_txt', 'region_txt', 'casualties']
