# Global Terrorism Database - Comprehensive Analysis

## Project Overview
This comprehensive analysis explores global terrorism patterns using the Global Terrorism Database (GTD) from 1970-2017. The project implements multiple machine learning approaches to understand and predict various aspects of terrorist activities.

**Key Objectives:**
1. **Attack Success Prediction**: Classify whether terrorist attacks will be successful
2. **Casualty Prediction**: Predict the number of casualties (killed + wounded)
3. **Attack Type Classification**: Predict the type of terrorist attack
4. **Temporal Analysis**: Analyze trends and patterns over time
5. **Geographical Analysis**: Identify regional patterns and hotspots
6. **Risk Assessment**: Develop comprehensive risk scoring models

**Dataset:** 181,691 terrorist incidents with 135+ features including location, attack details, casualties, and outcomes.

## 1. Import Libraries and Setup

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    PLOTLY_AVAILABLE = True
except ImportError:
    PLOTLY_AVAILABLE = False
    print("Plotly not available, using matplotlib/seaborn only")

# Machine Learning
from sklearn.model_selection import (
    train_test_split, cross_val_score, GridSearchCV,
    StratifiedKFold, KFold, TimeSeriesSplit
)
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve,
    mean_squared_error, mean_absolute_error, r2_score,
    precision_recall_curve, f1_score
)
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from imblearn.over_sampling import SMOTE

# Date/Time
from datetime import datetime
import calendar

# Set style for visualizations.
# The 'seaborn-v0_8' style sheet name only exists in matplotlib >= 3.6; older
# releases ship the same sheet as 'seaborn'. Fall back rather than abort setup.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

# Display settings: never truncate wide frames when printing.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("✅ Libraries imported successfully!")
print(f"📊 Plotly available: {PLOTLY_AVAILABLE}")

## 2. Data Loading and Initial Exploration

In [None]:
# Load the Global Terrorism Database and print a first overview of it.
print("🔄 Loading Global Terrorism Database...")

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df = pd.read_csv('globalterrorismdb_0718dist.csv', encoding='latin-1', low_memory=False)

print(f"✅ Dataset loaded successfully!")
print(f"📊 Dataset Shape: {df.shape}")
print(f"💣 Total Incidents: {len(df):,}")
print(f"📋 Features: {len(df.columns)}")
print(f"📅 Time Range: {df['iyear'].min()} - {df['iyear'].max()}")

# Display basic information
print("\n=== DATASET OVERVIEW ===")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

print("\n=== KEY COLUMNS AVAILABLE ===")
key_columns = [
    'iyear', 'imonth', 'iday', 'country_txt', 'region_txt', 'city',
    'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt', 'gname',
    'nkill', 'nwound', 'success', 'suicide', 'property', 'propvalue'
]

# Report availability and remember which key columns actually exist, so the
# summaries below do not raise KeyError when one of them is absent.
available_key_columns = []
for col in key_columns:
    if col in df.columns:
        available_key_columns.append(col)
        print(f"✅ {col}")
    else:
        print(f"❌ {col} (not available)")

print("\n=== MISSING VALUES SUMMARY ===")
missing_summary = df[available_key_columns].isnull().sum().sort_values(ascending=False)
missing_pct = (missing_summary / len(df) * 100).round(1)
missing_df = pd.DataFrame({
    'Missing_Count': missing_summary,
    'Missing_Percentage': missing_pct
})
# Only list columns that have at least one missing value.
print(missing_df[missing_df['Missing_Count'] > 0])

print("\n=== SAMPLE DATA ===")
# `display` is the rich-output helper provided by the notebook environment.
display(df[available_key_columns].head())

In [None]:
# Text-only overview of the raw data: time span, geography, attack types,
# casualties and success rate.
n_incidents = len(df)

print("=== TEMPORAL DISTRIBUTION ===")
yearly_counts = df['iyear'].value_counts().sort_index()
peak_year = yearly_counts.idxmax()
quietest_year = yearly_counts.idxmin()
print(f"Peak year: {peak_year} ({yearly_counts.max():,} incidents)")
print(f"Lowest year: {quietest_year} ({yearly_counts.min():,} incidents)")
print(f"Average per year: {yearly_counts.mean():.0f} incidents")

print("\n=== GEOGRAPHICAL DISTRIBUTION ===")
country_counts = df['country_txt'].value_counts().head(10)
print("Top 10 most affected countries:")
for country, count in zip(country_counts.index, country_counts.values):
    # Share of all incidents, in percent.
    pct = (count / n_incidents) * 100
    print(f"  {country}: {count:,} incidents ({pct:.1f}%)")

print("\n=== ATTACK TYPES ===")
attack_types = df['attacktype1_txt'].value_counts().head(5)
print("Top 5 attack types:")
for attack_type, count in zip(attack_types.index, attack_types.values):
    pct = (count / n_incidents) * 100
    print(f"  {attack_type}: {count:,} incidents ({pct:.1f}%)")

print("\n=== CASUALTY STATISTICS ===")
# Series.sum() skips missing values, so unknown counts contribute nothing here.
total_killed = df['nkill'].sum()
total_wounded = df['nwound'].sum()
combined_casualties = total_killed + total_wounded
print(f"Total killed: {total_killed:,}")
print(f"Total wounded: {total_wounded:,}")
print(f"Total casualties: {combined_casualties:,}")
print(f"Average casualties per incident: {combined_casualties / n_incidents:.2f}")

print("\n=== SUCCESS RATE ===")
success_rate = df['success'].mean() * 100
print(f"Overall success rate: {success_rate:.1f}%")

## 3. Comprehensive Data Preprocessing

In [None]:
# Build the preprocessed working frame; the raw `df` is left untouched.
print("=== DATA PREPROCESSING ===")

df_processed = df.copy()

# 1. Handle missing values systematically
print("\n🔧 Handling missing values...")

# Casualty counts: a missing value is assumed to mean "no casualties".
for casualty_col in ('nkill', 'nwound'):
    df_processed[casualty_col] = df_processed[casualty_col].fillna(0)

# Property value: missing becomes 0 (only when the column is present).
if 'propvalue' in df_processed:
    df_processed['propvalue'] = df_processed['propvalue'].fillna(0)

# Descriptive text and location fields: missing becomes the literal 'Unknown'.
text_cols = ['attacktype1_txt', 'targtype1_txt', 'weaptype1_txt', 'gname']
location_cols = ['country_txt', 'region_txt', 'city']
for col in text_cols + location_cols:
    if col not in df_processed.columns:
        continue
    df_processed[col] = df_processed[col].fillna('Unknown')
# 2. Feature engineering: derive casualty, temporal, attack and target columns
# on df_processed.
print("\n🔧 Creating derived features...")

# Casualty features
df_processed['total_casualties'] = df_processed['nkill'] + df_processed['nwound']
# The +0.1 keeps the ratio defined (0.0) for incidents without casualties.
df_processed['casualty_ratio'] = df_processed['nkill'] / (df_processed['total_casualties'] + 0.1)
df_processed['high_casualty'] = (df_processed['total_casualties'] >= 10).astype(int)
df_processed['mass_casualty'] = (df_processed['total_casualties'] >= 50).astype(int)

# Temporal features
df_processed['decade'] = (df_processed['iyear'] // 10) * 10
df_processed['month_name'] = df_processed['imonth'].apply(lambda x: calendar.month_name[int(x)] if pd.notnull(x) and 1 <= x <= 12 else 'Unknown')

# Weekend flag, computed vectorised instead of building one datetime per row.
# GTD records an unknown month/day as 0, which datetime() rejects with
# ValueError; like missing values, such components fall back to 1. Impossible
# dates (e.g. 31 April) are coerced to NaT and flagged as not-weekend, and rows
# without a year are never flagged.
incident_year = pd.to_numeric(df_processed['iyear'], errors='coerce')
incident_month = pd.to_numeric(df_processed['imonth'], errors='coerce')
incident_day = pd.to_numeric(df_processed['iday'], errors='coerce')
incident_dates = pd.to_datetime(
    pd.DataFrame({
        # 2000 is only a placeholder so the cast succeeds; masked out below.
        'year': incident_year.fillna(2000).astype(int),
        'month': incident_month.where(incident_month.between(1, 12), 1).astype(int),
        'day': incident_day.where(incident_day.between(1, 31), 1).astype(int),
    }),
    errors='coerce',
)
# dayofweek: Monday=0 ... Sunday=6 (same convention as datetime.weekday()).
df_processed['is_weekend'] = (
    (incident_dates.dt.dayofweek >= 5) & incident_year.notna()
).astype(int)

# Attack characteristics
if 'suicide' in df_processed.columns:
    df_processed['is_suicide'] = df_processed['suicide'].fillna(0).astype(int)
else:
    df_processed['is_suicide'] = 0
# GTD codes `property` as 1 = yes, 0 = no, -9 = unknown; only an explicit 1
# counts as damage so the flag stays strictly binary.
if 'property' in df_processed.columns:
    df_processed['has_property_damage'] = (df_processed['property'] == 1).astype(int)
else:
    df_processed['has_property_damage'] = 0

# Geographic features
# NOTE(review): this flags city *names* containing the word "capital"; it does
# not identify actual capital cities - confirm this is the intended feature.
df_processed['is_capital'] = df_processed['city'].str.contains('capital|Capital', na=False).astype(int)

# Target variables for different prediction tasks
df_processed['success_binary'] = df_processed['success'].fillna(0).astype(int)
# Right-closed bins: None = 0, Low = (0, 5], Medium = (5, 15], High = (15, 50],
# Extreme = above 50. The first edge must lie below 0, otherwise zero-casualty
# incidents fall outside every bin and become NaN.
df_processed['casualty_category'] = pd.cut(df_processed['total_casualties'],
                                          bins=[-float('inf'), 0, 5, 15, 50, float('inf')],
                                          labels=['None', 'Low', 'Medium', 'High', 'Extreme'])

print(f"✅ Feature engineering completed")
print(f"📊 New dataset shape: {df_processed.shape}")

# 3. Create categorical encodings: df_encoded = df_processed plus integer label
# codes and one-hot dummy columns.
print("\n🔧 Encoding categorical variables...")

# Candidate model inputs, grouped by kind.
categorical_features = [
    'country_txt', 'region_txt', 'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt'
]

numerical_features = [
    'iyear', 'imonth', 'iday', 'nkill', 'nwound', 'total_casualties',
    'casualty_ratio', 'high_casualty', 'mass_casualty', 'is_suicide',
    'has_property_damage', 'is_weekend', 'is_capital'
]

df_encoded = df_processed.copy()

# Integer label codes in '<column>_encoded'; the fitted encoders are kept so
# the codes can be mapped back to their labels.
label_encoders = {}
for col in categorical_features:
    if col not in df_encoded.columns:
        continue
    encoder = LabelEncoder()
    df_encoded[f'{col}_encoded'] = encoder.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = encoder

# One-hot dummies for the lower-cardinality columns. The first level of each
# column is dropped, and all dummy frames are appended in one concat.
high_cardinality_cols = ['country_txt', 'region_txt']
low_cardinality_cols = ['attacktype1_txt', 'targtype1_txt', 'weaptype1_txt']

dummy_frames = [
    pd.get_dummies(df_encoded[col], prefix=col, drop_first=True)
    for col in low_cardinality_cols
    if col in df_encoded.columns
]
df_encoded = pd.concat([df_encoded, *dummy_frames], axis=1)

print(f"✅ Categorical encoding completed")
print(f"📊 Encoded dataset shape: {df_encoded.shape}")

print("\n✅ Preprocessing completed successfully!")

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Comprehensive EDA with multiple visualizations
# Draws one 3x3 figure from df_processed (one panel per numbered section below)
# and then prints the headline numbers taken from the same aggregates.
print("=== EXPLORATORY DATA ANALYSIS ===")

# Create comprehensive visualization
fig, axes = plt.subplots(3, 3, figsize=(20, 18))

# 1. Temporal trends
yearly_incidents = df_processed.groupby('iyear').size()
axes[0,0].plot(yearly_incidents.index, yearly_incidents.values, linewidth=2, color='red')
axes[0,0].set_title('Terrorist Incidents Over Time', fontsize=14, fontweight='bold')
axes[0,0].set_xlabel('Year')
axes[0,0].set_ylabel('Number of Incidents')
axes[0,0].grid(True, alpha=0.3)

# 2. Regional distribution
regional_counts = df_processed['region_txt'].value_counts().head(10)
axes[0,1].barh(range(len(regional_counts)), regional_counts.values, color='skyblue')
axes[0,1].set_yticks(range(len(regional_counts)))
axes[0,1].set_yticklabels(regional_counts.index)
axes[0,1].set_title('Incidents by Region (Top 10)', fontsize=14, fontweight='bold')
axes[0,1].set_xlabel('Number of Incidents')

# 3. Attack types
# Only the 8 most frequent types are drawn, so the pie percentages are shares
# of those 8, not of all incidents.
attack_counts = df_processed['attacktype1_txt'].value_counts().head(8)
colors = plt.cm.Set3(np.linspace(0, 1, len(attack_counts)))
axes[0,2].pie(attack_counts.values, labels=attack_counts.index, autopct='%1.1f%%', colors=colors)
axes[0,2].set_title('Attack Types Distribution', fontsize=14, fontweight='bold')

# 4. Casualty distribution
# Incidents with more than 100 casualties are left out of the histogram; the
# y-axis is logarithmic.
casualty_data = df_processed['total_casualties'][df_processed['total_casualties'] <= 100]
axes[1,0].hist(casualty_data, bins=50, alpha=0.7, color='orange', edgecolor='black')
axes[1,0].set_title('Total Casualties Distribution (≤100)', fontsize=14, fontweight='bold')
axes[1,0].set_xlabel('Total Casualties')
axes[1,0].set_ylabel('Frequency')
axes[1,0].set_yscale('log')

# 5. Success rate by attack type
# Mean of the 0/1 success flag per attack type, highest 8 shown.
success_by_attack = df_processed.groupby('attacktype1_txt')['success_binary'].mean().sort_values(ascending=False).head(8)
axes[1,1].bar(range(len(success_by_attack)), success_by_attack.values, color='green', alpha=0.7)
axes[1,1].set_xticks(range(len(success_by_attack)))
axes[1,1].set_xticklabels(success_by_attack.index, rotation=45, ha='right')
axes[1,1].set_title('Success Rate by Attack Type', fontsize=14, fontweight='bold')
axes[1,1].set_ylabel('Success Rate')

# 6. Monthly patterns
# Only months 1-12 are plotted; any other month code present in the data is
# not shown, and a month with no rows is drawn as 0.
monthly_incidents = df_processed.groupby('imonth').size()
month_names = [calendar.month_abbr[i] for i in range(1, 13)]
axes[1,2].bar(range(1, 13), [monthly_incidents.get(i, 0) for i in range(1, 13)], color='purple', alpha=0.7)
axes[1,2].set_xticks(range(1, 13))
axes[1,2].set_xticklabels(month_names)
axes[1,2].set_title('Incidents by Month', fontsize=14, fontweight='bold')
axes[1,2].set_ylabel('Number of Incidents')

# 7. Weapon types
weapon_counts = df_processed['weaptype1_txt'].value_counts().head(6)
axes[2,0].barh(range(len(weapon_counts)), weapon_counts.values, color='red', alpha=0.7)
axes[2,0].set_yticks(range(len(weapon_counts)))
axes[2,0].set_yticklabels(weapon_counts.index)
axes[2,0].set_title('Primary Weapon Types', fontsize=14, fontweight='bold')
axes[2,0].set_xlabel('Number of Incidents')

# 8. Target types
target_counts = df_processed['targtype1_txt'].value_counts().head(8)
axes[2,1].bar(range(len(target_counts)), target_counts.values, color='brown', alpha=0.7)
axes[2,1].set_xticks(range(len(target_counts)))
axes[2,1].set_xticklabels(target_counts.index, rotation=45, ha='right')
axes[2,1].set_title('Primary Target Types', fontsize=14, fontweight='bold')
axes[2,1].set_ylabel('Number of Incidents')

# 9. Decade comparison
decade_casualties = df_processed.groupby('decade')['total_casualties'].sum()
axes[2,2].bar(decade_casualties.index, decade_casualties.values, color='darkblue', alpha=0.7)
axes[2,2].set_title('Total Casualties by Decade', fontsize=14, fontweight='bold')
axes[2,2].set_xlabel('Decade')
axes[2,2].set_ylabel('Total Casualties')

plt.tight_layout()
plt.show()

# Summary statistics
# value_counts() sorts descending, so index[0] / iloc[0] is the most frequent
# category and its count.
print("\n=== KEY INSIGHTS ===")
print(f"📈 Peak incident year: {yearly_incidents.idxmax()} ({yearly_incidents.max():,} incidents)")
print(f"🌍 Most affected region: {regional_counts.index[0]} ({regional_counts.iloc[0]:,} incidents)")
print(f"💥 Most common attack: {attack_counts.index[0]} ({attack_counts.iloc[0]:,} incidents)")
print(f"🎯 Most targeted: {target_counts.index[0]} ({target_counts.iloc[0]:,} incidents)")
print(f"🔫 Most used weapon: {weapon_counts.index[0]} ({weapon_counts.iloc[0]:,} incidents)")
print(f"📊 Overall success rate: {df_processed['success_binary'].mean()*100:.1f}%")
print(f"💀 Average casualties per incident: {df_processed['total_casualties'].mean():.2f}")

## 5. Advanced Statistical Analysis

In [None]:
# Advanced statistical analysis, part 1: pairwise correlations between the key
# numeric and flag columns.
print("=== STATISTICAL ANALYSIS ===")

# 1. Correlation analysis
print("\n📊 Correlation Analysis:")
correlation_features = [
    'iyear', 'total_casualties', 'nkill', 'nwound', 'success_binary',
    'high_casualty', 'mass_casualty', 'is_suicide', 'has_property_damage'
]

corr_matrix = df_processed[correlation_features].corr()

plt.figure(figsize=(12, 10))
# Mask the upper triangle including the diagonal, so each pair is drawn once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix - Key Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# 2. Success rate analysis by various factors: one bar chart per factor with
# the ten highest success rates among categories seen at least 100 times.
print("\n📊 Success Rate Analysis:")

factors = {
    'Attack Type': 'attacktype1_txt',
    'Target Type': 'targtype1_txt',
    'Weapon Type': 'weaptype1_txt',
    'Region': 'region_txt'
}

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

for ax, (factor_name, factor_col) in zip(axes, factors.items()):
    per_category = df_processed.groupby(factor_col)['success_binary'].agg(['mean', 'count'])
    frequent = per_category.loc[per_category['count'] >= 100]
    success_by_factor = frequent.sort_values('mean', ascending=False).head(10)

    n_bars = len(success_by_factor)
    positions = range(n_bars)
    bar_colors = plt.cm.viridis(np.linspace(0, 1, n_bars))

    bars = ax.bar(positions, success_by_factor['mean'], color=bar_colors)
    ax.set_xticks(positions)
    ax.set_xticklabels(success_by_factor.index, rotation=45, ha='right')
    ax.set_title(f'Success Rate by {factor_name}', fontsize=12, fontweight='bold')
    ax.set_ylabel('Success Rate')

    # Write each rate just above its bar.
    for bar, value in zip(bars, success_by_factor['mean']):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.01
        ax.text(label_x, label_y, f'{value:.2f}', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

# 3. Casualty analysis: casualty statistics per attack type, restricted to
# types with at least 100 incidents and ordered by mean casualties.
print("\n📊 Casualty Analysis:")

casualty_by_attack = df_processed.groupby('attacktype1_txt')['total_casualties'].agg(['mean', 'median', 'std', 'count'])
casualty_by_attack = casualty_by_attack[casualty_by_attack['count'] >= 100].sort_values('mean', ascending=False).head(10)

print("Average casualties by attack type (top 10):")
# The loop variable must not be called `stats`: that would overwrite the
# `scipy.stats` module imported at the top of the notebook for all later cells.
for attack_type, row in casualty_by_attack.iterrows():
    # iterrows() upcasts the integer count to float, so cast it back for display.
    print(f"  {attack_type}: {row['mean']:.2f} avg, {row['median']:.1f} median ({int(row['count']):,} incidents)")

# 4. Temporal trends analysis: per-year incident counts, casualties and
# success rate, plotted as four line charts and summarised in text.
print("\n📊 Temporal Trends:")

yearly_stats = (
    df_processed
    .groupby('iyear')
    .agg({
        'eventid': 'count',
        'total_casualties': ['sum', 'mean'],
        'success_binary': 'mean'
    })
    .round(2)
)

# Flatten the two-level aggregate header into plain column names.
yearly_stats.columns = ['incidents', 'total_casualties', 'avg_casualties', 'success_rate']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel: (axis, column, title, y-label, colour, x-label or None).
# Only the bottom row carries an x-axis label.
panels = [
    (axes[0,0], 'incidents', 'Annual Incidents', 'Number of Incidents', 'blue', None),
    (axes[0,1], 'total_casualties', 'Annual Total Casualties', 'Total Casualties', 'red', None),
    (axes[1,0], 'avg_casualties', 'Average Casualties per Incident', 'Average Casualties', 'orange', 'Year'),
    (axes[1,1], 'success_rate', 'Annual Success Rate', 'Success Rate', 'green', 'Year'),
]
for ax, column, title, y_label, line_color, x_label in panels:
    ax.plot(yearly_stats.index, yearly_stats[column], linewidth=2, color=line_color)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(y_label)
    if x_label is not None:
        ax.set_xlabel(x_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

incidents_per_year = yearly_stats['incidents']
casualties_per_year = yearly_stats['total_casualties']
mean_casualties_per_year = yearly_stats['avg_casualties']

print(f"\n📈 Trend Analysis:")
print(f"   Peak incidents: {incidents_per_year.max():,} in {incidents_per_year.idxmax()}")
print(f"   Peak casualties: {casualties_per_year.max():,} in {casualties_per_year.idxmax()}")
print(f"   Highest avg casualties: {mean_casualties_per_year.max():.2f} in {mean_casualties_per_year.idxmax()}")
print(f"   Recent trends (2010-2017):")
# Label-based slice on the year index; both end years are included.
recent_trend = yearly_stats.loc[2010:2017]
print(f"     Avg incidents/year: {recent_trend['incidents'].mean():.0f}")
print(f"     Avg casualties/year: {recent_trend['total_casualties'].mean():.0f}")
print(f"     Avg success rate: {recent_trend['success_rate'].mean():.3f}")

## 6. Machine Learning Pipeline - Task 1: Attack Success Prediction

In [None]:
# Task 1: Predict attack success (binary classification)
# Builds the feature matrix and target from df_encoded, makes a stratified
# 80/20 split, standardises the features and optionally balances the training
# set with SMOTE.
print("=== TASK 1: ATTACK SUCCESS PREDICTION ===")

# Prepare data for classification
print("\n🔧 Preparing data for classification...")

# Select features for attack success prediction
# NOTE(review): nkill / nwound / total_casualties / has_property_damage describe
# the outcome of an attack; confirm they are meant to be inputs when predicting
# whether an attack succeeds.
feature_columns = [
    'iyear', 'imonth', 'nkill', 'nwound', 'total_casualties',
    'is_suicide', 'has_property_damage', 'is_weekend',
    'country_txt_encoded', 'region_txt_encoded', 'attacktype1_txt_encoded',
    'targtype1_txt_encoded', 'weaptype1_txt_encoded'
]

# Filter available columns
available_features = [col for col in feature_columns if col in df_encoded.columns]
print(f"Available features: {len(available_features)}")

# Prepare data
X = df_encoded[available_features].copy()
y = df_encoded['success_binary'].copy()

# Remove rows with missing target
mask = ~y.isnull()
X = X[mask]
y = y[mask]

# Handle any remaining missing values
X = X.fillna(0)

print(f"📊 Dataset shape: {X.shape}")
print(f"📊 Target distribution: {y.value_counts().to_dict()}")
print(f"📊 Success rate: {y.mean()*100:.1f}%")

# Split data (stratified so both parts keep the same class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics come from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"📊 Training set: {X_train.shape}")
print(f"📊 Test set: {X_test.shape}")

# Handle class imbalance with SMOTE (if needed)
if y.value_counts().min() / len(y) < 0.1:  # If minority class < 10%
    print("\n🔧 Applying SMOTE for class balancing...")
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    # Transform with the scaler that was already fitted on the real training
    # rows. Refitting it here would put the balanced training data on a
    # different scale than X_test_scaled, which was produced by the first fit.
    X_train_scaled_balanced = scaler.transform(X_train_balanced)

    print(f"📊 Balanced training set: {X_train_balanced.shape}")
    print(f"📊 Balanced target distribution: {pd.Series(y_train_balanced).value_counts().to_dict()}")
else:
    X_train_balanced = X_train
    y_train_balanced = y_train
    X_train_scaled_balanced = X_train_scaled

print("\n✅ Data preparation completed!")

In [None]:
# Train multiple models for attack success prediction.
# Each candidate is tuned with a grid search (5-fold stratified CV, ROC AUC),
# then evaluated once on the held-out test split. Results are collected in
# `classification_results`, keyed by model name.
print("=== MODEL TRAINING: ATTACK SUCCESS PREDICTION ===")

# Define models: estimator, hyper-parameter grid, and whether the estimator is
# fed the standardised features ('scaled': True) or the raw ones.
classification_models = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': {
            'C': [0.1, 1.0, 10.0],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear']
        },
        'scaled': True
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        },
        'scaled': False
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        },
        'scaled': False
    },
    'Neural Network': {
        'model': MLPClassifier(random_state=42, max_iter=500),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'alpha': [0.001, 0.01, 0.1],
            'learning_rate_init': [0.001, 0.01]
        },
        'scaled': True
    }
}

# Train models
classification_results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model_config in classification_models.items():
    print(f"\n🤖 Training {name}...")

    # Select appropriate data
    if model_config['scaled']:
        X_train_use = X_train_scaled_balanced
        X_test_use = X_test_scaled
    else:
        X_train_use = X_train_balanced
        X_test_use = X_test

    # Grid search
    grid_search = GridSearchCV(
        model_config['model'],
        model_config['params'],
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1
    )

    grid_search.fit(X_train_use, y_train_balanced)
    best_model = grid_search.best_estimator_

    # Cross-validation scores of the winning configuration. The grid search has
    # already scored it on these exact folds (`cv` has a fixed random_state), so
    # read the per-fold AUCs from cv_results_ instead of refitting five more
    # models with cross_val_score.
    cv_scores = np.array([
        grid_search.cv_results_[f'split{fold}_test_score'][grid_search.best_index_]
        for fold in range(grid_search.n_splits_)
    ])

    # Test predictions
    y_pred = best_model.predict(X_test_use)
    y_pred_proba = best_model.predict_proba(X_test_use)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    classification_results[name] = {
        'model': best_model,
        'best_params': grid_search.best_params_,
        'cv_auc_mean': cv_scores.mean(),
        'cv_auc_std': cv_scores.std(),
        'test_accuracy': accuracy,
        'test_auc': auc_score,
        'test_f1': f1,
        'predictions': y_pred,
        'predictions_proba': y_pred_proba
    }

    print(f"  ✅ Best parameters: {grid_search.best_params_}")
    print(f"  📊 CV AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"  📊 Test Accuracy: {accuracy:.4f}")
    print(f"  📊 Test AUC: {auc_score:.4f}")
    print(f"  📊 Test F1: {f1:.4f}")

print("\n✅ Classification model training completed!")

## 7. Machine Learning Pipeline - Task 2: Casualty Prediction

In [None]:
# Task 2: Predict casualties (regression)
# Builds the regression feature matrix / target from df_encoded, drops extreme
# incidents, splits 80/20 and standardises the features.
print("=== TASK 2: CASUALTY PREDICTION ===")

# Prepare data for regression
print("\n🔧 Preparing data for casualty prediction...")

# Same inputs as the classification task, minus the casualty columns (they
# make up the target here).
regression_features = [
    'iyear', 'imonth', 'is_suicide', 'has_property_damage', 'is_weekend',
    'country_txt_encoded', 'region_txt_encoded', 'attacktype1_txt_encoded',
    'targtype1_txt_encoded', 'weaptype1_txt_encoded'
]

available_reg_features = [
    feature for feature in regression_features if feature in df_encoded.columns
]

X_reg = df_encoded[available_reg_features].copy()
y_reg = df_encoded['total_casualties'].copy()

# Keep rows with a known target of at most 1000 casualties; anything above is
# treated as an outlier. Remaining feature gaps are filled with 0.
mask = y_reg.notnull() & (y_reg <= 1000)
X_reg = X_reg.loc[mask].fillna(0)
y_reg = y_reg.loc[mask]

zero_casualty_pct = (y_reg == 0).mean()*100
print(f"📊 Regression dataset shape: {X_reg.shape}")
print(f"📊 Casualty statistics:")
print(f"   Mean: {y_reg.mean():.2f}")
print(f"   Median: {y_reg.median():.1f}")
print(f"   Max: {y_reg.max():.0f}")
print(f"   % Zero casualties: {zero_casualty_pct:.1f}%")

# Split data
reg_split = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = reg_split

# Scale features: fit on the training rows only, then apply to both parts.
scaler_reg = StandardScaler().fit(X_reg_train)
X_reg_train_scaled = scaler_reg.transform(X_reg_train)
X_reg_test_scaled = scaler_reg.transform(X_reg_test)

print(f"📊 Regression training set: {X_reg_train.shape}")
print(f"📊 Regression test set: {X_reg_test.shape}")

print("\n✅ Regression data preparation completed!")

In [None]:
# Train regression models for casualty prediction.
# Models with a hyper-parameter grid are tuned by grid search (5-fold CV, R²);
# models without one are fitted directly. Every model is then evaluated once on
# the held-out test split and stored in `regression_results` by name.
print("=== MODEL TRAINING: CASUALTY PREDICTION ===")

# Define regression models: estimator, grid (empty = no tuning), and whether
# the estimator is fed the standardised features.
regression_models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {},
        'scaled': True
    },
    'Ridge Regression': {
        'model': Ridge(random_state=42),
        'params': {
            'alpha': [0.1, 1.0, 10.0, 100.0]
        },
        'scaled': True
    },
    'Random Forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        },
        'scaled': False
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        },
        'scaled': False
    }
}

# Train regression models
regression_results = {}
# KFold comes from sklearn.model_selection (see the notebook's import cell).
cv_reg = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model_config in regression_models.items():
    print(f"\n🤖 Training {name}...")

    # Select appropriate data
    if model_config['scaled']:
        X_train_use = X_reg_train_scaled
        X_test_use = X_reg_test_scaled
    else:
        X_train_use = X_reg_train
        X_test_use = X_reg_test

    # Grid search or direct training
    if model_config['params']:
        grid_search = GridSearchCV(
            model_config['model'],
            model_config['params'],
            cv=cv_reg,
            scoring='r2',
            n_jobs=-1
        )
        grid_search.fit(X_train_use, y_reg_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        # The grid search already scored the winning configuration on these
        # exact folds (`cv_reg` has a fixed random_state); reuse its per-fold R²
        # values rather than refitting five more models.
        cv_scores = np.array([
            grid_search.cv_results_[f'split{fold}_test_score'][grid_search.best_index_]
            for fold in range(grid_search.n_splits_)
        ])
    else:
        best_model = model_config['model']
        best_model.fit(X_train_use, y_reg_train)
        best_params = {}
        # No grid search ran for this model, so cross-validate it explicitly.
        cv_scores = cross_val_score(best_model, X_train_use, y_reg_train, cv=cv_reg, scoring='r2')

    # Test predictions
    y_reg_pred = best_model.predict(X_test_use)

    # Calculate metrics
    mae = mean_absolute_error(y_reg_test, y_reg_pred)
    mse = mean_squared_error(y_reg_test, y_reg_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_reg_test, y_reg_pred)

    regression_results[name] = {
        'model': best_model,
        'best_params': best_params,
        'cv_r2_mean': cv_scores.mean(),
        'cv_r2_std': cv_scores.std(),
        'test_mae': mae,
        'test_rmse': rmse,
        'test_r2': r2,
        'predictions': y_reg_pred
    }

    print(f"  ✅ Best parameters: {best_params}")
    print(f"  📊 CV R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"  📊 Test R²: {r2:.4f}")
    print(f"  📊 Test MAE: {mae:.4f}")
    print(f"  📊 Test RMSE: {rmse:.4f}")

print("\n✅ Regression model training completed!")

## 8. Model Evaluation and Comparison

In [None]:
# Comprehensive model evaluation and comparison: one ranked table per task,
# followed by the headline metrics of each task's best model.
print("=== MODEL EVALUATION AND COMPARISON ===")

# Classification models comparison (one row per trained classifier)
print("\n🎯 Classification Models (Attack Success Prediction):")
classification_comparison = pd.DataFrame(
    [
        {
            'Model': model_name,
            'CV AUC': metrics['cv_auc_mean'],
            'CV AUC Std': metrics['cv_auc_std'],
            'Test Accuracy': metrics['test_accuracy'],
            'Test AUC': metrics['test_auc'],
            'Test F1': metrics['test_f1'],
        }
        for model_name, metrics in classification_results.items()
    ],
    columns=['Model', 'CV AUC', 'CV AUC Std', 'Test Accuracy', 'Test AUC', 'Test F1'],
)

# Rank by test AUC, best first.
classification_comparison = classification_comparison.sort_values('Test AUC', ascending=False)
print(classification_comparison.round(4))

# Regression models comparison (one row per trained regressor)
print("\n🎯 Regression Models (Casualty Prediction):")
regression_comparison = pd.DataFrame(
    [
        {
            'Model': model_name,
            'CV R²': metrics['cv_r2_mean'],
            'CV R² Std': metrics['cv_r2_std'],
            'Test R²': metrics['test_r2'],
            'Test MAE': metrics['test_mae'],
            'Test RMSE': metrics['test_rmse'],
        }
        for model_name, metrics in regression_results.items()
    ],
    columns=['Model', 'CV R²', 'CV R² Std', 'Test R²', 'Test MAE', 'Test RMSE'],
)

# Rank by test R², best first.
regression_comparison = regression_comparison.sort_values('Test R²', ascending=False)
print(regression_comparison.round(4))

# Best models: the first row of each ranked table.
best_classifier_name = classification_comparison.iloc[0]['Model']
best_classifier_metrics = classification_results[best_classifier_name]
best_classifier = best_classifier_metrics['model']
best_regressor_name = regression_comparison.iloc[0]['Model']
best_regressor_metrics = regression_results[best_regressor_name]
best_regressor = best_regressor_metrics['model']

print(f"\n🏆 Best Classification Model: {best_classifier_name}")
print(f"   AUC: {best_classifier_metrics['test_auc']:.4f}")
print(f"   Accuracy: {best_classifier_metrics['test_accuracy']:.4f}")
print(f"   F1-Score: {best_classifier_metrics['test_f1']:.4f}")

print(f"\n🏆 Best Regression Model: {best_regressor_name}")
print(f"   R²: {best_regressor_metrics['test_r2']:.4f}")
print(f"   MAE: {best_regressor_metrics['test_mae']:.4f}")
print(f"   RMSE: {best_regressor_metrics['test_rmse']:.4f}")

In [None]:
# Create comprehensive visualization of model performance
# A 2x3 dashboard: each numbered section below fills one cell of `axes`.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))

# 1. Classification model comparison
# Test AUC and test accuracy are drawn as paired bars, one pair per model.
models = list(classification_results)
aucs = [classification_results[model_name]['test_auc'] for model_name in models]
accuracies = [classification_results[model_name]['test_accuracy'] for model_name in models]

x = np.arange(len(models))
width = 0.35
half_width = width / 2  # shift each series half a bar so the pair straddles the tick

ax_clf = axes[0, 0]
ax_clf.bar(x - half_width, aucs, width, label='AUC', alpha=0.8)
ax_clf.bar(x + half_width, accuracies, width, label='Accuracy', alpha=0.8)
ax_clf.set(xlabel='Models', ylabel='Score', title='Classification Model Performance')
ax_clf.set_xticks(x)
ax_clf.set_xticklabels(models, rotation=45, ha='right')
ax_clf.legend()
ax_clf.grid(True, alpha=0.3)

# 2. Regression model comparison
# One bar per regressor showing its test-set R².
reg_models = list(regression_results)
r2_scores = [regression_results[model_name]['test_r2'] for model_name in reg_models]

ax_reg = axes[0, 1]
ax_reg.bar(reg_models, r2_scores, color='green', alpha=0.7)
ax_reg.set(ylabel='R² Score', title='Regression Model Performance (R²)')
ax_reg.tick_params(axis='x', rotation=45)
ax_reg.grid(True, alpha=0.3)

# 3. ROC Curve for best classifier
# Pull the winner's stored scores once, then trace its ROC against y_test.
best_clf_results = classification_results[best_classifier_name]
best_predictions_proba = best_clf_results['predictions_proba']
auc_score = best_clf_results['test_auc']
fpr, tpr, _ = roc_curve(y_test, best_predictions_proba)

ax_roc = axes[0, 2]
ax_roc.plot(fpr, tpr, label=f'{best_classifier_name} (AUC = {auc_score:.3f})', linewidth=2)
# Diagonal = chance-level reference line.
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax_roc.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - Best Classifier',
)
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

# 4. Confusion Matrix for best classifier
# Annotated heatmap of integer counts (rows: true label, columns: predicted label).
best_predictions = classification_results[best_classifier_name]['predictions']
cm = confusion_matrix(y_test, best_predictions)

ax_cm = axes[1, 0]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set(
    title=f'Confusion Matrix - {best_classifier_name}',
    ylabel='True Label',
    xlabel='Predicted Label',
)

# 5. Regression predictions vs actual
# Scatter of the best regressor's predictions against the true values, with a
# dashed y = x line spanning the observed range as the "perfect prediction" guide.
best_reg_predictions = regression_results[best_regressor_name]['predictions']
actual_span = [y_reg_test.min(), y_reg_test.max()]

ax_scatter = axes[1, 1]
ax_scatter.scatter(y_reg_test, best_reg_predictions, alpha=0.5, s=1)
ax_scatter.plot(actual_span, actual_span, 'r--', lw=2)
ax_scatter.set(
    xlabel='Actual Casualties',
    ylabel='Predicted Casualties',
    title=f'Predictions vs Actual - {best_regressor_name}',
)
ax_scatter.grid(True, alpha=0.3)

# 6. Feature importance (if available)
# Only estimators exposing `feature_importances_` can be charted; any other
# model gets a placeholder message in the panel instead.
ax_fi = axes[1, 2]
if not hasattr(best_classifier, 'feature_importances_'):
    ax_fi.text(0.5, 0.5, 'Feature importance\nnot available\nfor this model',
               ha='center', va='center', transform=ax_fi.transAxes)
    ax_fi.set_title('Feature Importance')
else:
    importance_table = pd.DataFrame({
        'feature': available_features,
        'importance': best_classifier.feature_importances_
    })
    # Keep the ten largest importances, biggest first.
    feature_importance = importance_table.sort_values('importance', ascending=False).head(10)

    bar_positions = range(len(feature_importance))
    ax_fi.barh(bar_positions, feature_importance['importance'])
    ax_fi.set_yticks(bar_positions)
    ax_fi.set_yticklabels(feature_importance['feature'])
    ax_fi.set(
        xlabel='Feature Importance',
        title=f'Top 10 Feature Importances - {best_classifier_name}',
    )
    # barh draws position 0 at the bottom; flip so the top feature is on top.
    ax_fi.invert_yaxis()

# Finalize spacing and render the dashboard.
plt.tight_layout()
plt.show()

# Detailed classification report for best classifier
# Per-class precision/recall/F1 with readable class names for the two labels.
class_labels = ['Unsuccessful', 'Successful']
detailed_report = classification_report(y_test, best_predictions, target_names=class_labels)
print(f"\n📋 Detailed Classification Report - {best_classifier_name}:")
print(detailed_report)

## 9. Business Insights and Risk Assessment

In [None]:
# Comprehensive business insights and risk assessment
print("=== BUSINESS INSIGHTS AND RISK ASSESSMENT ===")

# 1. Predictive insights using best models
print("\n🎯 PREDICTIVE INSIGHTS:")

# Success prediction insights
# Count predictions whose stored score is at or above the high-risk cut-off.
# NOTE(review): presumably 'predictions_proba' is the positive-class probability — confirm.
high_risk_threshold = 0.8
success_proba = classification_results[best_classifier_name]['predictions_proba']
high_risk_attacks = (success_proba >= high_risk_threshold).sum()
high_risk_pct = high_risk_attacks / len(success_proba) * 100
print(f"   High-risk attacks (success probability ≥ {high_risk_threshold}): {high_risk_attacks} ({high_risk_pct:.1f}%)")

# Casualty prediction insights
# Count regression predictions of ten or more.
# NOTE(review): assumes predictions are on the raw casualty scale (not transformed) — confirm.
casualty_predictions = regression_results[best_regressor_name]['predictions']
high_casualty_predicted = (casualty_predictions >= 10).sum()
high_casualty_pct = high_casualty_predicted / len(casualty_predictions) * 100
print(f"   Predicted high-casualty events (≥10 casualties): {high_casualty_predicted} ({high_casualty_pct:.1f}%)")

# 2. Risk factors analysis
# Rank the input features by the best classifier's importances and list the top ten.
print("\n🚨 TOP RISK FACTORS:")

if hasattr(best_classifier, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': available_features,
        'importance': best_classifier.feature_importances_
    }).sort_values('importance', ascending=False)

    print("   Top 10 factors for attack success prediction:")
    for rank, (_, row) in enumerate(feature_importance.head(10).iterrows(), start=1):
        print(f"   {rank:2d}. {row['feature']:25s}: {row['importance']:.4f}")
else:
    # Estimators without `feature_importances_` would otherwise leave the
    # section header dangling with nothing under it; say so explicitly.
    print(f"   Feature importance not available for {best_classifier_name}")

# 3. Geographical risk assessment
print("\n🌍 GEOGRAPHICAL RISK ANALYSIS:")

# Calculate risk scores by country
# Per-country aggregates: incident count, casualty sum and mean, mean success flag.
country_risk = df_processed.groupby('country_txt').agg({
    'eventid': 'count',
    'total_casualties': ['sum', 'mean'],
    'success_binary': 'mean'
}).round(3)

# Flatten the two-level column index produced by the mixed aggregation; the
# names follow the order of the aggregation dict above.
country_risk.columns = ['incidents', 'total_casualties', 'avg_casualties', 'success_rate']
# Filter for significant data; take an explicit copy so the column assignment
# below writes to an independent frame rather than a slice of the aggregate.
country_risk = country_risk[country_risk['incidents'] >= 100].copy()

# Create composite risk score
# Weighted blend of percentile ranks: 30% incident volume, 40% average
# casualties, 30% success rate.
country_risk['risk_score'] = (
    country_risk['incidents'].rank(pct=True) * 0.3 +
    country_risk['avg_casualties'].rank(pct=True) * 0.4 +
    country_risk['success_rate'].rank(pct=True) * 0.3
)

top_risk_countries = country_risk.sort_values('risk_score', ascending=False).head(10)
print("   Top 10 highest-risk countries:")
# The row variable is deliberately not called `stats`: that name is bound to the
# scipy.stats module imported at the top of the notebook and must stay intact.
for country, country_row in top_risk_countries.iterrows():
    print(f"   {country:20s}: Risk={country_row['risk_score']:.3f}, Incidents={country_row['incidents']:4.0f}, Avg Casualties={country_row['avg_casualties']:5.2f}")

# 4. Temporal risk patterns
print("\n📅 TEMPORAL RISK PATTERNS:")

# Monthly risk analysis
# Per-month incident count, mean casualties and mean success flag.
monthly_risk = df_processed.groupby('imonth').agg({
    'eventid': 'count',
    'total_casualties': 'mean',
    'success_binary': 'mean'
}).round(3)

# GTD records an unknown month as 0. That small group can win an idxmax below,
# and calendar.month_name[0] is an empty string, so the report would print a
# blank month. Keep only real calendar months (a no-op if preprocessing has
# already dropped the unknowns).
valid_month_mask = (monthly_risk.index >= 1) & (monthly_risk.index <= 12)
monthly_risk = monthly_risk.loc[valid_month_mask]

highest_risk_month = monthly_risk['success_binary'].idxmax()
highest_casualty_month = monthly_risk['total_casualties'].idxmax()
most_active_month = monthly_risk['eventid'].idxmax()

# int(...) because the group key may be stored as a float and month_name needs an int index.
print(f"   Highest success rate: {calendar.month_name[int(highest_risk_month)]} ({monthly_risk.loc[highest_risk_month, 'success_binary']:.3f})")
print(f"   Highest avg casualties: {calendar.month_name[int(highest_casualty_month)]} ({monthly_risk.loc[highest_casualty_month, 'total_casualties']:.2f})")
print(f"   Most active month: {calendar.month_name[int(most_active_month)]} ({monthly_risk.loc[most_active_month, 'eventid']:,} incidents)")

# 5. Attack method risk analysis
print("\n💥 ATTACK METHOD RISK ANALYSIS:")

# Per-attack-type incident count, mean casualties and mean success flag.
method_risk = df_processed.groupby('attacktype1_txt').agg({
    'eventid': 'count',
    'total_casualties': 'mean',
    'success_binary': 'mean'
}).round(3)

method_risk = method_risk[method_risk['eventid'] >= 100]  # Filter for significant data
method_risk = method_risk.sort_values('total_casualties', ascending=False)

print("   Most lethal attack methods (avg casualties):")
# The row variable is deliberately not called `stats`: that name is bound to the
# scipy.stats module imported at the top of the notebook and must stay intact.
for method, method_row in method_risk.head(8).iterrows():
    print(f"   {method:25s}: {method_row['total_casualties']:5.2f} casualties, {method_row['success_binary']:.3f} success rate")

# 6. Recommendations
# Static, hand-written guidance: each entry pairs a section heading with its
# bullet lines, printed verbatim in order.
print("\n💡 STRATEGIC RECOMMENDATIONS:")

recommendation_sections = (
    ("   1. PREVENTION PRIORITIES:", (
        "      🔸 Focus on high-risk countries and regions",
        "      🔸 Enhanced monitoring during high-risk months",
        "      🔸 Targeted countermeasures for most lethal attack types",
        "      🔸 Early warning systems based on predictive models",
    )),
    ("   2. RESOURCE ALLOCATION:", (
        "      🔸 Prioritize security in countries with highest risk scores",
        "      🔸 Increase preparedness for high-casualty attack methods",
        "      🔸 Deploy predictive models for real-time threat assessment",
        "      🔸 Focus on preventing attacks with high success probability",
    )),
    ("   3. MONITORING AND INTELLIGENCE:", (
        "      🔸 Enhanced surveillance during peak risk periods",
        "      🔸 Pattern recognition for emerging threat vectors",
        "      🔸 International cooperation in high-risk regions",
        "      🔸 Continuous model updating with new incident data",
    )),
)

for section_heading, section_bullets in recommendation_sections:
    # One line per item: the heading followed by its bullets.
    print(section_heading, *section_bullets, sep="\n")

# 7. Model deployment insights
# Recap the headline test metrics of both winning models with a suggested use case.
deployed_clf_metrics = classification_results[best_classifier_name]
deployed_reg_metrics = regression_results[best_regressor_name]

print("\n🎯 MODEL DEPLOYMENT INSIGHTS:")
print(f"   Success Prediction Model ({best_classifier_name}):")
print(f"      - Accuracy: {deployed_clf_metrics['test_accuracy']:.1%}")
print(f"      - AUC Score: {deployed_clf_metrics['test_auc']:.3f}")
print("      - Use case: Early warning system for attack prevention")

print(f"   Casualty Prediction Model ({best_regressor_name}):")
print(f"      - R² Score: {deployed_reg_metrics['test_r2']:.3f}")
print(f"      - RMSE: {deployed_reg_metrics['test_rmse']:.2f} casualties")
print("      - Use case: Emergency response planning and resource allocation")

# Closing banner.
banner_rule = "=" * 80
print("\n" + banner_rule)
print("GLOBAL TERRORISM ANALYSIS COMPLETED SUCCESSFULLY!")
print("This analysis provides actionable insights for security and policy decision-making.")
print(banner_rule)