In [1]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session. This is a blanket filter:
# it hides library deprecation notices as well as cosmetic warnings.
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib 'seaborn-v0_8' style sheet, then make
# "husl" the default seaborn colour palette for the charts drawn below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

def connect_elasticsearch(host="http://localhost:9200"):
    """Create an Elasticsearch client and verify the node answers a ping.

    Args:
        host: URL of the Elasticsearch node to connect to. Defaults to
            ``http://localhost:9200``, so existing zero-argument calls keep
            their behaviour.

    Returns:
        The ``Elasticsearch`` client when ``ping()`` succeeds, otherwise
        ``None`` (the ping returned a falsy value, or building/pinging the
        client raised an exception).

    Failures are reported with ``print`` instead of being raised, so callers
    only have to check the result for ``None``.
    """
    try:
        es = Elasticsearch(host)
        if not es.ping():
            print("Failed to connect to Elasticsearch")
            return None
    except Exception as e:
        # Deliberately broad: any client/transport error is turned into a
        # printed message plus a None result rather than a traceback.
        print(f"Error connecting to Elasticsearch: {e}")
        return None

    print("Connected to Elasticsearch")
    return es

def enhanced_data_exploration(index="terrorism", scroll_size=1000):
    """Load every document of an Elasticsearch index and chart an overview.

    Connects through ``connect_elasticsearch()``, pages through ``index`` with
    the scroll API, loads each hit's ``_source`` into a DataFrame, prints
    summary statistics, and shows a 2x2 overview figure followed by a
    per-month bar chart.

    Args:
        index: Name of the index to read. Defaults to ``"terrorism"``.
        scroll_size: Number of documents requested per scroll page.

    Returns:
        None. Everything is printed or plotted. When there is no connection,
        no data, a required column is missing, or any exception is raised,
        a message is printed and the function returns early.
    """
    es = connect_elasticsearch()
    if es is None:
        return

    try:
        df = pd.DataFrame(_fetch_all_documents(es, index, scroll_size))

        if df.empty:
            print("No data found in Elasticsearch")
            return

        # Fail with a readable message instead of a bare KeyError further down.
        required_columns = ['iyear', 'imonth', 'country_txt', 'region_txt',
                            'attacktype1_txt', 'nkill', 'nwound']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Missing expected columns: {missing_columns}")
            return

        print(f"Loaded {len(df)} records")

        # Basic statistics
        print("\nDataset Overview:")
        print(f"Date range: {df['iyear'].min()} - {df['iyear'].max()}")
        print(f"Countries: {df['country_txt'].nunique()}")
        print(f"Regions: {df['region_txt'].nunique()}")
        print(f"Attack types: {df['attacktype1_txt'].nunique()}")

        # Missing kill/wound counts are treated as zero when summing casualties.
        df['casualties'] = df['nkill'].fillna(0) + df['nwound'].fillna(0)

        _plot_overview(df)

        # Additional analysis
        print("\nTop 5 Deadliest Attack Types:")
        deadliest_types = (
            df.groupby('attacktype1_txt')['casualties']
            .sum()
            .sort_values(ascending=False)
            .head()
        )
        print(deadliest_types)

        _plot_monthly_attacks(df)

    except Exception as e:
        # Best-effort notebook helper: report the problem instead of raising.
        print(f"Error during data exploration: {e}")


def _fetch_all_documents(es, index, scroll_size):
    """Return the ``_source`` of every document in ``index`` using the scroll API."""
    documents = []
    scroll_id = None
    try:
        res = es.search(
            index=index,
            body={
                "query": {"match_all": {}},
                "size": scroll_size
            },
            scroll='2m'
        )
        while True:
            # Always keep the id from the latest response; it is the one that
            # has to be passed to the next scroll call and released at the end.
            scroll_id = res['_scroll_id']
            hits = res['hits']['hits']
            if not hits:
                break
            documents.extend(hit["_source"] for hit in hits)
            res = es.scroll(scroll_id=scroll_id, scroll='2m')
    finally:
        # Release the server-side scroll context instead of leaving it open
        # until its 2m keep-alive expires (also runs when paging fails midway).
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as e:
                # Cleanup must not mask the original error, if there was one.
                print(f"Warning: could not clear scroll context: {e}")
    return documents


def _plot_overview(df):
    """Draw the 2x2 figure: yearly trend, top countries, attack types, casualties by region."""
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Attacks per year
    attacks_by_year = df.groupby("iyear").size()
    attacks_by_year.plot(kind="line", ax=axes[0, 0], marker='o')
    axes[0, 0].set_title("Terror Attacks Over Time")
    axes[0, 0].set_xlabel("Year")
    axes[0, 0].set_ylabel("Number of Attacks")
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Top 10 countries by attacks
    top_countries = df['country_txt'].value_counts().head(10)
    top_countries.plot(kind="barh", ax=axes[0, 1])
    axes[0, 1].set_title("Top 10 Countries by Attack Count")
    axes[0, 1].set_xlabel("Number of Attacks")

    # 3. Attack types distribution (eight most frequent types only)
    attack_types = df['attacktype1_txt'].value_counts().head(8)
    attack_types.plot(kind="pie", ax=axes[1, 0], autopct='%1.1f%%')
    axes[1, 0].set_title("Attack Types Distribution")
    axes[1, 0].set_ylabel("")

    # 4. Casualties by region
    casualties_by_region = (
        df.groupby('region_txt')['casualties'].sum().sort_values(ascending=False)
    )
    casualties_by_region.plot(kind="bar", ax=axes[1, 1])
    axes[1, 1].set_title("Total Casualties by Region")
    axes[1, 1].set_xlabel("Region")
    axes[1, 1].set_ylabel("Total Casualties")
    axes[1, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()


def _plot_monthly_attacks(df):
    """Draw a bar chart of attack counts per ``imonth`` value."""
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    def month_label(value):
        # Values 1-12 get a month abbreviation; anything else (missing months
        # shift nothing, unexpected values stay visible) keeps its raw value.
        try:
            month = int(value)
        except (TypeError, ValueError):
            return str(value)
        if 1 <= month <= 12:
            return month_names[month - 1]
        return str(value)

    monthly_attacks = df.groupby('imonth').size()
    # Label each bar from the value it actually represents, so the labels stay
    # aligned even when the data has fewer or more than twelve distinct values.
    labels = [month_label(value) for value in monthly_attacks.index]

    fig, ax = plt.subplots(figsize=(10, 6))
    monthly_attacks.plot(kind='bar', ax=ax)
    ax.set_title("Terror Attacks by Month")
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of Attacks")
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels)
    ax.grid(True, alpha=0.3)
    plt.show()

In [2]:
# Run the exploration. It needs a reachable Elasticsearch node that holds a
# "terrorism" index; otherwise the function prints an error and returns.
enhanced_data_exploration()

Connected to Elasticsearch
Error during data exploration: NotFoundError(404, 'index_not_found_exception', 'no such index [terrorism]', terrorism, index_or_alias)
