In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Get the Data

In [None]:
# load the data - update path based on what's shown above
# The CSV location is kept in one named constant so it is easy to spot and edit.
DATA_PATH = '/kaggle/input/worlds-biggest-data-breaches-and-hacks-2004-2025/worlds_biggest_breaches_cleaned.csv'

df = pd.read_csv(DATA_PATH)
print(f"Shape: {df.shape}")

# Last expression of the cell: preview of the first rows.
df.head()

In [None]:
# Basic stats
# Headline numbers: row count, min/max of the 'year' column, and the sum of
# the 'records lost' column.
n_breaches = len(df)
first_year, last_year = df['year'].min(), df['year'].max()
total_exposed = df['records lost'].sum()

print(f"Breaches: {n_breaches}")
print(f"Date range: {first_year} - {last_year}")
print(f"Total records exposed: {total_exposed:,.0f}")

In [None]:
# ANIMATED LINE RACE
# Animated line chart of the running total of 'records lost' per year for the
# ten organisations with the largest overall totals.
import plotly.express as px

# Get top 10 organizations by total records (nlargest returns them largest first)
top_orgs = df.groupby('organisation')['records lost'].sum().nlargest(10).index.tolist()
df_top = df[df['organisation'].isin(top_orgs)].copy()

# Every year present in the full dataset, ascending. Missing years are dropped
# first so that the sort order is well-defined.
years = sorted(df['year'].dropna().unique())

# Build cumulative totals
# One row per year, one column per organisation. Reindexing guarantees a cell
# for every (year, organisation) pair, with 0 where an organisation had no
# breach that year, and puts the columns in top_orgs order.
yearly_totals = (
    df_top.groupby(['year', 'organisation'])['records lost'].sum()
    .unstack('organisation', fill_value=0)
    .reindex(index=years, columns=top_orgs, fill_value=0)
    .rename_axis(index='year', columns='organisation')
)

# Running total down the years, then back to long form. DataFrame.unstack()
# yields rows organisation by organisation with years ascending inside each,
# and the resulting columns are: organisation, year, cumulative.
df_line = (
    yearly_totals.cumsum()
    .unstack()
    .rename('cumulative')
    .reset_index()
)

# Create animation frames - each frame shows data UP TO that year
df_anim = pd.concat(
    [df_line[df_line['year'] <= year].assign(frame=str(year)) for year in years],
    ignore_index=True,
)

fig = px.line(df_anim, x='year', y='cumulative', color='organisation',
    animation_frame='frame', markers=True,
    title='<b>Cumulative Records Exposed: Top 10 Breached Organizations</b>',
    labels={'cumulative': 'Cumulative Records Exposed', 'year': 'Year'},
    # Fixed axis ranges so the axes do not rescale between frames; the x range
    # is one year of padding either side of the years actually in the data.
    range_x=[years[0] - 1, years[-1] + 1],
    range_y=[0, df_line['cumulative'].max() * 1.1])

fig.update_layout(
    height=600,
    width=1000,
    yaxis=dict(tickformat=',.0s'),
    legend=dict(title='Organization')
)

# Adjust the frame and transition durations on the first button of the first
# update menu (presumably the Play button that Plotly Express adds for
# animations; values presumably in milliseconds - confirm against plotly docs).
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 500
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 300
fig.show()

In [None]:
# 1. BUBBLE CHART - FIXED
# Scatter of records lost (log y-axis) against the numeric data-sensitivity
# score, with marker size taken from records lost and colour from the
# sensitivity label.
import plotly.express as px

# Numeric sensitivity score -> display label.
sensitivity_map = {
    1.0: 'Email/Online Info',
    2.0: 'Personal Details & SSN',
    3.0: 'Credit Cards & Banking',
    4.0: 'Health & Personal Records',
    5.0: 'Full Details',
}
df['sensitivity_label'] = df['data sensitivity'].map(sensitivity_map)

# Display label -> marker colour.
color_map = {
    'Email/Online Info': '#4A90D9',
    'Personal Details & SSN': '#2E5A8B',
    'Credit Cards & Banking': '#8B9EA8',
    'Health & Personal Records': '#D4726A',
    'Full Details': '#C94A4A',
}

# Scores that are not keys of sensitivity_map come out of .map() as NaN;
# keep only the rows that received a label.
has_label = df['sensitivity_label'].notna()
df_plot = df.loc[has_label].copy()

fig = px.scatter(
    df_plot,
    x='data sensitivity',
    y='records lost',
    size='records lost',
    color='sensitivity_label',
    hover_name='organisation',
    hover_data=['year', 'sector', 'method'],
    color_discrete_map=color_map,
    size_max=80,
    log_y=True,
    title='Data Breaches by Data Sensitivity',
)

fig.update_layout(height=600, width=1000)
fig.show()