# Cybercrime Trends in Canada (2014-2024)

**Data Source:** Statistics Canada, Table 35-10-0001-01  
**Analysis:** Trends in cybercrime violations across Canada

This notebook analyzes cybercrime data from Statistics Canada, visualizing trends over the past decade and identifying the most common violation types.

## Setup and Dependencies

In [None]:
# Install required packages
!pip install -q openpyxl

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import zipfile
from io import BytesIO
from pathlib import Path
from IPython.display import display, HTML

# pandas display: show every column and do not cap the output width
pd.options.display.max_columns = None
pd.options.display.width = None

# Folder that receives the generated figure and CSV (created on first run)
output_dir = Path('outputs')
output_dir.mkdir(exist_ok=True)

print("Libraries loaded successfully")

## Fetch Cybercrime Data from Statistics Canada

**Dataset:** Table 35-10-0001-01 - Police-reported cybercrime, by cyber-related violation  
**URL:** https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3510000101

In [None]:
# Statistics Canada table ID for cybercrime data
TABLE_ID = "35100001"

# Construct download URL
download_url = f"https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid={TABLE_ID}&latestN=0&startDate=&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B%5D%2C%5B%5D%5D"

print("Downloading cybercrime data from Statistics Canada...")
print(f"Table ID: {TABLE_ID}")

try:
    # Download the data; fail fast on HTTP errors instead of parsing an error page
    response = requests.get(download_url, timeout=60)
    response.raise_for_status()
    payload = BytesIO(response.content)

    if zipfile.is_zipfile(payload):
        # ZIP response: locate the data CSV inside the archive
        payload.seek(0)
        with zipfile.ZipFile(payload) as zip_file:
            csv_files = [f for f in zip_file.namelist() if f.lower().endswith('.csv')]

            if not csv_files:
                raise ValueError("No CSV file found in downloaded ZIP")

            # NOTE(review): StatCan archives appear to bundle a "*_MetaData.csv"
            # next to the data file - confirm. Prefer the non-metadata CSV so the
            # result does not depend on the order of entries in the archive.
            data_files = [f for f in csv_files if 'metadata' not in f.lower()]
            csv_filename = (data_files or csv_files)[0]

            with zip_file.open(csv_filename) as csv_file:
                df_raw = pd.read_csv(csv_file)
    else:
        # Not a ZIP: treat the response body as the CSV itself rather than
        # failing with an opaque BadZipFile error
        payload.seek(0)
        csv_filename = None
        df_raw = pd.read_csv(payload)

    # The rest of the notebook relies on these columns; report their absence here
    missing_cols = {'REF_DATE', 'VALUE'} - set(df_raw.columns)
    if missing_cols:
        raise ValueError(f"Downloaded data is missing expected columns: {sorted(missing_cols)}")

    print(f"Data loaded: {len(df_raw):,} rows, {len(df_raw.columns)} columns")
    print(f"Time period: {df_raw['REF_DATE'].min()} to {df_raw['REF_DATE'].max()}")

except Exception as e:
    # Show a short message in the cell output, then re-raise so a failed
    # download stops "Run All" instead of leaving df_raw undefined
    print(f"Error: {e}")
    raise

In [None]:
# Structural overview: list the column names, then preview the first rows
print("Columns:", list(df_raw.columns))
print("\nFirst few rows:")
df_raw.head()

## Process and Clean Data

In [None]:
# Create a clean copy of the data so df_raw stays untouched and this cell can be re-run
df = df_raw.copy()

# Convert REF_DATE to integer year
df['Year'] = df['REF_DATE'].astype(int)

# Filter for 2014-2024
df = df[df['Year'].between(2014, 2024)]

# Remove rows with missing values
df = df[df['VALUE'].notna()]

# Filter for Canada-level data only (exclude provinces)
if 'GEO' in df.columns:
    df = df[df['GEO'] == 'Canada']

# Identify the violation column: first column whose name mentions
# "violation" or "offence" (case-insensitive)
violation_col = next(
    (col for col in df.columns
     if 'violation' in col.lower() or 'offence' in col.lower()),
    None,
)

if violation_col is None:
    # Fallback: first text column, other than the listed bookkeeping columns,
    # with more than 5 and fewer than 500 distinct values
    for col in df.columns:
        if df[col].dtype == 'object' and col not in ['REF_DATE', 'GEO', 'DGUID', 'UOM', 'STATUS', 'SYMBOL', 'SCALAR_FACTOR']:
            unique_vals = df[col].nunique()
            if unique_vals > 5 and unique_vals < 500:
                violation_col = col
                break

if violation_col is None:
    # Without this guard the next lines would fail with an obscure KeyError on df[None]
    raise ValueError(
        "Could not identify the violation column; available columns: "
        f"{df.columns.tolist()}"
    )

print(f"Violation column: {violation_col}")
print(f"\nUnique violation types: {df[violation_col].nunique()}")
print(f"Years: {df['Year'].min()} to {df['Year'].max()}")
print(f"Total rows: {len(df):,}")

In [None]:
# Print up to the first 15 distinct violation labels, numbered from 1
print("Sample violation types:")
sample_labels = df[violation_col].unique()[:15]
for position, label in enumerate(sample_labels, start=1):
    print(f"  {position}. {label}")

In [None]:
# Split aggregate rows from individual violation rows.
# A label counts as aggregate when it contains "Total" or "All cyber"
# (case-insensitive); rows with a missing label are treated as non-aggregate.
total_mask = df[violation_col].str.contains('Total|All cyber', case=False, na=False)

df_total = df.loc[total_mask].copy()
df_violations = df.loc[~total_mask].copy()

n_specific_types = df_violations[violation_col].nunique()
print(f"Total cybercrime rows: {len(df_total)}")
print(f"Specific violation rows: {len(df_violations)}")
print(f"\nSpecific violation types: {n_specific_types}")
print(f"\nSpecific violation types: {df_violations[violation_col].nunique()}")

## Line Graph: Cybercrime Trends (2014-2024)

Visualizing total cybercrime and the three most common cybercrime violation types over the past decade.

In [None]:
# Yearly totals: use the dataset's own aggregate rows when there are any,
# otherwise sum the individual violations
totals_source = df_total if len(df_total) > 0 else df_violations

yearly_totals = (
    totals_source
    .groupby('Year')['VALUE']
    .sum()
    .rename('Total')
    .reset_index()
)

print("Yearly Total Cybercrime:")
print(yearly_totals.to_string(index=False))

In [None]:
# Rank violation types by their incident sum over all retained years, largest first
violation_totals = (
    df_violations
    .groupby(violation_col)['VALUE']
    .sum()
    .sort_values(ascending=False)
)
top_3_violations = violation_totals.iloc[:3]

print("Top 3 Most Common Cybercrime Violations (2014-2024):")
for position, (name, incident_sum) in enumerate(top_3_violations.items(), start=1):
    print(f"  {position}. {name}: {incident_sum:,.0f} total incidents")

In [None]:
# Keep only the rows that belong to the three leading violation types
top_3_names = list(top_3_violations.index)
is_top3 = df_violations[violation_col].isin(top_3_names)
df_top3 = df_violations[is_top3]

# Wide layout for plotting: one row per year, one column per violation;
# year/violation pairs with no data become 0
df_pivot = pd.pivot_table(
    df_top3,
    index='Year',
    columns=violation_col,
    values='VALUE',
    aggfunc='sum',
).fillna(0)

print("Yearly breakdown of top 3 violations:")
df_pivot

In [None]:
# Line graph: overall total plus one line per top-3 violation type
fig, ax = plt.subplots(figsize=(14, 8))

# Overall total: thick solid line
total_line_style = dict(
    marker='o', linewidth=3, markersize=8,
    label='Total Cybercrime', color='#1f77b4', linestyle='-',
)
ax.plot(yearly_totals['Year'], yearly_totals['Total'], **total_line_style)

# One colour / dash pattern per top-3 violation, paired by position
colors = ['#ff7f0e', '#2ca02c', '#d62728']
linestyles = ['--', '-.', ':']

for violation, line_color, line_style in zip(top_3_names, colors, linestyles):
    if violation not in df_pivot.columns:
        continue
    # Legend entries longer than 50 characters are cut to 47 plus an ellipsis
    label = violation[:47] + '...' if len(violation) > 50 else violation
    ax.plot(df_pivot.index, df_pivot[violation],
            marker='s', linewidth=2.5, markersize=6,
            label=label, color=line_color, linestyle=line_style)

# Title and axis labels
ax.set_title('Cybercrime Trends in Canada (2014-2024)\nTotal Cybercrime and Top 3 Violation Types',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Year', fontsize=12, fontweight='bold')
ax.set_ylabel('Number of Incidents', fontsize=12, fontweight='bold')

# One tick per year, light dashed grid, legend in the upper-left corner
ax.set_xticks(range(2014, 2025))
ax.grid(True, alpha=0.3, linestyle='--')
ax.legend(fontsize=9, loc='upper left', framealpha=0.9)


def _with_thousands_separator(value, _position):
    """Render a y-axis tick value as an integer with comma separators."""
    return f'{int(value):,}'


ax.yaxis.set_major_formatter(plt.FuncFormatter(_with_thousands_separator))

# Background colours
ax.set_facecolor('#fafafa')
fig.patch.set_facecolor('white')

plt.tight_layout()

# Write the PNG before showing the figure
output_path = output_dir / 'cybercrime_trends_2014_2024.png'
plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
print(f"Graph saved to: {output_path}")

plt.show()

## Table: Top 20 Cybercrimes in 2024 with Growth Metrics

This section ranks the most common cybercrime violations in 2024 and reports how each has grown over the past 5 years (2019-2024) and 10 years (2014-2024).

In [None]:
# Rows for the three comparison years: latest, 5 years back, 10 years back
df_2024, df_2019, df_2014 = (
    df_violations[df_violations['Year'] == year] for year in (2024, 2019, 2014)
)

# Incident sums per violation type within each of those years
violations_2024, violations_2019, violations_2014 = (
    frame.groupby(violation_col)['VALUE'].sum()
    for frame in (df_2024, df_2019, df_2014)
)

# The 20 violation types with the highest 2024 counts
top_20 = violations_2024.nlargest(20)

print(f"Top 20 cybercrimes in 2024: {len(top_20)} violations")

In [None]:
# Build the summary table: one record per top-20 violation, ranked from 1

def _percent_growth(current, baseline):
    """Percent change from baseline to current, or None when baseline is not positive."""
    if baseline > 0:
        return ((current - baseline) / baseline) * 100
    return None  # No positive baseline (e.g. new violation type)


def _summary_row(rank, violation, count_2024):
    """Assemble one summary record; a year with no entry for the violation counts as 0."""
    count_2019 = violations_2019.get(violation, 0)
    count_2014 = violations_2014.get(violation, 0)
    return {
        'Rank': rank,
        'Violation Type': violation,
        '2024 Incidents': int(count_2024),
        '2019 Incidents': int(count_2019) if count_2019 > 0 else 0,
        '2014 Incidents': int(count_2014) if count_2014 > 0 else 0,
        'Growth 2019-2024 (%)': _percent_growth(count_2024, count_2019),
        'Growth 2014-2024 (%)': _percent_growth(count_2024, count_2014),
    }


summary_data = [
    _summary_row(rank, violation, count_2024)
    for rank, (violation, count_2024) in enumerate(top_20.items(), 1)
]

# Create DataFrame
df_summary = pd.DataFrame(summary_data)

print("Summary table created successfully")

In [None]:
# Presentation copy of the summary; df_summary keeps its numeric values
df_display = df_summary.copy()

# Render the 2024 counts with thousands separators
df_display['2024 Incidents'] = df_display['2024 Incidents'].map('{:,}'.format)

# Format growth percentages
def format_growth(x):
    """Render a growth percentage as display text.

    Args:
        x: Growth in percent, or a missing value (None / NaN).

    Returns:
        'N/A' for a missing value; a '+'-prefixed whole number with thousands
        separators for values of 1000 or more; otherwise a signed value with
        one decimal place.
    """
    # pandas stores a None placed in a numeric column as NaN, so an
    # `is None` check alone would let missing values through as '+nan%'.
    if x is None or pd.isna(x):
        return 'N/A'
    if x >= 1000:
        return f'+{x:,.0f}%'
    return f'{x:+.1f}%'

# Turn both growth columns into display text
for growth_col in ('Growth 2019-2024 (%)', 'Growth 2014-2024 (%)'):
    df_display[growth_col] = df_display[growth_col].apply(format_growth)

# Select columns for display
display_cols = ['Rank', 'Violation Type', '2024 Incidents', 'Growth 2019-2024 (%)', 'Growth 2014-2024 (%)']
df_display = df_display[display_cols]

# Plain-text rendering framed by 100-character rules
rule = "="*100
print(rule)
print("TOP 20 CYBERCRIMES IN CANADA (2024) WITH GROWTH METRICS")
print(rule)
print(df_display.to_string(index=False))
print(rule)

In [None]:
# Display as styled HTML table

# Properties applied to every data cell
cell_properties = {
    'text-align': 'left',
    'font-size': '11px',
    'padding': '8px',
}

# Table-level CSS rules: header look, cell borders, shading on even rows
header_rule = {'selector': 'th', 'props': [
    ('background-color', '#2E86AB'),
    ('color', 'white'),
    ('font-weight', 'bold'),
    ('text-align', 'center'),
    ('padding', '10px'),
    ('font-size', '11px'),
]}
border_rule = {'selector': 'td', 'props': [('border', '1px solid #ddd')]}
even_row_rule = {'selector': 'tr:nth-of-type(even)', 'props': [('background-color', '#f8f9fa')]}

styled_table = (
    df_display.style
    .set_properties(**cell_properties)
    .set_table_styles([header_rule, border_rule, even_row_rule])
    .set_caption('Top 20 Cybercrimes in Canada (2024) with Growth Metrics')
)

display(styled_table)

In [None]:
# Save the numeric (unformatted) summary table to CSV
csv_path = output_dir / 'top_20_cybercrimes_2024.csv'
df_summary.to_csv(csv_path, index=False)
print(f"Table saved to: {csv_path}")

# Summary statistics
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)

total_2024 = df_summary['2024 Incidents'].sum()
print(f"Total incidents in top 20 (2024): {total_2024:,}")

# Average growth rates, excluding missing values.
# dropna() removes both None and NaN: pandas stores a None in a numeric
# column as NaN, which an `is not None` filter would keep and which would
# turn the mean and median into nan.
valid_5yr = df_summary['Growth 2019-2024 (%)'].dropna().tolist()
valid_10yr = df_summary['Growth 2014-2024 (%)'].dropna().tolist()

if valid_5yr:
    print(f"Average 5-year growth (2019-2024): {np.mean(valid_5yr):+.1f}%")
    print(f"Median 5-year growth (2019-2024): {np.median(valid_5yr):+.1f}%")

if valid_10yr:
    print(f"Average 10-year growth (2014-2024): {np.mean(valid_10yr):+.1f}%")
    print(f"Median 10-year growth (2014-2024): {np.median(valid_10yr):+.1f}%")

print("="*60)

## Analysis Complete

**Outputs Generated:**
- Line graph: `outputs/cybercrime_trends_2014_2024.png`
- Data table: `outputs/top_20_cybercrimes_2024.csv`

**Data Source:**  
Statistics Canada, Table 35-10-0001-01: Police-reported cybercrime, by cyber-related violation  
https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3510000101