In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load your seizure data
df = pd.read_csv('data/seizures_cocaine_filtered.csv')

# Define your 12 focus countries
focus_countries = [
    # Producer Countries
    'Peru', 'Colombia', 'Bolivia',
    # Transit Countries  
    'Spain', 'Netherlands', 'France', 'Belgium',
    # Consumer Countries
    'USA', 'Sweden', 'Portugal', 'Greece', 'Finland'
]

print("Data shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Sample countries in data:", df['Country/Territory of Seizure'].unique()[:10])

  df = pd.read_csv('data/seizures_cocaine_filtered.csv')


Data shape: (699252, 17)
Columns: ['Seizure Date', 'Country/Territory of Seizure', 'ISO3', 'Region', 'Subregion', 'City', 'Administrative Region', 'Drug/Substance', 'Measurement Unit', 'Quantity Seized', 'Physical Seizure Location', 'Trafficking Mode of Transportation', 'Source', 'Reporting Channel', 'Source_File', 'Sheet_Name', 'Subrgion']
Sample countries in data: ['Greece' 'Portugal' 'Bolivia (Plurinational State of)' 'Peru' 'Colombia'
 'Belgium' 'Brazil' 'Spain' 'Sweden' 'France']


In [2]:
# Let's see all unique countries in your data vs your focus countries
print("=== COUNTRIES IN YOUR DATA ===")
unique_countries = df['Country/Territory of Seizure'].unique()
print(f"Total unique countries in data: {len(unique_countries)}")
print("Countries:", sorted(unique_countries))

print("\n=== FOCUS COUNTRIES MATCHING ===")
focus_countries = [
    'Peru', 'Colombia', 'Bolivia',
    'Spain', 'Netherlands', 'France', 'Belgium', 
    'USA', 'Sweden', 'Portugal', 'Greece', 'Finland'
]

# Check which focus countries are found in data
found_countries = []
missing_countries = []

for country in focus_countries:
    if country in unique_countries:
        found_countries.append(country)
    else:
        # Check for partial matches (like Bolivia vs Bolivia (Plurinational State of))
        matches = [c for c in unique_countries if country.lower() in c.lower() or c.lower() in country.lower()]
        if matches:
            print(f"✓ {country} found as: {matches}")
            found_countries.extend(matches)
        else:
            missing_countries.append(country)
            print(f"✗ {country} NOT FOUND")

print(f"\nFound: {len(found_countries)} countries")
print(f"Missing: {len(missing_countries)} countries")

=== COUNTRIES IN YOUR DATA ===
Total unique countries in data: 11
Countries: ['Belgium', 'Bolivia (Plurinational State of)', 'Brazil', 'Colombia', 'France', 'Greece', 'Netherlands', 'Peru', 'Portugal', 'Spain', 'Sweden']

=== FOCUS COUNTRIES MATCHING ===
✓ Bolivia found as: ['Bolivia (Plurinational State of)']
✗ USA NOT FOUND
✗ Finland NOT FOUND

Found: 10 countries
Missing: 2 countries


In [3]:
# Check date range and data quality
print("\n=== DATA RANGE & QUALITY ===")
df['Seizure Date'] = pd.to_datetime(df['Seizure Date'], errors='coerce')
print("Date range:", df['Seizure Date'].min(), "to", df['Seizure Date'].max())
print("Total records:", len(df))

# Check quantity and unit columns
print("\nQuantity Seized info:")
print("Data type:", df['Quantity Seized'].dtype)
print("Sample values:", df['Quantity Seized'].head(10).tolist())
print("Any null values?", df['Quantity Seized'].isnull().sum())

print("\nMeasurement units:")
print(df['Measurement Unit'].value_counts().head(10))

# Check data for focus countries only
focus_countries_in_data = [
    'Peru', 'Colombia', 'Bolivia (Plurinational State of)', 'Spain', 
    'Netherlands', 'France', 'Belgium', 'Sweden', 'Portugal', 'Greece'
]

country_counts = df[df['Country/Territory of Seizure'].isin(focus_countries_in_data)]['Country/Territory of Seizure'].value_counts()
print(f"\nSeizure records per focus country:")
print(country_counts)



=== DATA RANGE & QUALITY ===
Date range: 2011-01-01 00:00:00 to 2024-09-30 00:00:00
Total records: 699252

Quantity Seized info:
Data type: float64
Sample values: [0.15486, 0.19863, 2.895, 5.18, 4.916, 6.051, 6.429, 5.673, 5.673, 253.75]
Any null values? 0

Measurement units:
Measurement Unit
kg    699252
Name: count, dtype: int64

Seizure records per focus country:
Country/Territory of Seizure
Colombia                            648194
Bolivia (Plurinational State of)     21747
Spain                                13402
Sweden                                7006
Portugal                              3424
Peru                                  3310
Belgium                               1071
Greece                                 618
France                                 203
Netherlands                              2
Name: count, dtype: int64


In [4]:
from datetime import datetime
import pandas as pd

# Define our focus countries (with correct names)
focus_countries_in_data = [
    'Peru', 'Colombia', 'Bolivia (Plurinational State of)', 'Spain', 
    'Netherlands', 'France', 'Belgium', 'Sweden', 'Portugal', 'Greece'
]

# Define 4-year period (2020-2024)
start_date = datetime(2020, 1, 1)
end_date = datetime(2024, 12, 31)

# Filter data for focus countries and time period
seizure_filtered = df[
    (df['Country/Territory of Seizure'].isin(focus_countries_in_data)) &
    (df['Seizure Date'] >= start_date) & 
    (df['Seizure Date'] <= end_date)
].copy()

print(f"Filtered data: {len(seizure_filtered):,} seizures")
print(f"Date range: {seizure_filtered['Seizure Date'].min()} to {seizure_filtered['Seizure Date'].max()}")

# Add year column for aggregation
seizure_filtered['Year'] = seizure_filtered['Seizure Date'].dt.year

# Check data by country and year
country_year_summary = seizure_filtered.groupby(['Country/Territory of Seizure', 'Year']).agg({
    'Quantity Seized': ['count', 'sum']
}).round(2)

print("\nSeizures and quantities by country-year (2020-2024):")
print(country_year_summary)

# Also show total by country
country_totals = seizure_filtered.groupby('Country/Territory of Seizure').agg({
    'Quantity Seized': ['count', 'sum']
}).round(2)

print("\nTotal seizures by country (2020-2024):")
print(country_totals)

Filtered data: 138,804 seizures
Date range: 2020-01-01 00:00:00 to 2024-09-30 00:00:00

Seizures and quantities by country-year (2020-2024):
                                      Quantity Seized            
                                                count         sum
Country/Territory of Seizure     Year                            
Belgium                          2020             161    69793.39
                                 2021             178    93801.07
                                 2022             103    33919.91
                                 2023             136   107585.80
                                 2024              82    22193.39
Bolivia (Plurinational State of) 2020             876    15661.25
                                 2021            1131    19721.66
                                 2022            1274    20331.06
                                 2023            1597    32928.38
                                 2024            2037    37734.73
C

In [5]:
# Population data (2022 estimates in millions) - World Bank data
population_data = {
    'Colombia': 51.52,
    'Spain': 47.78,
    'Bolivia (Plurinational State of)': 12.08,
    'Belgium': 11.59,
    'Peru': 33.72,
    'Portugal': 10.33,
    'Greece': 10.64,
    'Sweden': 10.49,
    'Netherlands': 17.53,  # In case we had data
    'France': 68.04       # In case we had data
}

# Calculate 4-year average seizure rates (kg per 100k population)
seizure_rates = []

print("=== COCAINE SEIZURE RATES (2020-2024 Average) ===")
print("Formula: (Total kg seized ÷ 4 years ÷ Population) × 100,000")
print()

for country in focus_countries_in_data:
    if country in population_data:
        # Get total seizures for this country
        country_data = seizure_filtered[seizure_filtered['Country/Territory of Seizure'] == country]
        
        if len(country_data) > 0:
            total_kg = country_data['Quantity Seized'].sum()
            population_millions = population_data[country]
            population_total = population_millions * 1_000_000  # Convert to actual population
            
            # Calculate 4-year average annual rate per 100k
            annual_avg_kg = total_kg / 4  # 4-year period
            rate_per_100k = (annual_avg_kg / population_total) * 100_000
            
            seizure_rates.append({
                'Country': country,
                'Population_Millions': population_millions,
                'Total_Seized_kg': total_kg,
                'Annual_Avg_kg': annual_avg_kg,
                'Rate_per_100k': rate_per_100k
            })
            
            print(f"{country:35} | Pop: {population_millions:5.1f}M | Total: {total_kg:10,.1f} kg | Rate: {rate_per_100k:8.2f} kg/100k")

# Convert to DataFrame and sort by seizure rate
rates_df = pd.DataFrame(seizure_rates)
rates_df = rates_df.sort_values('Rate_per_100k', ascending=False)

print(f"\n=== RANKED BY SEIZURE RATE (kg per 100k population) ===")
for i, row in rates_df.iterrows():
    print(f"{row['Country']:35} | {row['Rate_per_100k']:8.2f} kg/100k")

=== COCAINE SEIZURE RATES (2020-2024 Average) ===
Formula: (Total kg seized ÷ 4 years ÷ Population) × 100,000

Peru                                | Pop:  33.7M | Total:   88,640.3 kg | Rate:    65.72 kg/100k
Colombia                            | Pop:  51.5M | Total: 4,463,600.5 kg | Rate:  2165.96 kg/100k
Bolivia (Plurinational State of)    | Pop:  12.1M | Total:  126,377.1 kg | Rate:   261.54 kg/100k
Spain                               | Pop:  47.8M | Total:  247,935.7 kg | Rate:   129.73 kg/100k
Belgium                             | Pop:  11.6M | Total:  327,293.6 kg | Rate:   705.98 kg/100k
Sweden                              | Pop:  10.5M | Total:    2,944.4 kg | Rate:     7.02 kg/100k
Portugal                            | Pop:  10.3M | Total:   54,270.3 kg | Rate:   131.34 kg/100k
Greece                              | Pop:  10.6M | Total:   16,133.2 kg | Rate:    37.91 kg/100k

=== RANKED BY SEIZURE RATE (kg per 100k population) ===
Colombia                            |  2165.96 

In [6]:
# Let's investigate the missing countries
print("=== CHECKING ALL 10 FOCUS COUNTRIES ===")

focus_countries_in_data = [
    'Peru', 'Colombia', 'Bolivia (Plurinational State of)', 'Spain', 
    'Netherlands', 'France', 'Belgium', 'Sweden', 'Portugal', 'Greece'
]

# Check each country individually in the filtered dataset
for country in focus_countries_in_data:
    country_data = seizure_filtered[seizure_filtered['Country/Territory of Seizure'] == country]
    
    if len(country_data) > 0:
        total_kg = country_data['Quantity Seized'].sum()
        years_present = sorted(country_data['Year'].unique())
        print(f"✓ {country:35} | {len(country_data):6} seizures | {total_kg:10,.1f} kg | Years: {years_present}")
    else:
        print(f"✗ {country:35} | NO DATA in 2020-2024 period")

# Let's also check if Netherlands and France have data in other periods
print(f"\n=== CHECKING NETHERLANDS & FRANCE IN FULL DATASET ===")
for country in ['Netherlands', 'France']:
    all_data = df[df['Country/Territory of Seizure'] == country]
    if len(all_data) > 0:
        all_data['Year'] = pd.to_datetime(all_data['Seizure Date']).dt.year
        year_counts = all_data['Year'].value_counts().sort_index()
        print(f"{country}: {len(all_data)} total seizures")
        print(f"  Years with data: {dict(year_counts)}")
    else:
        print(f"{country}: NO DATA in entire dataset")

=== CHECKING ALL 10 FOCUS COUNTRIES ===
✓ Peru                                |    922 seizures |   88,640.3 kg | Years: [np.int32(2022), np.int32(2023), np.int32(2024)]
✓ Colombia                            | 115584 seizures | 4,463,600.5 kg | Years: [np.int32(2020), np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024)]
✓ Bolivia (Plurinational State of)    |   6915 seizures |  126,377.1 kg | Years: [np.int32(2020), np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024)]
✓ Spain                               |   5662 seizures |  247,935.7 kg | Years: [np.int32(2020), np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024)]
✗ Netherlands                         | NO DATA in 2020-2024 period
✗ France                              | NO DATA in 2020-2024 period
✓ Belgium                             |    660 seizures |  327,293.6 kg | Years: [np.int32(2020), np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024)]
✓ Sweden                              |   698

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['Year'] = pd.to_datetime(all_data['Seizure Date']).dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['Year'] = pd.to_datetime(all_data['Seizure Date']).dt.year


In [7]:
# Corrected focus countries list
focus_countries_in_data = [
    'Greece', 'Portugal', 'Spain', 'Sweden', 'Peru', 'Colombia', 
    'Bolivia (Plurinational State of)', 'Belgium', 'Brazil'  # Added Brazil!
]

# Check Brazil's seizure data for 2020-2024
brazil_data = seizure_filtered[seizure_filtered['Country/Territory of Seizure'] == 'Brazil']
if len(brazil_data) > 0:
    brazil_total = brazil_data['Quantity Seized'].sum()
    brazil_years = sorted(brazil_data['Year'].unique())
    print(f"Brazil: {len(brazil_data)} seizures | {brazil_total:,.1f} kg | Years: {brazil_years}")
    
    # Calculate Brazil's seizure rate
    brazil_pop = 215.31  # Brazil population in millions (2022)
    brazil_annual_avg = brazil_total / 4
    brazil_rate = (brazil_annual_avg / (brazil_pop * 1_000_000)) * 100_000
    print(f"Brazil seizure rate: {brazil_rate:.2f} kg/100k")
else:
    print("Brazil: NO DATA in 2020-2024")

Brazil: NO DATA in 2020-2024


In [10]:
# Step 1: Update your original analysis to 2019-2023 period
from datetime import datetime
import pandas as pd

# All 12 focus countries with correct names
all_focus_countries = [
    'Greece', 'Portugal', 'Spain', 'Sweden', 'Peru', 'Colombia', 
    'Bolivia (Plurinational State of)', 'Belgium', 'Brazil'  # Original 9 from your filtered data
    # France, Netherlands, USA will come from UNODC data
]

# Update period to 2019-2023 (5 years for consistency)
start_date = datetime(2019, 1, 1)
end_date = datetime(2023, 12, 31)

# Re-filter your original data
seizure_filtered_updated = df[
    (df['Country/Territory of Seizure'].isin(all_focus_countries)) &
    (df['Seizure Date'] >= start_date) & 
    (df['Seizure Date'] <= end_date)
].copy()

print(f"Updated filtered data (2019-2023): {len(seizure_filtered_updated):,} seizures")

# Recalculate rates for original countries with 5-year period
# [Your existing calculation code but with /5 instead of /4]

Updated filtered data (2019-2023): 157,447 seizures


In [11]:
# Population data (2022 estimates in millions) - World Bank data
population_data = {
    'Colombia': 51.52,
    'Spain': 47.78,
    'Bolivia (Plurinational State of)': 12.08,
    'Belgium': 11.59,
    'Peru': 33.72,
    'Portugal': 10.33,
    'Greece': 10.64,
    'Sweden': 10.49,
    'Netherlands': 17.53,  # In case we had data
    'France': 68.04       # In case we had data
}

# Calculate 4-year average seizure rates (kg per 100k population)
seizure_rates = []

print("=== COCAINE SEIZURE RATES (2020-2024 Average) ===")
print("Formula: (Total kg seized ÷ 4 years ÷ Population) × 100,000")
print()

for country in focus_countries_in_data:
    if country in population_data:
        # Get total seizures for this country
        country_data = seizure_filtered[seizure_filtered['Country/Territory of Seizure'] == country]
        
        if len(country_data) > 0:
            total_kg = country_data['Quantity Seized'].sum()
            population_millions = population_data[country]
            population_total = population_millions * 1_000_000  # Convert to actual population
            
            # Calculate 4-year average annual rate per 100k
            annual_avg_kg = total_kg / 5  # 4-year period
            rate_per_100k = (annual_avg_kg / population_total) * 100_000
            
            seizure_rates.append({
                'Country': country,
                'Population_Millions': population_millions,
                'Total_Seized_kg': total_kg,
                'Annual_Avg_kg': annual_avg_kg,
                'Rate_per_100k': rate_per_100k
            })
            
            print(f"{country:35} | Pop: {population_millions:5.1f}M | Total: {total_kg:10,.1f} kg | Rate: {rate_per_100k:8.2f} kg/100k")

# Convert to DataFrame and sort by seizure rate
rates_df = pd.DataFrame(seizure_rates)
rates_df = rates_df.sort_values('Rate_per_100k', ascending=False)

print(f"\n=== RANKED BY SEIZURE RATE (kg per 100k population) ===")
for i, row in rates_df.iterrows():
    print(f"{row['Country']:35} | {row['Rate_per_100k']:8.2f} kg/100k")

=== COCAINE SEIZURE RATES (2020-2024 Average) ===
Formula: (Total kg seized ÷ 4 years ÷ Population) × 100,000

Greece                              | Pop:  10.6M | Total:   16,133.2 kg | Rate:    30.33 kg/100k
Portugal                            | Pop:  10.3M | Total:   54,270.3 kg | Rate:   105.07 kg/100k
Spain                               | Pop:  47.8M | Total:  247,935.7 kg | Rate:   103.78 kg/100k
Sweden                              | Pop:  10.5M | Total:    2,944.4 kg | Rate:     5.61 kg/100k
Peru                                | Pop:  33.7M | Total:   88,640.3 kg | Rate:    52.57 kg/100k
Colombia                            | Pop:  51.5M | Total: 4,463,600.5 kg | Rate:  1732.76 kg/100k
Bolivia (Plurinational State of)    | Pop:  12.1M | Total:  126,377.1 kg | Rate:   209.23 kg/100k
Belgium                             | Pop:  11.6M | Total:  327,293.6 kg | Rate:   564.79 kg/100k

=== RANKED BY SEIZURE RATE (kg per 100k population) ===
Colombia                            |  1732.76 