In [22]:
import spacy
from spacy import displacy
import pandas as pd
import os
import pickle
import requests

### Load SpaCy model

In [115]:
# Load the spaCy pipeline stored under models/ (path is relative to the working directory).
# Later cells rely on this pipeline emitting "GPE" and "DRUG" entity labels.
nlp = spacy.load('models/configured_spacy_model')

In [116]:
# Short Dutch sample sentence for a visual check of the entity recognizer.
# Note: the next cell assigns `txt` again, so on a top-to-bottom run this value is replaced.
txt = "De verdachte smokkelde 123 kg mdma van Nederland naar Bergen op Zoom."

In [84]:
# Longer Dutch sample (an indictment-style passage) used as input for the displaCy rendering below.
txt = "hij in of omstreeks de periode van 11 september 2018 tot en met 26 februari 2019 te Amsterdam, in elk geval in Nederland en/of Peru, tezamen en in vereniging met een ander of anderen, althans alleen, om een feit, bedoeld in het vierde of vijfde lid van artikel 10 van de Opiumwet, te weten het opzettelijk telen, bereiden, bewerken, verwerken, verkopen, afleveren, verstrekken, vervoeren, vervaardigen en/of binnen of buiten het grondgebied van Nederland brengen van heroïne en/of cocaïne, in elk geval een hoeveelheid van een materiaal bevattende heroïne en/of cocaïne, in elk geval een middel vermeld op de bij de Opiumwet behorende lijst I voor te bereiden en/of te bevorderen"

In [117]:
# Run the pipeline on the sample text and render the recognized entities inline.
displacy.render(nlp(txt), style='ent')

### Load dataset

In [40]:
# Load the pickled case DataFrame; later cells read its 'id', 'jurisdiction_type' and 'case text' columns.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
merged_df = pd.read_pickle("merged_df.pkl")
merged_df

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,inhoudsindicatie,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],"Productie synthetische drugs, medeplegen, prod...",\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],Leveren grondstoffen synthetische drugs en sto...,\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],plegen van voorbereidingshandelingen ten behoe...,\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
...,...,...,...,...,...,...,...
18457,ECLI-NL-RBAMS-2013-1294,2013-01-29,2013-08-10,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat er sprake is van...,\n\nRECHTBANK AMSTERDAM\n\n\nVONNIS\n\n \n\n13...
18458,ECLI-NL-PHR-2020-1106,2020-11-24,2020-11-24,conclusie,['Strafrecht'],Conclusie AG. Vervolging van een politieagent ...,\n\nPROCUREUR-GENERAAL\n\n\nBIJ DE\n\n\nHOGE R...
18459,ECLI-NL-GHAMS-2017-2618,2017-06-29,2017-05-07,uitspraak,['Strafrecht'],Liquidatieproces Passage\n\n ...,\n\n\nparketnummer: 23-001217-13\ndatum uitspr...
18460,ECLI-NL-RBAMS-2013-BZ0392,2013-01-29,2013-01-02,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat verdachte zich s...,\nRECHTBANK AMSTERDAM \nVONNIS \n\n13/529144-...


### Load country configs & country classifier

In [86]:
# Filled by country_pipeline: maps a location string to the country geopy returned
# in cases where the Google geocoder returned no country for the same string.
geopy_mistakes = {}

In [87]:
# Load the cached list of location strings that country_pipeline must not resolve to a country.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle can execute arbitrary code; only load files produced by this project.
with open('saves/countries_to_exclude.pkl', "rb") as open_file:
    countries_to_exclude = pickle.load(open_file)

In [88]:
# Load the cached mapping from country name to the location strings known to belong to it.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle can execute arbitrary code; only load files produced by this project.
with open('saves/country_translation_dict.pkl', "rb") as open_file:
    country_translation_dict = pickle.load(open_file)

In [89]:
# Google Geocoding API configuration.
# The key is read from the environment so the credential is never stored in the notebook.
# Set GOOGLE_GEOCODING_API_KEY before starting the kernel.
api_key = os.environ.get('GOOGLE_GEOCODING_API_KEY')
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json?address='

def get_geocode_country(txt):
    """Look up the country of a free-text location with the Google Geocoding API.

    Parameters
    ----------
    txt : str
        Location text to geocode.

    Returns
    -------
    str
        The ``long_name`` of the 'country' address component of the first
        result, or the string "None" when there is no result or no country
        component.

    Raises
    ------
    RuntimeError
        If no API key is configured.

    Network errors and malformed responses from ``requests`` propagate to the
    caller, so a transient failure is not mistaken for "no country".
    """
    from urllib.parse import quote_plus

    if not api_key:
        raise RuntimeError(
            "No Google Geocoding API key configured; set the "
            "GOOGLE_GEOCODING_API_KEY environment variable."
        )

    # Encode the location so characters such as '&' or '#' cannot corrupt the query string.
    # The timeout keeps a stalled connection from blocking the whole run.
    response = requests.get(f"{geocode_url}{quote_plus(txt)}&key={api_key}", timeout=10)
    res = response.json()['results']

    country_name = "None"
    try:
        for address_component in res[0]['address_components']:
            if 'country' in address_component['types']:
                country_name = address_component['long_name']
    except (IndexError, KeyError, TypeError):
        # Empty result list or a result without the expected fields.
        return "None"
    return country_name

In [90]:
from geopy.geocoders import Nominatim
# Nominatim geocoder client used by get_geopy_country.
# NOTE(review): Nominatim's usage policy asks for an application-specific user agent;
# consider replacing this generic value — TODO confirm.
geolocator = Nominatim(user_agent = "geoapiExercises")

def get_geopy_country(txt):
    """Look up the country of a free-text location with the Nominatim geocoder.

    Parameters
    ----------
    txt : str
        Location text to geocode.

    Returns
    -------
    str
        The last comma-separated part of the result's ``display_name``
        (requested in English), or the string "None" when nothing is found
        or the lookup fails.
    """
    try:
        location = geolocator.geocode(txt, language='en')
        if location is None:
            # Nominatim found no match for this text.
            return "None"
        return location.raw['display_name'].split(',')[-1].strip()
    except Exception:
        # Best-effort lookup: any geocoder or parsing error counts as "no country".
        # Unlike a bare except, KeyboardInterrupt still stops a long run.
        return "None"

In [91]:
def country_pipeline(txt):
    """Resolve a location string to a country name, caching the outcome.

    The text is lower-cased and checked against the two module-level caches
    first. Only unknown strings are sent to the geocoders: geopy acts as a
    first gate, and the Google geocoder supplies the country that is stored
    and returned.

    Parameters
    ----------
    txt : str
        Location text, e.g. the text of a recognized GPE entity.

    Returns
    -------
    str
        A key of ``country_translation_dict``, or the string "None" when the
        text is excluded or cannot be resolved.

    Side effects
    ------------
    May append to ``countries_to_exclude``, add an entry to
    ``geopy_mistakes`` and extend ``country_translation_dict``.
    """
    txt = txt.lower()

    # Known non-countries are rejected without any lookup.
    if txt in countries_to_exclude:
        return "None"

    # Known locations map straight to their cached country.
    for known_country, known_locations in country_translation_dict.items():
        if txt in known_locations:
            return known_country

    # First gate: geopy must recognize the text at all.
    nominatim_country = get_geopy_country(txt)
    if nominatim_country == "None":
        countries_to_exclude.append(txt)
        return "None"

    # Second gate: the Google geocoder must also find a country.
    google_country = get_geocode_country(txt)
    if google_country == "None":
        countries_to_exclude.append(txt)
        geopy_mistakes[txt] = nominatim_country
        return "None"

    # Cache the new location under the country reported by Google.
    country_translation_dict.setdefault(google_country, []).append(txt)
    return google_country

In [92]:
# Quick manual check of the lookup pipeline on a single location name.
country_pipeline('Nederlandse Antillen')

'Netherlands'

### Filter cases

In [93]:
import re
def split_text_in_chunks(doc):
    """Split a case text into sentence-like chunks.

    The text is cut at every newline, and at runs of spaces that follow a
    '.' or '?' and precede an uppercase letter. The lookbehind requires the
    character two positions before the punctuation not to be uppercase.
    Pieces of at most one character are dropped.

    Parameters
    ----------
    doc : str
        Full text of one case.

    Returns
    -------
    list of str
        The remaining chunks, in document order.
    """
    pieces = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n', doc)
    return list(filter(lambda piece: len(piece) > 1, pieces))

def split_cases_in_chunks(df):
    """Split the text of every case into chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases with an 'id' column (dash-separated ECLI) and a 'case text' column.

    Returns
    -------
    pandas.DataFrame
        One row per case with columns 'ecli' (the id with '-' replaced by ':')
        and 'chunks' (list of str from ``split_text_in_chunks``), on a fresh
        0..n-1 index.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    records = [
        {
            'ecli': row['id'].replace('-', ':'),
            'chunks': split_text_in_chunks(row['case text']),
        }
        for _, row in df.iterrows()
    ]
    return pd.DataFrame(records, columns=['ecli', 'chunks'])

Rules:
1. Case needs to concern criminal law
2. Case needs to contain a smuggle word
3. Case needs to contain a country that is not "Netherlands"
4. Case needs to contain a chunk that contains a country and a drug

In [35]:
def enforce_rule_1(df):
    """Rule 1: keep only cases that concern criminal law.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases with a 'jurisdiction_type' column. Each value must support the
        ``in`` operator (a string or a list of strings).

    Returns
    -------
    pandas.DataFrame
        The rows whose 'jurisdiction_type' contains 'Strafrecht', with the
        original index labels and columns preserved.
    """
    # A boolean mask replaces row-by-row DataFrame.append, which was removed
    # in pandas 2.0 and copied the frame on every iteration.
    is_criminal = pd.Series(
        ['Strafrecht' in jurisdiction for jurisdiction in df['jurisdiction_type']],
        index=df.index,
        dtype=bool,
    )
    return df.loc[is_criminal].copy()

In [36]:
# Load the smuggling-related keywords used by rule 2.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle can execute arbitrary code; only load files produced by this project.
with open('saves/smuggle_words.pkl', "rb") as open_file:
    smuggle_words = pickle.load(open_file)

def enforce_rule_2(df):
    """Rule 2: keep only cases whose text contains a smuggle word.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases with a 'case text' column of strings.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'case text' contains at least one entry of the
        module-level ``smuggle_words`` as a case-sensitive substring, with the
        original index labels and columns preserved.
    """
    # A boolean mask replaces row-by-row DataFrame.append, which was removed
    # in pandas 2.0 and copied the frame on every iteration.
    mentions_smuggling = pd.Series(
        [any(word in text for word in smuggle_words) for text in df['case text']],
        index=df.index,
        dtype=bool,
    )
    return df.loc[mentions_smuggling].copy()

In [37]:
def enforce_rule_3(df):
    """Rule 3: keep only cases that mention a country other than the Netherlands.

    Every chunk is run through the spaCy pipeline and each "GPE" entity is
    resolved with ``country_pipeline``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases with 'ecli' and 'chunks' (list of str) columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'ecli', 'chunks' and 'countries_present'. The last column holds
        the distinct resolved country names found in the case, in order of
        first appearance. Only cases with at least one non-Dutch country are
        included.
    """
    records = []
    for _, row in df.iterrows():
        chunks = row['chunks']
        countries_present = []
        # Tracked per case. Resetting it per chunk let only the last chunk
        # decide, and left it undefined for a case without chunks.
        has_foreign_country = False
        for chunk in chunks:
            for ent in nlp(chunk).ents:
                if ent.label_ != "GPE":
                    continue
                country = country_pipeline(ent.text)
                if country == "None":
                    continue
                # Store the resolved name so the membership test really de-duplicates
                # (comparing it against stored entity texts never matched).
                if country not in countries_present:
                    countries_present.append(country)
                if country != "Netherlands":
                    has_foreign_country = True
        if has_foreign_country:
            records.append({'ecli': row['ecli'], 'chunks': chunks, 'countries_present': countries_present})
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=['ecli', 'chunks', 'countries_present'])


In [38]:
# Load the drug terms used by rule 4.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle can execute arbitrary code; only load files produced by this project.
with open('saves/drug_list.pkl', "rb") as open_file:
    drug_list = pickle.load(open_file)

def enforce_rule_4(df):
    """Rule 4: keep only cases with a chunk that mentions both a drug and a country.

    A chunk is relevant when one of its lower-cased, period-stripped words is
    in the module-level ``drug_list`` and it contains a "GPE" entity that
    ``country_pipeline`` can resolve.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases with 'ecli', 'chunks' (list of str) and 'countries_present' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'ecli', 'chunks' and 'countries_present'. 'chunks' holds only
        the relevant chunks (each at most once), and cases without any relevant
        chunk are dropped.
    """
    records = []
    for _, row in df.iterrows():
        relevant_chunks = []
        for chunk in row['chunks']:
            word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x) > 0]
            # Cheap word check first; the spaCy pass only runs on chunks that name a drug.
            if not any(drug in word_list for drug in drug_list):
                continue
            for ent in nlp(chunk).ents:
                # spaCy spans expose the entity label as `label_`.
                if ent.label_ == "GPE" and country_pipeline(ent.text) != "None":
                    relevant_chunks.append(chunk)
                    # One resolvable place is enough; stop so the chunk is added once.
                    break
        if relevant_chunks:
            records.append({'ecli': row['ecli'], 'chunks': relevant_chunks, 'countries_present': row['countries_present']})
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=['ecli', 'chunks', 'countries_present'])


In [39]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas and other libraries;
# consider narrowing the filter to a specific category or module.
warnings.filterwarnings('ignore')

#### Create a temporary DataFrame from a custom list of ECLIs

In [None]:
# Hand-picked case (colon-separated ECLI) for a targeted check.
# Note: the next cell assigns `ecli_list` again, so on a top-to-bottom run this value is replaced.
ecli_list = ['ECLI:NL:RBNHO:2020:2746']

In [124]:
# Draw a reproducible random sample of 10 case ids and convert them from the
# dash-separated form used in merged_df to colon-separated ECLIs.
# The comprehension avoids leaving a global named `id` that shadows the builtin.
ecli_list = [case_id.replace('-', ':') for case_id in merged_df['id'].sample(n=10, random_state=12)]

ecli_list

['ECLI:NL:RBSGR:2004:AP0058',
 'ECLI:NL:RBARN:2012:BV8013',
 'ECLI:NL:GHSHE:2021:1482',
 'ECLI:NL:RBNNE:2019:1542',
 'ECLI:NL:RBBRE:2004:AR4371',
 'ECLI:NL:RBZWB:2020:2342',
 'ECLI:NL:HR:2011:BR2990',
 'ECLI:NL:OGEAA:2017:225',
 'ECLI:NL:PHR:2009:BJ2785',
 'ECLI:NL:HR:2010:BK4154']

In [125]:

# Convert the colon-separated ECLIs back to the dash-separated ids stored in
# merged_df, then select the matching cases.
corr_ecli_list = list(map(lambda ecli: ecli.replace(':', '-'), ecli_list))
temp_df = merged_df.loc[merged_df['id'].isin(corr_ecli_list)]

### Execute

In [126]:
# Choose the input: the second assignment wins, so the pipeline runs on the
# small sample (temp_df). Remove or comment out that line to run on all of merged_df.
curr_df = merged_df
curr_df = temp_df

# Apply the filter rules in order and report how many cases survive each step.
print(f"{len(curr_df)} cases in original df.")
trafficking_df = enforce_rule_1(curr_df)
print(f"{len(trafficking_df)} cases after rule 1.")
trafficking_df = enforce_rule_2(trafficking_df)
print(f"{len(trafficking_df)} cases after rule 2.")
# From here on trafficking_df has the columns 'ecli' and 'chunks'.
trafficking_df = split_cases_in_chunks(trafficking_df)
# Rules 3 and 4 are disabled here; temporary_rule_3_and_4_solution is applied in a later cell instead.
# trafficking_df = enforce_rule_3(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 3.")
# trafficking_df = enforce_rule_4(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 4.")
trafficking_df

10 cases in original df.
8 cases after rule 1.
8 cases after rule 2.


Unnamed: 0,ecli,chunks
0,ECLI:NL:RBNNE:2019:1542,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech..."
1,ECLI:NL:OGEAA:2017:225,"[GERECHT IN EERSTE AANLEG VAN ARUBA, S T R A F..."
2,ECLI:NL:HR:2010:BK4154,"[2 maart 2010, Strafkamer, nr. 07/11850, Hoge ..."
3,ECLI:NL:HR:2011:BR2990,"[11 oktober 2011, Strafkamer, nr. 10/03004, Ho..."
4,ECLI:NL:RBBRE:2004:AR4371,"[RECHTBANK BREDA , Parketnummer(s): 02/004325-..."
5,ECLI:NL:PHR:2009:BJ2785,"[Nr. 07/13255 B, Mr. Machielse, Zitting 23 jun..."
6,ECLI:NL:RBARN:2012:BV8013,"[RECHTBANK ARNHEM , Sector strafrecht , Meervo..."
7,ECLI:NL:RBZWB:2020:2342,"[RECHTBANK ZEELAND-WEST-BRABANT, Strafrecht, Z..."


In [79]:
# Reload the cached lookup tables from disk; context managers close every file,
# including the text file that was previously left open.
# NOTE: pickle can execute arbitrary code; only load files produced by this project.
with open('saves/drug_list.pkl', "rb") as open_file:
    drug_list = pickle.load(open_file)

with open('saves/countries_to_exclude.pkl', "rb") as open_file:
    countries_to_exclude = pickle.load(open_file)

# Merge in the exclusions from the plain-text list (one entry per line).
with open("saves/countries_to_exclude.txt", "r") as my_file:
    content = my_file.read()
countries_to_exclude_from_txt = content.split('\n')

# A set mirrors the list so each membership test is O(1); the list keeps its order.
_excluded_lookup = set(countries_to_exclude)
for loc in countries_to_exclude_from_txt:
    if loc not in _excluded_lookup:
        countries_to_exclude.append(loc)
        _excluded_lookup.add(loc)

with open('saves/country_translation_dict.pkl', "rb") as open_file:
    country_translation_dict = pickle.load(open_file)

# Every known location string that is not excluded, in dictionary order.
valid_countries_list = [
    loc
    for key in country_translation_dict
    for loc in country_translation_dict[key]
    if loc not in _excluded_lookup
]
print(f"{len(valid_countries_list)} locations in {len(country_translation_dict)} countries")


def temporary_rule_3_and_4_solution(df):
    """Dictionary-based stand-in for rules 3 and 4 (no spaCy, no geocoding).

    Words of each chunk are matched against the cached location strings in
    ``country_translation_dict``. Note that a later cell defines a function
    with the same name, which replaces this one when the notebook is run top
    to bottom.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases with 'ecli' and 'chunks' (list of str) columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'ecli', 'chunks', 'countries_present' and 'relevant_chunks'.
        'countries_present' maps each non-Dutch country to the distinct
        matching words. 'relevant_chunks' lists, once each, the chunks that
        contain a known location (the Netherlands included) and a word from
        ``drug_list``. Only cases with at least one non-Dutch country and at
        least one relevant chunk are returned.
    """
    # Sets make the per-word membership tests O(1).
    excluded = set(countries_to_exclude)
    valid_locations = set(valid_countries_list)

    records = []
    for index, row in df.iterrows():
        print(index)
        countries_present = {}
        relevant_chunks = []
        for chunk in row['chunks']:
            word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x) > 0]
            chunk_has_drug = any(drug in word_list for drug in drug_list)
            chunk_has_country = False
            for word in word_list:
                if word in excluded or word not in valid_locations:
                    continue
                for key, locations in country_translation_dict.items():
                    if word not in locations:
                        continue
                    chunk_has_country = True
                    if key != "Netherlands":
                        # Collect every distinct surface form; previously only the
                        # first word per country could ever be stored.
                        matched_words = countries_present.setdefault(key, [])
                        if word not in matched_words:
                            matched_words.append(word)
            # Add the chunk once, however many location words it contains.
            if chunk_has_country and chunk_has_drug:
                relevant_chunks.append(chunk)

        if countries_present and relevant_chunks:
            records.append({'ecli': row['ecli'], 'chunks': row['chunks'], 'countries_present': countries_present, 'relevant_chunks': relevant_chunks})
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=['ecli', 'chunks', 'countries_present', 'relevant_chunks'])

6402 locations in 181 countries


In [118]:
def temporary_rule_3_and_4_solution(df):
    """Combined rules 3 and 4 based on the spaCy entities of each chunk.

    This definition replaces the earlier function of the same name when the
    notebook is run top to bottom.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases with 'ecli' and 'chunks' (list of str) columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'ecli', 'chunks', 'countries_present' and 'relevant_chunks'.
        'countries_present' maps each non-Dutch country (as resolved by
        ``country_pipeline``) to the distinct entity texts that produced it.
        'relevant_chunks' lists the chunks that contain both a "DRUG" entity
        and a "GPE" entity resolving to a non-Dutch country. Only cases with at
        least one relevant chunk are returned.
    """
    records = []
    for index, row in df.iterrows():
        print(index)
        countries_present = {}
        relevant_chunks = []

        for chunk in row['chunks']:
            # Both flags are per chunk: the drug and the foreign country must co-occur.
            mentions_drug = False
            mentions_foreign_country = False
            for ent in nlp(chunk).ents:
                if ent.label_ == "GPE":
                    country = country_pipeline(ent.text)
                    if country != "None" and country != "Netherlands":
                        mentions_foreign_country = True
                        surface_forms = countries_present.setdefault(country, [])
                        if ent.text not in surface_forms:
                            surface_forms.append(ent.text)
                elif ent.label_ == "DRUG":
                    mentions_drug = True

            if mentions_drug and mentions_foreign_country:
                relevant_chunks.append(chunk)

        if relevant_chunks:
            records.append({'ecli': row['ecli'], 'chunks': row['chunks'], 'countries_present': countries_present, 'relevant_chunks': relevant_chunks})
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=['ecli', 'chunks', 'countries_present', 'relevant_chunks'])

In [127]:
%%time
# Apply the combined rule 3/4 filter to the chunked cases; the cell magic reports the run time.
filtered_df = temporary_rule_3_and_4_solution(trafficking_df)

0
1
2
3
4
5
6
7
CPU times: total: 35 s
Wall time: 35 s


In [129]:
# Show the cases that passed the combined rule 3/4 filter.
filtered_df

Unnamed: 0,ecli,chunks,countries_present,relevant_chunks
0,ECLI:NL:RBNNE:2019:1542,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech...","{'South Africa': ['Stratum'], 'Germany': ['lag...",[AAJC7176NL: bevat amfetamine gerelateerde syn...
1,ECLI:NL:OGEAA:2017:225,"[GERECHT IN EERSTE AANLEG VAN ARUBA, S T R A F...","{'Aruba': ['ARUBA', 'Aruba'], 'United States':...",[Dat hij op 1 november 2016 in Aruba tezamen e...
2,ECLI:NL:RBBRE:2004:AR4371,"[RECHTBANK BREDA , Parketnummer(s): 02/004325-...","{'Belgium': ['Zuid'], 'United Kingdom': ['Groo...",[Onder feit 1 primair is ten laste is gelegd h...


In [96]:
# Inspect the country -> matched text mapping of the first filtered case.
filtered_df['countries_present'].iloc[0]

{'Norway': ['bergen'],
 'United States': ['kent'],
 'Belgium': ['voeren'],
 'Syria': ['syrië'],
 'Austria': ['steinhaus']}

In [None]:
# Persist the filtered cases.
# NOTE(review): a later cell pickles trafficking_df to this same path, so a
# top-to-bottom run overwrites this file.
filtered_df.to_pickle('saves/trafficking_df.pkl')

### Results

In [46]:
# Persist trafficking_df.
# NOTE(review): this writes to the same path as the earlier filtered_df.to_pickle call,
# so a top-to-bottom run replaces the filtered result with trafficking_df — confirm
# which frame is meant to be saved here.
trafficking_df.to_pickle('saves/trafficking_df.pkl')

In [None]:
# Show trafficking_df as the final output.
trafficking_df