In [1]:
import spacy
from spacy import displacy
import pandas as pd
import os
import pickle
import requests

### Load SpaCy model

In [2]:
# Load the spaCy pipeline stored under models/; later cells call `nlp(...)` and
# read `.ents` from the result.
nlp = spacy.load('models/configured_spacy_model')

In [3]:
# Dutch sample sentence used to eyeball the entity recogniser in the next cell.
txt = "De verdachte smokkelde 123 kg mdma van Nederland naar Ibiza."

In [4]:
# Visual sanity check: render the entities the model finds in the sample sentence.
displacy.render(nlp(txt), style='ent')

### Load dataset

In [5]:
# Load the case dataset. The functions below read its 'id', 'jurisdiction_type'
# and 'case text' columns.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
merged_df = pd.read_pickle("merged_df.pkl")
merged_df

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,inhoudsindicatie,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],"Productie synthetische drugs, medeplegen, prod...",\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],Leveren grondstoffen synthetische drugs en sto...,\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],plegen van voorbereidingshandelingen ten behoe...,\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
...,...,...,...,...,...,...,...
18487,ECLI-NL-RBAMS-2013-1294,2013-01-29,2013-08-10,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat er sprake is van...,\n\nRECHTBANK AMSTERDAM\n\n\nVONNIS\n\n \n\n13...
18488,ECLI-NL-PHR-2020-1106,2020-11-24,2020-11-24,conclusie,['Strafrecht'],Conclusie AG. Vervolging van een politieagent ...,\n\nPROCUREUR-GENERAAL\n\n\nBIJ DE\n\n\nHOGE R...
18489,ECLI-NL-GHAMS-2017-2618,2017-06-29,2017-05-07,uitspraak,['Strafrecht'],Liquidatieproces Passage\n\n ...,\n\n\nparketnummer: 23-001217-13\ndatum uitspr...
18490,ECLI-NL-RBAMS-2013-BZ0392,2013-01-29,2013-01-02,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat verdachte zich s...,\nRECHTBANK AMSTERDAM \nVONNIS \n\n13/529144-...


### Load country configs & country classifier

In [6]:
# Filled by country_pipeline: maps a query text to the country geopy returned
# for it in the cases where the Google geocoder returned no country.
geopy_mistakes = {}

In [7]:
# Strings already known not to resolve to a country (country_pipeline appends to it).
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('saves/countries_to_exclude.pkl', "rb") as open_file:
    countries_to_exclude = pickle.load(open_file)

In [8]:
# Mapping used by country_pipeline: canonical country name -> list of lower-cased
# texts that resolve to that country.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('saves/country_translation_dict.pkl', "rb") as open_file:
    country_translation_dict = pickle.load(open_file)

In [9]:
# Google Geocoding API configuration.
# The key is read from the environment so that it is never committed with the
# notebook. A key that was previously hard-coded here must be treated as leaked
# and rotated.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json?address='

def get_geocode_country(txt):
    """Return the country the Google Geocoding API reports for `txt`.

    Args:
        txt: Free-text location to look up.

    Returns:
        The `long_name` of the address component typed 'country' in the first
        result, or the string "None" when the response has no usable result.

    Raises:
        RuntimeError: if no API key is configured. Failing loudly matters here
            because country_pipeline permanently caches "None" answers.
        requests.exceptions.RequestException: on network errors or timeout.
    """
    from urllib.parse import quote_plus

    if not api_key:
        raise RuntimeError(
            "GOOGLE_MAPS_API_KEY is not set; cannot query the Google Geocoding API."
        )

    # Percent-encode the query so characters such as '&' or '#' in `txt` cannot
    # corrupt the query string; the timeout keeps a stalled request from
    # hanging the notebook.
    response = requests.get(
        f"{geocode_url}{quote_plus(txt)}&key={quote_plus(api_key)}",
        timeout=10,
    )
    res = response.json().get('results', [])

    country_name = "None"
    try:
        for address_component in res[0]['address_components']:
            if 'country' in address_component['types']:
                country_name = address_component['long_name']
    except (IndexError, KeyError, TypeError):
        # Empty result list or an unexpected payload shape: no country found.
        return "None"
    return country_name

In [10]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "geoapiExercises")

def get_geopy_country(txt):
    """Return the country geopy's Nominatim geocoder reports for `txt`.

    The country is taken as the last comma-separated part of the result's
    'display_name'.

    Args:
        txt: Free-text location to look up.

    Returns:
        The country name, or the string "None" when nothing was found or the
        lookup failed.
    """
    try:
        location = geolocator.geocode(txt, language='en')
        if location is None:
            # geocode() returns None when there is no match.
            return "None"
        return location.raw['display_name'].split(',')[-1].strip()
    except Exception:
        # Best-effort lookup: geocoder and parsing errors mean "no country".
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so a long-running loop can still be interrupted.
        return "None"

In [11]:
def country_pipeline(txt):
    """Resolve `txt` to a canonical country name, caching the outcome.

    The text is lower-cased, then checked against the two module-level caches
    (`countries_to_exclude` and `country_translation_dict`). Only on a cache
    miss are the geocoders called: geopy first, and the Google geocoder only
    when geopy found something. The Google answer is the one that is returned
    and cached.

    Args:
        txt: Candidate location text.

    Returns:
        The canonical country name, or the string "None" when `txt` is not
        accepted as a location.

    Side effects:
        Appends to `countries_to_exclude`, adds to `country_translation_dict`
        and records disagreements between the geocoders in `geopy_mistakes`.
    """
    txt = txt.lower()

    # Cache lookups: known rejects first, then known spellings per country.
    if txt in countries_to_exclude:
        return "None"
    for canonical_name, known_texts in country_translation_dict.items():
        if txt in known_texts:
            return canonical_name

    # Cache miss: ask geopy; a miss there rejects the text for good.
    geopy_loc = get_geopy_country(txt)
    if geopy_loc == "None":
        countries_to_exclude.append(txt)
        return "None"

    # geopy found something; the Google geocoder has to confirm it.
    geocode_loc = get_geocode_country(txt)
    if geocode_loc == "None":
        countries_to_exclude.append(txt)
        geopy_mistakes[txt] = geopy_loc
        return "None"

    country_translation_dict.setdefault(geocode_loc, []).append(txt)
    return geocode_loc

In [12]:
# Smoke test with a string that is not a location; the recorded output is 'None'.
# Note that country_pipeline may append the text to `countries_to_exclude`.
country_pipeline('mek')

'None'

### Filter cases

In [13]:
import re
def split_text_in_chunks(doc):
    """Split a document into sentence-like chunks.

    A boundary is either a newline, or a run of spaces that follows '.' or '?'
    and precedes an uppercase A-Z letter. In the second case the character two
    positions before the punctuation mark must not be an uppercase A-Z letter
    (presumably to avoid splitting inside upper-case abbreviations).

    Args:
        doc: The full text to split.

    Returns:
        A list of the pieces that are longer than one character.
    """
    boundary = r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n'
    pieces = re.split(boundary, doc)
    return list(filter(lambda piece: len(piece) > 1, pieces))

def split_cases_in_chunks(df):
    """Turn every case into a row holding its ECLI and its list of text chunks.

    Args:
        df: DataFrame with an 'id' column (ECLI written with '-' separators)
            and a 'case text' column.

    Returns:
        A new DataFrame with columns 'ecli' (the id with '-' replaced by ':')
        and 'chunks' (the result of split_text_in_chunks), one row per input
        row, with a fresh 0..n-1 index.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {
            'ecli': row['id'].replace('-', ':'),
            'chunks': split_text_in_chunks(row['case text']),
        }
        for _, row in df.iterrows()
    ]
    return pd.DataFrame(records, columns=['ecli', 'chunks'])

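Rules (a case is kept only if it satisfies all of them):
1. The case must concern criminal law ("Strafrecht").
2. The case must contain at least one smuggling-related word.
3. The case must mention at least one country other than "Netherlands".
4. The case must contain at least one chunk that mentions both a country and a drug.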

In [38]:
def enforce_rule_1(df):
    """Rule 1: keep only cases whose jurisdiction type contains 'Strafrecht'.

    Args:
        df: DataFrame with a 'jurisdiction_type' column. The test is
            `'Strafrecht' in value`, so it works for both a list of labels and
            its string representation.

    Returns:
        The matching rows, with their original index labels and dtypes.
    """
    if df.empty:
        return df.copy()
    # A boolean mask replaces the row-by-row DataFrame.append, which was
    # removed in pandas 2.0 and copied the frame on every call.
    mask = df['jurisdiction_type'].map(lambda value: 'Strafrecht' in value).astype(bool)
    return df.loc[mask]

In [39]:
# Words searched for by enforce_rule_2.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('saves/smuggle_words.pkl', "rb") as open_file:
    smuggle_words = pickle.load(open_file)

def enforce_rule_2(df):
    """Rule 2: keep only cases whose text contains at least one smuggle word.

    The match is a case-sensitive substring test of every entry of the
    module-level `smuggle_words` against the 'case text' column.

    Args:
        df: DataFrame with a 'case text' column.

    Returns:
        The matching rows, with their original index labels and dtypes.
    """
    if df.empty:
        return df.copy()
    # A boolean mask replaces the row-by-row DataFrame.append, which was
    # removed in pandas 2.0 and copied the frame on every call.
    mask = df['case text'].map(
        lambda text: any(word in text for word in smuggle_words)
    ).astype(bool)
    return df.loc[mask]

In [44]:
# Drug names matched against the words of a chunk by the rule 3/4 functions.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('saves/drug_list.pkl', "rb") as open_file:
    drug_list = pickle.load(open_file)

# Flatten every known location text, across all countries, into one list.
# This is a snapshot: entries country_pipeline adds to country_translation_dict
# afterwards are not reflected until this cell is re-run.
valid_countries_list = []
for locations in country_translation_dict.values():
    valid_countries_list.extend(locations)
print(f"{len(valid_countries_list)} locations in {len(country_translation_dict)} countries")


def temporary_rule_3_and_4_solution(df, max_cases=100):
    """Dictionary-based stand-in for rules 3 and 4 (no spaCy, no geocoding).

    Every chunk is lower-cased, stripped of '.' and split on single spaces.
    A word counts as a country mention when it is not in
    `countries_to_exclude`, is in `valid_countries_list`, and appears in one of
    the lists of `country_translation_dict`.

    A case is kept when it mentions at least one country other than
    "Netherlands" AND has at least one chunk that contains both a country
    mention (any country, "Netherlands" included) and a word from `drug_list`.

    Args:
        df: DataFrame with 'ecli' and 'chunks' (list of strings) columns.
        max_cases: Only the first `max_cases` rows are processed. Defaults to
            100, the cap this function has always applied; pass None to
            process every row.

    Returns:
        DataFrame with columns 'ecli', 'chunks', 'countries_present' and
        'relevant_chunks', one row per kept case, indexed 0..n-1.
    """
    columns = ['ecli', 'chunks', 'countries_present', 'relevant_chunks']
    # Collect records and build the frame once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    records = []
    for _, row in df[:max_cases].iterrows():
        countries_present = []
        relevant_chunks = []
        for chunk in row['chunks']:
            word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x) > 0]
            # The drug test depends only on the chunk, so evaluate it at most
            # once per chunk (lazily, on the first country mention).
            chunk_has_drug = None
            for word in word_list:
                if word in countries_to_exclude or word not in valid_countries_list:
                    continue
                for key in country_translation_dict:
                    if word not in country_translation_dict[key]:
                        continue
                    if key != "Netherlands" and key not in countries_present:
                        countries_present.append(key)
                    if chunk_has_drug is None:
                        chunk_has_drug = any(drug in word_list for drug in drug_list)
                    if chunk_has_drug:
                        # NOTE(review): a chunk is appended once per country
                        # mention, so it can appear several times. Kept as-is
                        # for backward compatibility; confirm whether
                        # duplicates are wanted.
                        relevant_chunks.append(chunk)

        # countries_present only ever holds non-"Netherlands" countries, so it
        # being non-empty is the "foreign country found" condition.
        if countries_present and relevant_chunks:
            records.append({
                'ecli': row['ecli'],
                'chunks': row['chunks'],
                'countries_present': countries_present,
                'relevant_chunks': relevant_chunks,
            })
    return pd.DataFrame(records, columns=columns)
            


In [45]:
%%time
# NOTE(review): `trafficking_df` is assigned in the pipeline cell (In [19]) that
# sits further down in this notebook, so on a fresh kernel this cell only works
# after that cell has been run.
temporary_rule_3_and_4_solution(trafficking_df)

CPU times: user 1min 57s, sys: 196 ms, total: 1min 58s
Wall time: 1min 58s


Unnamed: 0,ecli,chunks,countries_present,relevant_chunks
0,ECLI:NL:RBNNE:2021:5018,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech...",[United States],[Op 19 februari 2021 is bij de doorzoeking op ...
1,ECLI:NL:RBZWB:2020:2646,"[RECHTBANK ZEELAND-WEST-BRABANT, Strafrecht, Z...","[United States, Germany, Belgium]",[1.in de periode van 1 oktober 2019 tot en me...
2,ECLI:NL:GHAMS:2019:1601,"[afdeling strafrecht, parketnummer: 23-001795-...","[Belgium, United States]",[1:zij op of omstreeks 21 september 2015 te Ho...
3,ECLI:NL:GHAMS:2019:1602,"[afdeling strafrecht, parketnummer: 23-001762-...","[Austria, Belgium]",[1:hij op of omstreeks 21 september 2015 te Ho...
4,ECLI:NL:RBNNE:2021:5026,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech...",[United States],[1.hij in of omstreeks de periode van 01 oktob...
...,...,...,...,...
64,ECLI:NL:RBSHE:2010:BO6588,"[vonnis, RECHTBANK 'S-HERTOGENBOSCH, Sector St...","[United States, Belgium, Germany]",[1. hij op of omstreeks 27 mei 2010 te Vlierde...
65,ECLI:NL:GHDHA:2013:3408,"[Rolnummer:\t\t22-006181-10 , Parketnummer:\t\...",[Belgium],[1.hij op een of meer tijdstippen in of omstre...
66,ECLI:NL:RBMAA:2010:BN3088,"[RECHTBANK MAASTRICHT , Sector strafrecht , pa...","[Belgium, Denmark, Germany]",[hij op of omstreeks 13 april 2010 in de gemee...
67,ECLI:NL:GHAMS:2021:3470,"[afdeling strafrecht, parketnummer: 23-000569-...",[Suriname],[Daar komt bovenop dat het een feit van algeme...


In [21]:
def enforce_rule_3(df):
    """Rule 3: keep cases that mention at least one country other than "Netherlands".

    Every chunk is run through the spaCy pipeline; each entity labelled "GPE"
    is resolved to a canonical country name with country_pipeline.

    Args:
        df: DataFrame with 'ecli' and 'chunks' (list of strings) columns.

    Returns:
        DataFrame with columns 'ecli', 'chunks' and 'countries_present' (the
        distinct canonical country names found, "Netherlands" included), one
        row per kept case, indexed 0..n-1.
    """
    columns = ['ecli', 'chunks', 'countries_present']
    # Collect records and build the frame once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    records = []
    for _, row in df.iterrows():
        chunks = row['chunks']
        countries_present = []
        # Reset for every case. This flag used to be assigned only inside the
        # loop, so it was undefined for a first case without a foreign country
        # and stayed True for every case after the first hit.
        has_foreign_country = False
        for chunk in chunks:
            for ent in nlp(chunk).ents:
                if ent.label_ != "GPE":
                    continue
                country = country_pipeline(ent.text)
                if country == "None" or country in countries_present:
                    continue
                # Store the canonical name (not the raw entity text) so the
                # membership test above really de-duplicates.
                countries_present.append(country)
                if country != "Netherlands":
                    has_foreign_country = True
        if has_foreign_country:
            records.append({
                'ecli': row['ecli'],
                'chunks': chunks,
                'countries_present': countries_present,
            })
    return pd.DataFrame(records, columns=columns)


In [29]:
# Drug names matched against the words of a chunk by enforce_rule_4.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('saves/drug_list.pkl', "rb") as open_file:
    drug_list = pickle.load(open_file)

def enforce_rule_4(df):
    """Rule 4: keep cases with at least one chunk mentioning a country and a drug.

    A chunk mentions a drug when one of its words (lower-cased, '.' removed,
    split on single spaces) is in `drug_list`. It mentions a country when one
    of its spaCy entities labelled "GPE" resolves through country_pipeline.

    Args:
        df: DataFrame with 'ecli', 'chunks' (list of strings) and
            'countries_present' columns.

    Returns:
        DataFrame with columns 'ecli', 'chunks' and 'countries_present', one
        row per kept case, indexed 0..n-1. Note that 'chunks' holds only the
        relevant chunks of the case, not all of them.
    """
    columns = ['ecli', 'chunks', 'countries_present']
    # Collect records and build the frame once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    records = []
    for _, row in df.iterrows():
        relevant_chunks = []
        for chunk in row['chunks']:
            word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x) > 0]
            # Cheap dictionary test first; only drug chunks go through spaCy.
            if not any(drug in word_list for drug in drug_list):
                continue
            for ent in nlp(chunk).ents:
                # spaCy spans expose the entity type as `label_`, as used in
                # enforce_rule_3.
                if ent.label_ == "GPE" and country_pipeline(ent.text) != "None":
                    relevant_chunks.append(chunk)
                    # One country is enough; stop so the chunk is listed once.
                    break

        if relevant_chunks:
            records.append({
                'ecli': row['ecli'],
                'chunks': relevant_chunks,
                'countries_present': row['countries_present'],
            })
    return pd.DataFrame(records, columns=columns)


In [18]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from libraries. Consider restricting the
# filter to the specific category that is known to be noisy.
warnings.filterwarnings('ignore')

In [19]:
# Run the filter pipeline on the full dataset and report the case count after
# each rule. `trafficking_df` is rebound at every stage, so after this cell it
# holds the chunked frame with 'ecli' and 'chunks' columns.
print(f"{len(merged_df)} cases in original df.")
trafficking_df = enforce_rule_1(merged_df)
print(f"{len(trafficking_df)} cases after rule 1.")
trafficking_df = enforce_rule_2(trafficking_df)
print(f"{len(trafficking_df)} cases after rule 2.")
trafficking_df = split_cases_in_chunks(trafficking_df)
# Rules 3 and 4 are currently disabled; temporary_rule_3_and_4_solution is
# called on `trafficking_df` in a separate cell instead.
# trafficking_df = enforce_rule_3(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 3.")
# trafficking_df = enforce_rule_4(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 4.")
trafficking_df

18492 cases in original df.
13719 cases after rule 1.
13598 cases after rule 2.


Unnamed: 0,ecli,chunks
0,ECLI:NL:RBNNE:2021:5018,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech..."
1,ECLI:NL:RBZUT:2003:AH9598,"[RECHTBANK ZUTPHEN, Meervoudige economische st..."
2,ECLI:NL:RBZWB:2020:2646,"[RECHTBANK ZEELAND-WEST-BRABANT, Strafrecht, Z..."
3,ECLI:NL:GHAMS:2019:1601,"[afdeling strafrecht, parketnummer: 23-001795-..."
4,ECLI:NL:GHAMS:2019:1602,"[afdeling strafrecht, parketnummer: 23-001762-..."
...,...,...
13593,ECLI:NL:RBAMS:2013:1294,"[RECHTBANK AMSTERDAM, VONNIS, 13/529106-06 (za..."
13594,ECLI:NL:PHR:2020:1106,"[PROCUREUR-GENERAAL, BIJ DE, HOGE RAAD DER NED..."
13595,ECLI:NL:GHAMS:2017:2618,"[parketnummer: 23-001217-13, datum uitspraak: ..."
13596,ECLI:NL:RBAMS:2013:BZ0392,"[RECHTBANK AMSTERDAM , VONNIS , 13/529144-06 ..."


In [None]:
# Persist the filtered frame. The context manager flushes and closes the file
# even if pickling raises, so a failed dump does not leave an open handle.
with open('saves/trafficking_df.pkl', "wb") as open_file:
    pickle.dump(trafficking_df, open_file)

### Results

In [46]:
# Persist the current `trafficking_df` to saves/trafficking_df.pkl (the same
# path the pickle.dump cell above writes to).
trafficking_df.to_pickle('saves/trafficking_df.pkl')

In [None]:
# Display the resulting frame as the cell output.
trafficking_df