In [14]:
import spacy
from spacy import displacy
import pandas as pd
import os
import pickle
import requests
import warnings
warnings.filterwarnings('ignore')


# Spacy & Rule based

### Load SpaCy model

In [2]:
nlp = spacy.load('models/configured_spacy_model')

In [7]:
txt = "De verdachte smokkelde 123 kg mdma van Nederland naar Bergen op Zoom."

In [8]:
txt = "hij in of omstreeks de periode van 11 september 2018 tot en met 26 februari 2019 te Amsterdam, in elk geval in Nederland en/of Peru, tezamen en in vereniging met een ander of anderen, althans alleen, om een feit, bedoeld in het vierde of vijfde lid van artikel 10 van de Opiumwet, te weten het opzettelijk telen, bereiden, bewerken, verwerken, verkopen, afleveren, verstrekken, vervoeren, vervaardigen en/of binnen of buiten het grondgebied van Nederland brengen van heroïne en/of cocaïne, in elk geval een hoeveelheid van een materiaal bevattende heroïne en/of cocaïne, in elk geval een middel vermeld op de bij de Opiumwet behorende lijst I voor te bereiden en/of te bevorderen"

In [9]:
displacy.render(nlp(txt), style='ent')

### Load dataset

In [3]:
merged_df = pd.read_pickle("merged_df.pkl")
merged_df

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,inhoudsindicatie,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],"Productie synthetische drugs, medeplegen, prod...",\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],Leveren grondstoffen synthetische drugs en sto...,\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],plegen van voorbereidingshandelingen ten behoe...,\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
...,...,...,...,...,...,...,...
18457,ECLI-NL-RBAMS-2013-1294,2013-01-29,2013-08-10,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat er sprake is van...,\n\nRECHTBANK AMSTERDAM\n\n\nVONNIS\n\n \n\n13...
18458,ECLI-NL-PHR-2020-1106,2020-11-24,2020-11-24,conclusie,['Strafrecht'],Conclusie AG. Vervolging van een politieagent ...,\n\nPROCUREUR-GENERAAL\n\n\nBIJ DE\n\n\nHOGE R...
18459,ECLI-NL-GHAMS-2017-2618,2017-06-29,2017-05-07,uitspraak,['Strafrecht'],Liquidatieproces Passage\n\n ...,\n\n\nparketnummer: 23-001217-13\ndatum uitspr...
18460,ECLI-NL-RBAMS-2013-BZ0392,2013-01-29,2013-01-02,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat verdachte zich s...,\nRECHTBANK AMSTERDAM \nVONNIS \n\n13/529144-...


### Load country configs & country classifier

In [4]:
geopy_mistakes = {}

In [5]:
open_file = open('saves/countries_to_exclude.pkl', "rb")
countries_to_exclude = pickle.load(open_file)
open_file.close()

In [6]:
open_file = open('saves/country_translation_dict.pkl', "rb")
country_translation_dict = pickle.load(open_file)
open_file.close()

In [7]:
api_key = 'AIzaSyCcbIhMxSz5OP74pDT0aQLTvXDSMaV8tFk'
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json?address='

def get_geocode_country(txt):
    res = requests.get(f"{geocode_url}{txt}&key={api_key}").json()['results']
    country_name = "None"
    try:
        for address_component in res[0]['address_components']:
            if 'country' in address_component['types']:
                country_name = address_component['long_name']
    except:
        return "None"
    return country_name

In [8]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "geoapiExercises")

def get_geopy_country(txt):
    try:
        location = geolocator.geocode(txt, language='en')
        country_name = location.raw['display_name'].split(',')[-1].strip()
        return country_name
    except:
        return "None"

In [9]:
def country_pipeline(txt):
    txt = txt.lower()
    # Check if already exists
    if txt in countries_to_exclude:
        return "None"
    for key in country_translation_dict:
        if txt in country_translation_dict[key]:
            return key
    
    # Get location
    geopy_loc = get_geopy_country(txt)
    if geopy_loc == "None":
        countries_to_exclude.append(txt)
        return "None"
    else:
        geocode_loc = get_geocode_country(txt)
        if geocode_loc == "None":
            countries_to_exclude.append(txt)
            geopy_mistakes[txt] = geopy_loc
            return "None"
        else:
            if geocode_loc not in country_translation_dict:
                country_translation_dict[geocode_loc] = []
            country_translation_dict[geocode_loc].append(txt)
            return geocode_loc

In [10]:
country_pipeline('Nederlandse Antillen')

'Netherlands'

### Filter cases

In [11]:
import re
def split_text_in_chunks(doc):
    chunks = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n', doc)
    chunks = [x for x in chunks if len(x) > 1]
    return chunks

def split_cases_in_chunks(df):
    return_df = pd.DataFrame(columns=['ecli', 'chunks'])
    for index, row in df.iterrows():
        chunks = split_text_in_chunks(row['case text'])
        return_df = return_df.append({'ecli': row['id'].replace('-', ':'), 'chunks': chunks}, ignore_index = True)
    return return_df

Rules:
1. Case needs to concern criminal law
2. Case needs to contain a smuggle word
3. Case needs to contain a country that is not "Netherlands"
4. Case needs to contain a chunk that contains a country and a drug

In [12]:
def enforce_rule_1(df):
    return_df = pd.DataFrame()
    for index, row in df.iterrows():
        if 'Strafrecht' in row['jurisdiction_type']:
            return_df = return_df.append(row)
    return return_df

In [13]:
open_file = open('saves/smuggle_words.pkl', "rb")
smuggle_words = pickle.load(open_file)
open_file.close()

def enforce_rule_2(df):
    return_df = pd.DataFrame()
    for index, row in df.iterrows():
        if any(word in row['case text'] for word in smuggle_words):
            return_df = return_df.append(row)
    return return_df

In [14]:
def enforce_rule_3(df):
    return_df = pd.DataFrame(columns=['ecli', 'chunks', 'countries_present'])
    for index, row in df.iterrows():
        countries_present = []
        chunks = row['chunks']
        for chunk in chunks:
            append = False
            ents = nlp(chunk).ents
            for ent in ents:
                if ent.label_ == "GPE":
                    country = country_pipeline(ent.text)
                    if country != "None" and country not in countries_present:
                        countries_present.append(ent.text)
                        if country != "Netherlands":
                            append = True
        if len(countries_present) > 0 and append:
            return_df = return_df.append({'ecli': row['ecli'], 'chunks': chunks, 'countries_present': countries_present}, ignore_index = True)
    return return_df  


In [15]:
open_file = open('saves/drug_list.pkl', "rb")
drug_list = pickle.load(open_file)
open_file.close()

def enforce_rule_4(df):
    return_df = pd.DataFrame(columns['ecli', 'chunks', 'countries_present'])
    for index, row in df.iterrows():
        chunks = row['chunks']
        relevant_chunks = []
        for chunk in chunks:
            word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
            if any(drug in word_list for drug in drug_list):
                ents = nlp(chunk).ents
                for ent in ents:
                    if ent.type_ == "GPE":
                        country = country_pipeline(ent.text)
                        if country != "None":
                            relevant_chunks.append(chunk)
                        
        if len(relevant_chunks) > 0:
            return_df = return_df.append({'ecli': row['ecli'], 'chunks': relevant_chunks, 'countries_present': row['countries_present']}, ignore_index = True)
    return return_df


In [39]:
import warnings
warnings.filterwarnings('ignore')

#### Create temp df with custom ecli's 

In [None]:
ecli_list = ['ECLI:NL:RBNHO:2020:2746']

In [124]:
ecli_list = []
for id in list(merged_df['id'].sample(n=10, random_state=12)):
    ecli_list.append(id.replace('-', ':'))

ecli_list

['ECLI:NL:RBSGR:2004:AP0058',
 'ECLI:NL:RBARN:2012:BV8013',
 'ECLI:NL:GHSHE:2021:1482',
 'ECLI:NL:RBNNE:2019:1542',
 'ECLI:NL:RBBRE:2004:AR4371',
 'ECLI:NL:RBZWB:2020:2342',
 'ECLI:NL:HR:2011:BR2990',
 'ECLI:NL:OGEAA:2017:225',
 'ECLI:NL:PHR:2009:BJ2785',
 'ECLI:NL:HR:2010:BK4154']

In [125]:

corr_ecli_list = [ecli.replace(':', '-') for ecli in ecli_list]
temp_df = merged_df[merged_df['id'].isin(corr_ecli_list)]

### Execute

In [133]:
%%time
curr_df = merged_df
# curr_df = temp_df

print(f"{len(curr_df)} cases in original df.")
trafficking_df = enforce_rule_1(curr_df)
print(f"{len(trafficking_df)} cases after rule 1.")
trafficking_df = enforce_rule_2(trafficking_df)
print(f"{len(trafficking_df)} cases after rule 2.")
trafficking_df = split_cases_in_chunks(trafficking_df)
# trafficking_df = enforce_rule_3(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 3.")
# trafficking_df = enforce_rule_4(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 4.")
trafficking_df

18462 cases in original df.
13695 cases after rule 1.
13579 cases after rule 2.
CPU times: total: 1min 38s
Wall time: 1min 39s


Unnamed: 0,ecli,chunks
0,ECLI:NL:RBNNE:2021:5018,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech..."
1,ECLI:NL:RBZUT:2003:AH9598,"[RECHTBANK ZUTPHEN, Meervoudige economische st..."
2,ECLI:NL:RBZWB:2020:2646,"[RECHTBANK ZEELAND-WEST-BRABANT, Strafrecht, Z..."
3,ECLI:NL:GHAMS:2019:1601,"[afdeling strafrecht, parketnummer: 23-001795-..."
4,ECLI:NL:GHAMS:2019:1602,"[afdeling strafrecht, parketnummer: 23-001762-..."
...,...,...
13574,ECLI:NL:RBAMS:2013:1294,"[RECHTBANK AMSTERDAM, VONNIS, 13/529106-06 (za..."
13575,ECLI:NL:PHR:2020:1106,"[PROCUREUR-GENERAAL, BIJ DE, HOGE RAAD DER NED..."
13576,ECLI:NL:GHAMS:2017:2618,"[parketnummer: 23-001217-13, datum uitspraak: ..."
13577,ECLI:NL:RBAMS:2013:BZ0392,"[RECHTBANK AMSTERDAM , VONNIS , 13/529144-06 ..."


In [79]:
open_file = open('saves/drug_list.pkl', "rb")
drug_list = pickle.load(open_file)
open_file.close()

open_file = open('saves/countries_to_exclude.pkl', "rb")
countries_to_exclude = pickle.load(open_file)
open_file.close()

my_file = open("saves/countries_to_exclude.txt", "r")
content = my_file.read()
countries_to_exclude_from_txt = content.split('\n')

for loc in countries_to_exclude_from_txt:
    if loc not in countries_to_exclude:
        countries_to_exclude.append(loc)

open_file = open('saves/country_translation_dict.pkl', "rb")
country_translation_dict = pickle.load(open_file)
open_file.close()

valid_countries_list = []
for key in country_translation_dict:
    for loc in country_translation_dict[key]:
        if loc not in countries_to_exclude:
            valid_countries_list.append(loc)
print(f"{len(valid_countries_list)} locations in {len(country_translation_dict)} countries")


def temporary_rule_3_and_4_solution(df):
    return_df = pd.DataFrame(columns=['ecli', 'chunks', 'countries_present', 'relevant_chunks'])
    for index, row in df.iterrows():
        print(index)
        countries_present = {}
        relevant_chunks = []
        append = False
        for chunk in row['chunks']:
            word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
            for word in word_list:
                if word not in countries_to_exclude:
                    if word in valid_countries_list:
                        for key in country_translation_dict:
                            if word in country_translation_dict[key]:
                                if key not in countries_present and key != "Netherlands":
                                    if key not in countries_present:
                                        countries_present[key] = []
                                    countries_present[key].append(word)
                                    append = True
                                if any(drug in word_list for drug in drug_list):
                                    relevant_chunks.append(chunk)
                                       
        if append and len(relevant_chunks) > 0:
            return_df = return_df.append({'ecli': row['ecli'], 'chunks': row['chunks'], 'countries_present': countries_present, 'relevant_chunks': relevant_chunks}, ignore_index = True)
    return return_df

6402 locations in 181 countries


In [21]:
def temporary_rule_3_and_4_solution(df):
    return_df = pd.DataFrame(columns=['ecli', 'chunks', 'countries_present', 'relevant_chunks'])
    for index, row in df.iterrows():
        print(index)
        countries_present = {}
        relevant_chunks = []
        
        for chunk in row['chunks']:
            drug = False
            append = False
            for ent in nlp(chunk).ents:
                if ent.label_ == "GPE":
                    country = country_pipeline(ent.text)
                    if country != "None":
                        if country != "Netherlands":
                            append = True
                            if country not in countries_present:
                                countries_present[country] = []
                            if ent.text not in countries_present[country]:
                                countries_present[country].append(ent.text)
                elif ent.label_ == "DRUG":
                    drug = True
            
            if drug and append:
                relevant_chunks.append(chunk)
                            
        if len(relevant_chunks) > 0:
            return_df = return_df.append({'ecli': row['ecli'], 'chunks': row['chunks'], 'countries_present': countries_present, 'relevant_chunks': relevant_chunks}, ignore_index = True)
    return return_df

In [127]:
%%time
filtered_df = temporary_rule_3_and_4_solution(trafficking_df)

0
1
2
3
4
5
6
7
CPU times: total: 35 s
Wall time: 35 s


In [129]:
filtered_df

Unnamed: 0,ecli,chunks,countries_present,relevant_chunks
0,ECLI:NL:RBNNE:2019:1542,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech...","{'South Africa': ['Stratum'], 'Germany': ['lag...",[AAJC7176NL: bevat amfetamine gerelateerde syn...
1,ECLI:NL:OGEAA:2017:225,"[GERECHT IN EERSTE AANLEG VAN ARUBA, S T R A F...","{'Aruba': ['ARUBA', 'Aruba'], 'United States':...",[Dat hij op 1 november 2016 in Aruba tezamen e...
2,ECLI:NL:RBBRE:2004:AR4371,"[RECHTBANK BREDA , Parketnummer(s): 02/004325-...","{'Belgium': ['Zuid'], 'United Kingdom': ['Groo...",[Onder feit 1 primair is ten laste is gelegd h...


In [130]:
filtered_df['countries_present'].iloc[0]

{'South Africa': ['Stratum'], 'Germany': ['lage']}

In [None]:
filtered_df.to_pickle('saves/trafficking_df.pkl')

### Use threading

In [16]:
from threading import Thread

In [17]:
%%time
curr_df = merged_df[:1000]
# curr_df = temp_df

print(f"{len(curr_df)} cases in original df.")
trafficking_df = enforce_rule_1(curr_df)
print(f"{len(trafficking_df)} cases after rule 1.")
trafficking_df = enforce_rule_2(trafficking_df)
print(f"{len(trafficking_df)} cases after rule 2.")
trafficking_df = split_cases_in_chunks(trafficking_df)
# trafficking_df = enforce_rule_3(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 3.")
# trafficking_df = enforce_rule_4(trafficking_df)
# print(f"{len(trafficking_df)} cases after rule 4.")
trafficking_df

1000 cases in original df.
777 cases after rule 1.
759 cases after rule 2.
CPU times: total: 4.62 s
Wall time: 4.59 s


Unnamed: 0,ecli,chunks
0,ECLI:NL:RBNNE:2021:5018,"[RECHTBANK NOORD-NEDERLAND, Afdeling strafrech..."
1,ECLI:NL:RBZUT:2003:AH9598,"[RECHTBANK ZUTPHEN, Meervoudige economische st..."
2,ECLI:NL:RBZWB:2020:2646,"[RECHTBANK ZEELAND-WEST-BRABANT, Strafrecht, Z..."
3,ECLI:NL:GHAMS:2019:1601,"[afdeling strafrecht, parketnummer: 23-001795-..."
4,ECLI:NL:GHAMS:2019:1602,"[afdeling strafrecht, parketnummer: 23-001762-..."
...,...,...
754,ECLI:NL:RBMNE:2018:1565,"[RECHTBANK MIDDEN-NEDERLAND, Afdeling strafrec..."
755,ECLI:NL:RBLIM:2018:2961,"[RECHTBANK LIMBURG, Zittingsplaats Roermond, S..."
756,ECLI:NL:RBOVE:2016:2773,"[Rechtbank Overijssel, Afdeling Strafrecht , ..."
757,ECLI:NL:RBOBR:2016:2222,"[vonnis, RECHTBANK OOST-BRABANT , Zittingsplaa..."


In [23]:
%%time
temporary_rule_3_and_4_solution(trafficking_df[:10])

0
1
2
3
4
5
6
7
8
9
CPU times: total: 40.1 s
Wall time: 40.1 s


Unnamed: 0,ecli,chunks,countries_present,relevant_chunks
0,ECLI:NL:RBZWB:2020:2646,"[RECHTBANK ZEELAND-WEST-BRABANT, Strafrecht, Z...","{'United States': ['Linden', 'Allen', 'allen']...",[6x witte jerrycan inhoudsmaat 25 liter 2x zwa...
1,ECLI:NL:RBAMS:2019:4296,"[vonnis, RECHTBANK AMSTERDAM, Afdeling Publiek...","{'Peru': ['Peru'], 'Belgium': ['Leuven']}",[hij in of omstreeks de periode van 11 septemb...


In [25]:
filtered_df = pd.DataFrame(columns=['ecli', 'chunks', 'countries_present', 'relevant_chunks'])

def execute_the_thing(df):
    bb_df = temporary_rule_3_and_4_solution(df)
    filtered_df = filtered_df.append(bb_df)
    

In [26]:
%%time
t1 = Thread(target=execute_the_thing,args=(trafficking_df[:3],))
t1.start()
t2 = Thread(target=execute_the_thing,args=(trafficking_df[3:6],))
t2.start()
t3 = Thread(target=execute_the_thing,args=(trafficking_df[6:10],))
t3.start()

t1.join()
t2.join()
t3.join()


0
3
6
7
1
4
8
9
2
5


Exception in thread Thread-14 (execute_the_thing):
Traceback (most recent call last):
  File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\threading.py", line 946, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\casbe\AppData\Local\Temp/ipykernel_20288/965615678.py", line 5, in execute_the_thing
UnboundLocalError: local variable 'filtered_df' referenced before assignment
Exception in thread Thread-13 (execute_the_thing):
Traceback (most recent call last):
  File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\threading.py", line 946, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\casbe\AppData\Local\Temp/ipykernel_20288/965615678.py", line 5, in execute_the_thing
U

CPU times: total: 41.8 s
Wall time: 38.9 s


In [155]:
filtered_df

Unnamed: 0,ecli,chunks,countries_present,relevant_chunks


5
1
2
6
7
8
9
3
4
5
6
7
8
9


### Results

In [46]:
trafficking_df.to_pickle('saves/trafficking_df.pkl')

In [None]:
trafficking_df

# ML Based

### Import data

In [244]:
merged_df = pd.read_pickle("merged_df.pkl")

In [245]:
merged_df

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,inhoudsindicatie,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],"Productie synthetische drugs, medeplegen, prod...",\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],Leveren grondstoffen synthetische drugs en sto...,\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],plegen van voorbereidingshandelingen ten behoe...,\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
...,...,...,...,...,...,...,...
18487,ECLI-NL-RBAMS-2013-1294,2013-01-29,2013-08-10,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat er sprake is van...,\n\nRECHTBANK AMSTERDAM\n\n\nVONNIS\n\n \n\n13...
18488,ECLI-NL-PHR-2020-1106,2020-11-24,2020-11-24,conclusie,['Strafrecht'],Conclusie AG. Vervolging van een politieagent ...,\n\nPROCUREUR-GENERAAL\n\n\nBIJ DE\n\n\nHOGE R...
18489,ECLI-NL-GHAMS-2017-2618,2017-06-29,2017-05-07,uitspraak,['Strafrecht'],Liquidatieproces Passage\n\n ...,\n\n\nparketnummer: 23-001217-13\ndatum uitspr...
18490,ECLI-NL-RBAMS-2013-BZ0392,2013-01-29,2013-01-02,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat verdachte zich s...,\nRECHTBANK AMSTERDAM \nVONNIS \n\n13/529144-...


In [246]:
validation_df =  pd.read_excel('validation/trafficking_filter_validation_200.xlsx', index_col=0)
validation_df

Unnamed: 0_level_0,trafficking_related
ECLI,Unnamed: 1_level_1
ECLI:NL:RBSGR:2004:AP0058,False
ECLI:NL:RBARN:2012:BV8013,False
ECLI:NL:GHSHE:2021:1482,False
ECLI:NL:RBNNE:2019:1542,False
ECLI:NL:RBBRE:2004:AR4371,True
...,...
ECLI:NL:RBOBR:2014:546,False
ECLI:NL:RBNHO:2014:4763,False
ECLI:NL:GHARL:2021:6026,False
ECLI:NL:RBNNE:2022:1283,False


In [247]:
df = pd.DataFrame(columns=['ecli', 'text', 'val'])
for index, row in validation_df.iterrows():
    ecli = index
    text = merged_df[merged_df['id'] == ecli.replace(':', '-')].iloc[0]['case text'].replace('\n', ' ')
    val = row['trafficking_related']
    df = df.append({'ecli': ecli, 'text': text, 'val': val}, ignore_index=True)
df

Unnamed: 0,ecli,text,val
0,ECLI:NL:RBSGR:2004:AP0058,RECHTBANK te ‘s-GRAVENHAGE nevenzittingsplaa...,False
1,ECLI:NL:RBARN:2012:BV8013,RECHTBANK ARNHEM Sector strafrecht Meervoud...,False
2,ECLI:NL:GHSHE:2021:1482,GERECHTSHOF 's-HERTOGENBOSCH Team familie-...,False
3,ECLI:NL:RBNNE:2019:1542,RECHTBANK NOORD-NEDERLAND Afdeling strafrech...,False
4,ECLI:NL:RBBRE:2004:AR4371,RECHTBANK BREDA Parketnummer(s): 02/004325-0...,True
...,...,...,...
195,ECLI:NL:RBOBR:2014:546,uitspraak RECHTBANK OOST-BRABANT Stra...,False
196,ECLI:NL:RBNHO:2014:4763,RECHTBANK NOORD-HOLLAND Afdeling Publiekrec...,False
197,ECLI:NL:GHARL:2021:6026,Afdeling strafrecht Parketnummer:\t21-0022...,False
198,ECLI:NL:RBNNE:2022:1283,Uitspraak RECHTBANK NOORD-NEDERLAND Afdeli...,False


### TF-IDF

In [314]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus = list(df['text'])
X = vectorizer.fit_transform(corpus)
print(f"Amount of features in corpus: {len(vectorizer.get_feature_names())}\n")


train_percentage_percentage = 70
print(f"Using an {train_percentage_percentage}% train/test split")
cutoff_val = int(len(df) * (train_percentage_percentage / 100))
xarray = X.toarray()
X_train = xarray[:cutoff_val]
X_test = xarray[cutoff_val:]

y_train = list(df['val'])[:cutoff_val]
y_test = list(df['val'])[cutoff_val:]

print(f"X Train len: {len(X_train)} \nX Test len: {len(X_test)}")
print(f"y Train len: {len(y_train)} \ny Test len: {len(y_test)}")

Amount of features in corpus: 26592

Using an 70% train/test split
X Train len: 140 
X Test len: 60
y Train len: 140 
y Test len: 60


### Keras || classifies all 0

In [279]:
from keras.models import Sequential
from keras import layers

input_dim = X_train.shape[1]

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [280]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 10)                265930    
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 265,941
Trainable params: 265,941
Non-trainable params: 0
_________________________________________________________________


In [288]:
y_train_keras = list(map(int, y_train))
y_test_keras = list(map(int, y_test))

In [297]:
history = model.fit(X_train, np.array(y_train),
                    epochs=100,
                    verbose=True,
                    validation_data=(X_test, np.array(y_test)),
                    batch_size=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [298]:
loss, accuracy = model.evaluate(X_train, np.array(y_train), verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, np.array(y_test), verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.8500


### Logistic Regression || classifies all 0

In [270]:
from sklearn.linear_model import LogisticRegression

In [271]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
score

0.85

In [272]:
classifier.predict(X_test)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [275]:
print(f"Trained on {len(X_train)} and validated on {len(X_test)}\n")
print(f"Accuracy is {model.score(X_test, y_test)}")

pred = classifier.predict(X_test)
pred_list = list(pred)
val_list = list(map(int, list(validation_df[-len(list(pred)):]['trafficking_related'])))

pred = model.predict(X_test)
print(f"Predicted {pred_list.count(0)} as False \nPredicted {pred_list.count(1)} as True \n")

tn, fp, fn, tp = confusion_matrix(val_list, pred_list).ravel()
print(f"Precision: {tp / (tp + fp)}")
print(f"Recall: {tp / (tp + fn)}\n")


Trained on 160 and validated on 40

Accuracy is 0.975
Predicted 40 as False 
Predicted 0 as True 

Precision: nan
Recall: 0.0



### XGB

In [327]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [318]:
print(f"Trained on {len(X_train)} and validated on {len(X_test)}\n")
print(f"Accuracy is {model.score(X_test, y_test)}")

pred = model.predict(X_test)
pred_list = list(pred)
val_list = list(map(int, list(validation_df[-len(list(pred)):]['trafficking_related'])))

pred = model.predict(X_test)
print(f"Predicted {pred_list.count(0)} of {val_list.count(0)} as False \nPredicted {pred_list.count(1)} of {val_list.count(1)} as True \n")

tn, fp, fn, tp = confusion_matrix(val_list, pred_list).ravel()
print(f"Precision: {tp / (tp + fp)}")
print(f"Recall: {tp / (tp + fn)}\n")

i = 0
for index, row in validation_df[-len(list(pred)):].iterrows():
    if pred[i] != row['trafficking_related']:
        print(f"Classified {index} as {pred[i]} but was {row['trafficking_related']}")
    i += 1

        

Trained on 140 and validated on 60

Accuracy is 0.95
Predicted 56 of 53 as False 
Predicted 4 of 7 as True 

Precision: 1.0
Recall: 0.5714285714285714

Classified ECLI:NL:RBNHO:2018:7891 as 0 but was True
Classified ECLI:NL:HR:2016:684 as 0 but was True
Classified ECLI:NL:RBGEL:2016:1716 as 0 but was True


In [269]:
importances = pd.Series(data=model.feature_importances_, index=vectorizer.get_feature_names())
importances.sort_values(ascending=False)[:20]

douane           0.101960
grondgebied      0.085303
feitelijk        0.081685
raad             0.075222
verdachten       0.050031
21               0.047656
stuk             0.047321
binnenzijde      0.046238
betrokken        0.042018
verklaringen     0.029468
parketnummer     0.027301
woonplaats       0.025944
moesten          0.025666
doorgenummerd    0.025662
gezegd           0.024834
gedetineerd      0.024479
onderlinge       0.018708
betrokkenheid    0.018204
daarin           0.016858
wel              0.013697
dtype: float32

### Apply to all

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus = list(df['text'])
X = vectorizer.fit_transform(corpus)
print(f"Amount of features in corpus: {len(vectorizer.get_feature_names())}\n")


train_percentage_percentage = 70
print(f"Using an {train_percentage_percentage}% train/test split")
cutoff_val = int(len(df) * (train_percentage_percentage / 100))
xarray = X.toarray()
X_train = xarray[:cutoff_val]
X_test = xarray[cutoff_val:]

y_train = list(df['val'])[:cutoff_val]
y_test = list(df['val'])[cutoff_val:]

print(f"X Train len: {len(X_train)} \nX Test len: {len(X_test)}")
print(f"y Train len: {len(y_train)} \ny Test len: {len(y_test)}")

In [323]:
merged_df_corpus = [text.replace('\n', ' ') for text in list(merged_df['case text'])]



18492

In [329]:
X = vectorizer.fit_transform(merged_df_corpus)
print(f"Amount of features in corpus: {len(vectorizer.get_feature_names())}\n")

Amount of features in corpus: 339295



In [330]:
xarray = X.toarray()


In [331]:
model.predict(xarray)

ValueError: Feature shape mismatch, expected: 26592, got 339295