# Create dataset

In [1]:
import pandas as pd
import os

In [2]:
dataPath = os.getcwd() + '/court case data/testdata/'
caseCount = len(os.listdir(dataPath))
data = []
try:
    os.remove(dataPath + ".DS_Store")
except:
    print("No file DS_Store")
for filename in os.listdir(dataPath):
    f = open(os.path.join(dataPath, filename), encoding='utf-8')
    data.append([filename.replace('.txt', ''), f.read()])

verdict_df = pd.DataFrame(data, columns=["id", "case text"])
cases_df = pd.read_csv('./court case data/testdata.csv')
merged_df = cases_df.join(verdict_df.set_index('id'), on='id', how='left')

merged_df["verdict_date"] = pd.to_datetime(merged_df["verdict_date"])
merged_df["publication_date"] = pd.to_datetime(merged_df["publication_date"])

No file DS_Store


In [3]:
merged_df

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],\n\nafdeling strafrecht\nparketnummer: 23-0017...
...,...,...,...,...,...,...
18119,ECLI-NL-RBAMS-2013-1294,2013-01-29,2013-08-10,uitspraak,['Strafrecht'],\n\nRECHTBANK AMSTERDAM\n\n\nVONNIS\n\n \n\n13...
18120,ECLI-NL-PHR-2020-1106,2020-11-24,2020-11-24,conclusie,['Strafrecht'],\n\nPROCUREUR-GENERAAL\n\n\nBIJ DE\n\n\nHOGE R...
18121,ECLI-NL-GHAMS-2017-2618,2017-06-29,2017-05-07,uitspraak,['Strafrecht'],\n\n\nparketnummer: 23-001217-13\ndatum uitspr...
18122,ECLI-NL-RBAMS-2013-BZ0392,2013-01-29,2013-01-02,uitspraak,['Strafrecht'],\nRECHTBANK AMSTERDAM \nVONNIS \n\n13/529144-...


# Split documents

### Old

In [None]:
# -*- coding: utf-8 -*-
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr|mr|mevr|mvr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|nl)"
articles = "[artikel ][0-9][.][0-9]"

def split_into_sentences2(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(articles,"[artikelnummer]",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

### New

In [74]:
import re

In [75]:
def split_into_sentences(text):
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n', text)
    sentences = [x for x in sentences if len(x) > 1]
    return sentences


Two ways of splitting the documents:
    For Word2Vec, we need sentences to be an array of words.
    For the rest, just the sentence is enough.

In [86]:
tenlastelegging_words = ['tenlastelegging', 'telastelegging', 'tenlasteleggingen', 'telastlegging']

def trim_by_tenlastelegging(doc):
    trimmed_doc = doc
    stop = False
    for keyword in tenlastelegging_words:
        if not stop and keyword in doc:
            keyword_index = doc.find(keyword)
            trimmed_doc = doc[keyword_index:]
            stop = True
    return trimmed_doc

In [88]:
sentence_list_by_word = []
sentence_list = []

for i in range(len(merged_df)):
    doc = merged_df.iloc[i]['case text']
    trimmed_doc = trim_by_tenlastelegging(doc)
    sentences = split_into_sentences(trimmed_doc)
    sentence_list.append(sentences)
    for j in sentences:
        word_list = [x for x in j.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
        sentence_list_by_word.append(word_list)
        
print(len(sentence_list))
print(len(sentence_list_by_word))

18124
4714882


Testers:

In [None]:
for i in sentence_list[:1]:
    for j in i:
        print(j, '\n')

In [71]:
for i in sentence_list_by_word[:1]:
    for j in i:
        print(j, '\n')

rechtbank 

noord-nederland 



In [82]:
print(len(sentence_list))

18124


In [83]:
print(len(sentence_list_by_word))

4714882


# Create Word2Vec model

In [84]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

Create and save model

In [85]:
dutch_word2vec_model = Word2Vec(sentences=sentence_list_by_word, vector_size=100, window=5, min_count=1, workers=4)
dutch_word2vec_model.save("word2vec_dutch_court_cases.model")

Load model

In [89]:
dutch_word2vec_model = Word2Vec.load("word2vec_dutch_court_cases.model")

In [90]:
sims = dutch_word2vec_model.wv.most_similar('tenlastelegging', topn=100)
print([i[0] for i in sims])
# print(sims)

['telastelegging', 'tenlasteleggingen', 'telastlegging', 'tenlastelegging)', 'beschuldiging', 'bewijsmiddelen', 'dagvaarding', 'verfeitelijking', 'bewijsvoering', 'bewijsconstructie', 'zinsnede', 'jeugdforensische', 'delictsomschrijving', 'bewezenverklaring', 'kwalificatieve', 'strafmotivering', 'doorgestreept', 'feitsomschrijving', 'bewijsbijlage', 'aanvrage', 'bewijsoverweging', 'strategienota', 'eab', 'tenlastelegging-', '‘uitbuiting’', 'pleitnotitie-', 'tenlastegelegde', 'bewijsmotivering', 'misslag', 'transcriptie:', 'documentaire', 'ontslagbrief', 'delictsomschrijvingen', 'pl0100-2019261171-62', 'pleegperiode', 'algemeenheden', 'kwalificatie', '‘varken’', 'verdediging', 'locatiefoto’s', 'nfi-rapport(1)', 'gedachtestreepjes', 'bob-aanvragen', 'taalfouten', 'hasholie', 'ovc-gesprekken', 'type-', 'raadsman', 'wettekst', 'afgedraaide', 'wamverzekering', '«toepassen»', 'cassatieschriftuur', 'herzieningsaanvraag', 'verdragsartikelen', 'bewezenverklaringen', '9051000', 'partieel', 'stat

### Create list of drugs, smuggle, quantity keywords with Word2Vec model

In [91]:
def create_word2vec_relevant_words(words, matches):
    word2vec_list = []
    for word in words:
        results = dutch_word2vec_model.wv.most_similar(word, topn=100)
        for i in results:
            word2vec_list.append(i[0])
            
    word2vec_list = list(set([i for i in word2vec_list if word2vec_list.count(i)>matches]))
    return word2vec_list

Drugs list:

In [92]:
list_of_drugs = ['xtc', 'mdma', 'cocaine', 'wiet', 'speed', 'bmk', 'pmk']
word2vec_drug_list = create_word2vec_relevant_words(list_of_drugs, 2)

print(len(word2vec_drug_list))
print(word2vec_drug_list)

77
['weed', 'methamfetamine', 'cocaine', 'safrol', 'ketamine', 'coke', 'pillen', 'gbl', 'manitol', 'xtc', 'xtc-tabletten', 'cocaïne;', 'ecstasy', 'diazepam', 'cocaïne', 'eindproduct', 'mdma', 'mefedron', 'mapa', 'amfetaminen', 'xtc-pillen', 'mdma-pillen', 'speed', 'n-formylamfetamine', 'crystal', 'morfine', 'heroïne)', 'cannabis', 'amfetaminepasta', 'mdma-poeder', 'cocaïne28', 'azijnzuuranhydride', 'lsd', 'hash', 'amfetamineolie', 'kristallen', 'amfetamine’', '(met)amfetamine', '34-methyleendioxymethamfetamine', 'drugs', '2c-b', 'pil', 'harddrugs', 'hashish', 'cocaïne)', 'hasjiesj', 'hennep', 'cocaïne?', 'mdma;', 'mdma/xtc', 'amfetaminesulfaat', '(mdma)', 'crack', 'paracetamol', 'ghb', 'hasj', 'methadon', 'amfetaminebase', 'cafeïne', 'methanol', 'metamfetamine', 'd-metamfetamine', 'marihuana', 'apaa', 'amfetamine', 'fenacetine', 'apaan', 'opium', 'heroïne;', 'pseudo-efedrine', 'mdma-kristallen', 'heroïne', 'mdma)', 'lidocaïne', 'pep', 'tabletten', 'amfetamine)']


In [93]:
drugs_to_exclude = ['pasta', 'kristallen', 'poedervorm']
word2vec_drug_list = [drug for drug in word2vec_drug_list if drug not in drugs_to_exclude]
print(len(word2vec_drug_list))

76


Smuggle keyword list:

In [94]:
list_of_smuggle_words = ['smokkel', 'invoer', 'uitvoer', 'import', 'export', 'transport']
word2vec_smuggle_list = create_word2vec_relevant_words(list_of_smuggle_words, 3)

word2vec_smuggle_list = list(set(word2vec_smuggle_list + list_of_smuggle_words ))
print(len(word2vec_smuggle_list))
print(word2vec_smuggle_list)

43
['smokkel', 'afzet', 'productie', 'handel', 'gesmokkelde', 'uithalen', 'verhandeling', 'bewerking', 'verwerking', 'leverancier', 'productie/bewerking', 'invoeren', 'uitvoer', 'doorverkoop', 'import', 'bronland', 'transport', 'cocaïnehandel', 'produceren', 'fabricage', 'exporteren', 'invoer', 'versnijden', 'doorlevering', 'hennephandel', 'verkoop', 'transsport', 'doorvoer', 'produktie', 'levering', 'bronlanden', 'binnensmokkelen', 'importeren', 'hennepteelt', '(invoer', 'versnijding', 'transporteren', 'vervaardiging', 'aanvoer', 'leverantie', 'export', 'in-/uitvoer', 'kweek']


In [95]:
# drugs_to_exclude = ['pasta', 'kristallen', 'poedervorm']
# word2vec_drug_list = [drug for drug in word2vec_drug_list if drug not in drugs_to_exclude]
# print(len(word2vec_drug_list))

Quantity keyword list:

In [96]:
list_of_quantity_words = ['tabletten', 'kilo', 'gram', 'pakketten']
word2vec_quantity_list = create_word2vec_relevant_words(list_of_quantity_words, 2)

word2vec_quantity_list = list(set(word2vec_quantity_list + list_of_quantity_words))
print(len(word2vec_quantity_list))
print(word2vec_quantity_list)

26
['pakjes', 'bollen', 'zakjes', 'milliliter', 'pillen', 'kilogram)', 'gram', 'ton', 'kilo', 'gram;', 'stuks', 'gripzakjes', 'ponypacks', 'joints', 'pakketten', 'bolletjes', 'kilo)', 'wikkels', 'blokken', 'tabletten', 'xtc-pillen', 'slikkersbollen', 'mdma-pillen', 'gram)', 'mg)', 'kg)']


Country list:

In [97]:
list_of_countries = ['duitsland', 'colombia', 'alicante']
word2vec_country_list = create_word2vec_relevant_words(list_of_countries, 1)

print(len(word2vec_country_list))
print(word2vec_country_list)

89
['sevilla', 'caracas', 'singapore', 'griekenland', 'dubai', 'spanje', 'canada', 'hongarije', 'iran', 'kinshasa', 'china', 'kenia', 'argentinië', 'curaçao', 'portugal', 'bosnië', 'tanger', 'peru', 'amerika', 'venezuela', 'istanbul', 'belgië', 'brussel', 'buitenland', 'republiek', 'turkije', 'chili', 'jamaica', 'panama', 'zwitserland', 'zuid-amerika', 'ierland', 'ecuador', 'hamburg', 'zweden', 'servië', 'guayaquil', 'malaga', 'polen', 'lissabon', 'oekraïne', 'australië', 'trinidad', 'gevlogen', 'dominicaanse', 'nederland', 'madrid', 'mexico', 'suriname', 'natal', 'congo', 'belgië)', 'costa', 'guatemala', 'antwerpen', 'tsjechië', 'londen', 'rica', 'roemenië', 'parijs', 'lima', 'pakistan', 'syrië', 'noorwegen', 'bulgarije', 'sydney', 'ghana', 'groot-brittannië', 'barcelona', 'denemarken', 'slowakije', '[land]', 'afrika', 'marokko', 'luxemburg', 'engeland', 'duitsland)', 'berlijn', 'bogota', 'brazilië', 'finland', 'kroatië', 'frankrijk', 'oostenrijk', 'thailand', 'europa', 'paramaribo', 

# Create SpaCy model

In [135]:
import spacy
from spacy import displacy
# !python -m spacy download nl_core_news_md
nlp = spacy.load('nl_core_news_md')


In [136]:
my_file = open("drugs list.txt", "r", encoding='utf-8')
my_file = my_file.readlines()
drugs_list = []
for i in my_file:
    drugs_list.append(i.replace('\n', ''))
drugs_list = list(set(drugs_list + word2vec_drug_list))
    
my_file = open("countries list.txt", "r", encoding='utf-8')
my_file = my_file.readlines()
countries_list = []
for i in my_file:
    countries_list.append(i.replace('\n', ''))
countries_list = list(set(countries_list + word2vec_country_list))

my_file = open("countries_to_exclude.txt", "r", encoding='utf-8')
my_file = my_file.readlines()
countries_to_exclude = []
for i in my_file:
    countries_to_exclude.append(i.replace('\n', ''))


In [137]:
def configure_spacy_model():
    # Create dict of drug pattern and quantity pattern
    pattern_list = []
    
    for i in countries_to_exclude:
        pattern_list.append({"label": "EXCL", "pattern": [{"lower": i.lower()}]})
    
    for i in drugs_list:
        pattern_list.append({"label": "DRUG", "pattern": [{"lower": i.lower()}]})
    
#     quantity_rule = {"label": "QUANTITY", "pattern": [{"IS_DIGIT": True}, {"LOWER": "gram"}]}
#     pattern_list.append(quantity_rule)
    for i in word2vec_quantity_list:
        pattern_list.append({"label": "QUANTITY", "pattern": [{"IS_DIGIT": True}, {"LOWER": i}]})
        pattern_list.append({"label": "QUANTITY", "pattern": [{"ENT_TYPE": "CARDINAL"}, {"LOWER": i}]})
    
    for i in countries_list:
        pattern_list.append({"label": "GPE", "pattern": [{"lower": i.replace(' ', '').lower()}]})
    
    # Add drug and quantity rules to the model
    config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": True,
   "ent_id_sep": "||",
    }
    ruler = nlp.add_pipe("entity_ruler", config=config)

    #List of Entities and Patterns
#     patterns = drugs_ent_list
    ruler.add_patterns(pattern_list)

In [138]:
configure_spacy_model()

# Select cases and chunks to keep

In [101]:
print(len(sentence_list) == len(merged_df))

True


For every case, split the sentences. If a sentence in a case contains a drug, a smuggle word, and a location: keep chunk and save to trafficking_df

In [102]:
relevant_chunk_list = []
ecli_list = []

for index, case in enumerate(sentence_list):
    chunk_list = []
    trafficking_related = False
    for chunk in case:
        word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
        if any(drug in word_list for drug in word2vec_drug_list) and any(smuggle_word in word_list for smuggle_word in word2vec_smuggle_list):
            ents = nlp(chunk).ents
            if any(ent.label_ == "GPE" or ent.label_ == "LOC" for ent in ents):
                trafficking_related = True
                chunk_list.append(chunk)
    if trafficking_related:
        relevant_chunk_list.append(chunk_list)
        ecli_list.append(merged_df.iloc[index]['id'].replace('-', ':'))

trafficking_df = pd.DataFrame({'id': pd.Series(ecli_list), 'chunks': pd.Series(relevant_chunk_list)})       

In [103]:
print(f"{len(trafficking_df)} cases kept from original {len(merged_df)} cases.")


2502 cases kept from original 18124 cases.


# Create rule-based NER & POS tagging model

### Old

In [249]:
def extract_chunk_info(txt):
    source_country = None
    total_info = []
    for token in nlp(txt):
        info = {}
        drug_info = {}
        countries = []
        
        if token.ent_type_ == "DRUG":
            info = {"drug": token.text}
            
            ## Get source and destination
            for ancestor in token.ancestors:
                for nephew in ancestor.children:
                    if nephew.ent_type_ == "GPE" or nephew.ent_type_ == "LOC":
                        countries.append(nephew)
                        for child in nephew.children:
                            if child.dep_ == "conj" and child.ent_type_ == "GPE" or child.ent_type_ == "LOC":
                                countries.append(child.text)
                            elif child.pos_ == "ADP" and child.dep_ == "case":
                                adj = child.text
            if len(countries) > 0 :
                try:
                    info[adj] = countries
                except:
                    info['land'] = countries
                        
            ## Get volume
            for ancestors in token.ancestors:
                for nephew in ancestors.children:
                    if nephew.ent_type_ == "QUANTITY" or nephew.ent_type_ == "CARDINAL":
                        for second_nephew in nephew.children:
                            if second_nephew.is_digit != nephew.is_digit:
                                if second_nephew.is_digit:
                                    info['volume'] = second_nephew.text
                                    info['volume_type'] = nephew.text
                                else:
                                    info['volume'] = nephew.text
                                    info['volume_type'] = second_nephew.text
            if 'volume' not in info:
                for child in token.children:
                    if (child.dep_ == "det" and child.like_num) or (child.dep_ == "nummod"):
                        info['volume'] = child.text
                                
        if len(info) > 1:
#             print(info)
            total_info.append(info)
    return total_info

### New

In [114]:
# Get linguistic distance between token a and token b. After iter 10 it is deemed a too far distance.
def get_linguistic_distance(a, b):
    tokens_to_consider = [b]
    found = False
    iters = 0
    while not found:
        for token in tokens_to_consider:
            tokens_to_add = []
            for ancestor in token.ancestors:
                if ancestor not in tokens_to_add and ancestor not in tokens_to_consider:
                    tokens_to_add.append(ancestor)
            for child in token.children:
                if child not in tokens_to_add and child not in tokens_to_consider:
                    tokens_to_add.append(child)
            tokens_to_consider = tokens_to_consider + tokens_to_add
        for x in tokens_to_consider:
            if a.orth == x.orth:
                found = True
        iters += 1
        if iters == 10:
            found = True
    return iters

In [115]:
def get_adposition_from_loc(token):
    for child in token.children:
        if child.pos_ == "ADP" and child.dep_ == "case":
            return child.text

In [140]:
def extract_chunk_info(txt):
    result = {}
    print('START CHUNK')
    for token in nlp(txt):
        if token.ent_type_ == "DRUG":
            print(f" \n Extracting info for {token.text}.")
            drug_info = extract_info_from_drug(token, txt)
            result[token.text] = drug_info
    return result
            
adj_list = []          
def extract_info_from_drug(drug, txt):
    volumes = []
    locations = {}
    irrelevant_locations = []
    for token in nlp(txt):
        
        # Extract countries
        if token.ent_type_ == "GPE":
            dist = get_linguistic_distance(drug, token)
#             if dist < 15:
            adj = get_adposition_from_loc(token)
            print(f"    {adj}: {token.text}, dist: {dist}, conj: {token.conjuncts}")
            locs = [token.text]
            for loc in token.conjuncts:
                locs.append(loc.text)
            if adj not in locations:
                locations[adj] = locs
                if adj not in adj_list:
                    adj_list.append(adj)
            else:
                for loc in locs:
                    if loc not in locations[adj]:
                        locations[adj].append(loc)
#             else:
#                 irrelevant_locations.append(token.text)
#                 print(f"{token.text} is irrelevant.")
        
        # Extract volume
        if token.ent_type_ == "QUANTITY":
            volume = {}
            dist = get_linguistic_distance(drug, token)
            second_token = ""
#             if dist < 10:
            quantity = {}
            for ancestor in token.ancestors:
                if ancestor.ent_type_ == "QUANTITY":
                    second_token = ancestor
            for child in token.children:
                if child.ent_type_ == "QUANTITY":
                    second_token = child

            ## Decide volume and volume_type
            if not isinstance(second_token, str):
                if nlp(token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = token.text
                    volume['volume_type'] = second_token.text
                    volume['dist'] = dist
                elif nlp(second_token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = second_token.text
                    volume['volume_type'] = token.text
                    volume['dist'] = dist

                #Only append when not already in volumes
                if volume not in volumes:
                    volumes.append(volume)
    #             else:
    #                 print(f"{token.text} is irrelevant.")
                
        
        
            
    print(volumes)

    result = {}
    if bool(locations):
        result['locations'] = locations
#     if len(volumes) > 0:
    result["volume"] = volumes
    if len(irrelevant_locations) > 0:
        result["irrelevant_locations"] = irrelevant_locations
    
    return result
    
        


### Without logging

In [117]:
def extract_chunk_info_without_log(txt):
    result = {}
    for token in nlp(txt):
        if token.ent_type_ == "DRUG":
            drug_info = extract_info_from_drug_without_log(token, txt)
            result[token.text] = drug_info
    return result
            
adj_list = []          
def extract_info_from_drug_without_log(drug, txt):
    volumes = []
    locations = {}
    irrelevant_locations = []
    for token in nlp(txt):
        
        # Extract countries
        if token.ent_type_ == "GPE":
            dist = get_linguistic_distance(drug, token)
#             if dist < 15:
            adj = get_adposition_from_loc(token)
            locs = [token.text]
            for loc in token.conjuncts:
                locs.append(loc.text)
            if adj not in locations:
                locations[adj] = locs
                if adj not in adj_list:
                    adj_list.append(adj)
            else:
                for loc in locs:
                    if loc not in locations[adj]:
                        locations[adj].append(loc)
#             else:
#                 irrelevant_locations.append(token.text)
#                 print(f"{token.text} is irrelevant.")
        
        # Extract volume
        if token.ent_type_ == "QUANTITY":
            volume = {}
            dist = get_linguistic_distance(drug, token)
            second_token = ""
#             if dist < 10:
            quantity = {}
            for ancestor in token.ancestors:
                if ancestor.ent_type_ == "QUANTITY":
                    second_token = ancestor
            for child in token.children:
                if child.ent_type_ == "QUANTITY":
                    second_token = child

            ## Decide volume and volume_type
            if not isinstance(second_token, str):
                if nlp(token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = token.text
                    volume['volume_type'] = second_token.text
                    volume['dist'] = dist
                elif nlp(second_token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = second_token.text
                    volume['volume_type'] = token.text
                    volume['dist'] = dist

                #Only append when not already in volumes
                if volume not in volumes:
                    volumes.append(volume)
    #             else:
    #                 print(f"{token.text} is irrelevant.")
                
        
        
            

    result = {}
    if bool(locations):
        result['locations'] = locations
#     if len(volumes) > 0:
    result["volume"] = volumes
    if len(irrelevant_locations) > 0:
        result["irrelevant_locations"] = irrelevant_locations
    
    return result
    
        


# Results

In [141]:
def get_info_per_chunk_per_case():
    id_list = []
    info_list = []
    for i in range(len(trafficking_df[:10])):
        chunks = trafficking_df.iloc[i]['chunks']
        id = trafficking_df.iloc[i]['id']
        id_list.append(id)
        print(id)
        results = []
        for chunk in chunks:
            info = extract_chunk_info(chunk)
            print(f"   Info: {info} \n")
            results.append(info)
        info_list.append(results)

    return pd.DataFrame(list(zip(id_list, info_list)),columns =['id', 'info'])

info_df = get_info_per_chunk_per_case()

ECLI:NL:RBNNE:2021:5018
START CHUNK
 
 Extracting info for BenzylMethylKeton.
[]
 
 Extracting info for BMK.
[]
 
 Extracting info for amfetamine.
[]
 
 Extracting info for MAPA.
[]
 
 Extracting info for BMK.
[]
 
 Extracting info for amfetamine.
[]
 
 Extracting info for BMK.
[]
 
 Extracting info for BMK.
[]
   Info: {'BenzylMethylKeton': {'volume': []}, 'BMK': {'volume': []}, 'amfetamine': {'volume': []}, 'MAPA': {'volume': []}} 

START CHUNK
 
 Extracting info for MAPA.
[]
 
 Extracting info for BMK.
[]
 
 Extracting info for amfetamine.
[]
   Info: {'MAPA': {'volume': []}, 'BMK': {'volume': []}, 'amfetamine': {'volume': []}} 

ECLI:NL:RBZUT:2003:AH9598
START CHUNK
 
 Extracting info for drugs.
    None: Haarlem, dist: 4, conj: (Amstelveen,)
    None: Amstelveen, dist: 4, conj: (Haarlem,)
[]
   Info: {'drugs': {'locations': {None: ['Haarlem', 'Amstelveen']}, 'volume': []}} 

ECLI:NL:RBZWB:2020:2646
START CHUNK
 
 Extracting info for BMK.
[]
 
 Extracting info for Benzylmethylketon

 
 Extracting info for cocaïne.
    in: Rotterdam, dist: 6, conj: ()
[]
   Info: {'cocaïne': {'locations': {'in': ['Rotterdam']}, 'volume': []}} 

START CHUNK
 
 Extracting info for cocaïne.
    in: Rotterdam, dist: 2, conj: ()
[]
   Info: {'cocaïne': {'locations': {'in': ['Rotterdam']}, 'volume': []}} 

START CHUNK
 
 Extracting info for cocaïne.
[]
   Info: {'cocaïne': {'volume': []}} 

START CHUNK
 
 Extracting info for cocaïne.
    in: Nederland, dist: 1, conj: ()
[]
   Info: {'cocaïne': {'locations': {'in': ['Nederland']}, 'volume': []}} 

ECLI:NL:RBAMS:2017:9089
START CHUNK
 
 Extracting info for cocaïne.
    in: Nederland, dist: 2, conj: ()
    in: Nederland, dist: 4, conj: ()
[]
   Info: {'cocaïne': {'locations': {'in': ['Nederland']}, 'volume': []}} 

START CHUNK
 
 Extracting info for cocaïne.
    in: Nederland, dist: 1, conj: ()
[]
   Info: {'cocaïne': {'locations': {'in': ['Nederland']}, 'volume': []}} 

START CHUNK
 
 Extracting info for cocaïne.
    in: Rotterdam, dist: 6

In [33]:
info_df.iloc[0]['info']

[{'drugs': {'locations': {None: ['Haarlem', 'Amstelveen']}, 'volume': []}}]

In [167]:
correct_id_list = []
corrected_from_list = []
corrected_to_list = []
corrected_drug_list = []

def parse_locations(df):
    for i in range(len(df)):
        from_location = []
        to_location = []
        drugs_present = []
        curr = info_df.iloc[i]
        print(curr["id"])
        for chunk in curr['info']:
#             print(chunk)
            for drug in chunk.keys():
                if 'locations' in chunk[drug]:
                    locations = chunk[drug]['locations']
                    drugs_present.append(drug)
                    print(drug)
    #                 print(locations)
                    for adj in from_adjectives:
                        if adj in locations:
    #                         print(f"{adj}: {locations[adj]}")
    #                         print(locations[adj])
                            for loc in locations[adj]:
                                if loc not in from_location and not loc == "Nederland":
    #                                 print(f"{adj} {loc} labeled as from")
                                    from_location.append(loc)
                    for adj in to_adjectives:
                        if adj in locations:
    #                         print(f"{adj}: {locations[adj]}")
    #                         print(locations[adj])
                            for loc in locations[adj]:
                                if loc not in to_location and not loc == "Nederland":
    #                                 print(f"{adj} {loc} labeled as to")
                                    to_location.append(loc)
        if len(from_location) == 0:
            from_location.append("Nederland")
        if len(to_location) == 0:
            to_location.append("Nederland")
        print(f"From: {from_location}, To: {to_location}")
        correct_id_list.append(curr['id'])
        corrected_from_list.append(from_location)
        corrected_to_list.append(to_location)
        corrected_drug_list.append(drugs_present)
        
            
        
parse_locations(info_df)    
    

ECLI:NL:RBZWB:2020:2646
From: ['Nederland'], To: ['Nederland']
ECLI:NL:PHR:2007:BA1113
From: ['Nederland'], To: ['Nederland']
ECLI:NL:GHAMS:2018:2662
cocaïne
From: ['Argentinië'], To: ['Nederland']
ECLI:NL:RBAMS:2017:9087
cocaïne
cocaïne
cocaïne
cocaïne
From: ['Rotterdam'], To: ['Rotterdam']
ECLI:NL:RBAMS:2017:9085
cocaïne
cocaïne
cocaïne
From: ['Nederland'], To: ['Rotterdam']
ECLI:NL:RBAMS:2017:9086
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBAMS:2017:9088
cocaïne
cocaïne
cocaïne
From: ['Nederland'], To: ['Rotterdam']
ECLI:NL:RBAMS:2017:9089
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
From: ['Nederland'], To: ['Rotterdam']
ECLI:NL:RBAMS:2017:9090
cocaïne
cocaïne
cocaïne
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBAMS:2017:9091
cocaïne
cocaïne
cocaïne
From: ['Nederland'], To: ['Rotterdam']
ECLI:NL:RBAMS:2017:9092
cocaïne
cocaïne
cocaïne
From: ['Nederland'], To: ['Rotterdam']
ECLI:NL:OGHACMB:2018:277
cocaïne
From: ['Nederland'], To: ['Jamaica']
ECLI:NL:RBLE

ECLI:NL:RBUTR:2007:BC1124
heroïne
From: ['Nederland'], To: ['Engeland']
ECLI:NL:RBGEL:2016:4563
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:GHSGR:2012:BY1881
heroïne
cocaïne
heroïne
cocaïne
harddrugs
From: ['Nederland'], To: ['buitenland']
ECLI:NL:PHR:2010:BK8490
harddrugs
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBGEL:2020:1349
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBBRE:2011:BP4346
BMK
amfetamine
amfetamine
From: ['Nederland'], To: ['Ulicoten']
ECLI:NL:RBZWB:2020:2721
MDMA
PMK
aceton
From: ['Nederland'], To: ['Roosendaal']
ECLI:NL:PHR:2012:BX9555
hasjiesj
heroïne
From: ['Nederland'], To: ['Luxemburg']
ECLI:NL:RBGRO:2010:BM9850
wiet
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBUTR:2012:7103
cocaïne
heroïne
cocaïne
heroïne
cocaïne
heroïne
From: ['Nieuwegein'], To: ['Nieuwegein']
ECLI:NL:RBZWB:2021:3406
amfetamine
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBAMS:2021:3457
heroïne
cocaïne
From: ['Nederland'], To: ['Amsterdam']
ECLI:NL:PHR

From: ['Nederland'], To: ['Roermond', 'België', 'BMK']
ECLI:NL:GHSHE:2009:BJ9370
cocaïne
cocaïne
From: ['Nederland'], To: ['buitenland']
ECLI:NL:PHR:2007:BA5037
cocaine
heroine
paracetamol
fenacetine
inositol
From: ['Rotterdam', 'tezamen'], To: ['Nederland']
ECLI:NL:PHR:2004:AO5050
harddrugs
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBGEL:2021:7218
crystal
meth
crystal
meth
crystal
meth
From: ['Nederland'], To: ['Emst']
ECLI:NL:RBROT:2011:BR4143
heroïne
From: ['Nederland'], To: ['Rotterdam']
ECLI:NL:RBMNE:2020:66
hasj
hennep
hasj
hasj
From: ['Nederland'], To: ['Spanje', 'Amsterdam']
ECLI:NL:RBGEL:2021:5948
heroïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:PHR:2007:BB3067
XTC-pillen
XTC-pillen
pillen
XTC-pillen
XTC-pillen
pillen
From: ['Antwerpen'], To: ['Lübeck', 'België', 'Duitsland']
ECLI:NL:RBNNE:2017:2946
cocaïne
XTC
speed
cocaïne
harddrugs
amfetamine
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
amfetamine
cocaïne
cocaïne
amfetam

From: ['Nederland'], To: ['Gilze']
ECLI:NL:RBLIM:2013:5263
From: ['Nederland'], To: ['Nederland']
ECLI:NL:GHSHE:2009:BX7214
amfetamine
From: ['Prinsenbeek'], To: ['Nederland']
ECLI:NL:RBGRO:2008:BC6010
cocaine
cocaïne
From: ['Nederland'], To: ['Duitsland', 'Amsterdam']
ECLI:NL:RBAMS:2014:7077
cocaïne
XTC/MDMA
cocaïne
amfetamine
From: ['Nederland.56'], To: ['Amsterdam', 'België']
ECLI:NL:RBLIM:2013:9592
harddrugs
From: ['Nederland'], To: ['Frankrijk']
ECLI:NL:GHAMS:2020:1041
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBROT:2017:1852
cocaïne
cocaïne
From: ['Midden-Amerika'], To: ['Nederland']
ECLI:NL:RBMNE:2015:7051
hennep
From: ['hennephandel', '-bezit'], To: ['Nederland']
ECLI:NL:PHR:2011:BP7871
heroïne
heroïne
heroïne
heroïne
heroïne
heroïne
From: ['Nederland'], To: ['Amsterdam', 'Groot-Brittannië', 'Rotterdam', 'Zwolle', 'Amersfoort']
ECLI:NL:GHARN:2005:AV1339
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBMNE:2016:6905
cocaïne
heroïne
harddrugs
harddrugs
From: ['Amersf

amfetamine
From: ['Eext', 'Goes', 'Bergen', 'Roosendaal', 'Goes-'], To: ['Nederland']
ECLI:NL:RBNNE:2021:1652
BMK
Benzylmethylketon
amfetamine
BMK
N-formylamfetamine
amfetamine
amfetamine
amfetamine
BMK
Benzylmethylketon
amfetamine
BMK
N-formylamfetamine
amfetamine
From: ['Eext'], To: ['Nederland']
ECLI:NL:GHAMS:2019:5162
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
cocaïne
From: ['Venezuela', 'Chili'], To: ['Nederland']
ECLI:NL:RBLIM:2019:1679
MDMA
GHB
MDMA
From: ['Nederland'], To: ['Duitsland', 'Geleen']
ECLI:NL:RBROT:2018:8542
heroïne
cocaïne
cocaïne
From: ['Antwerpen', 'Colombia'], To: ['Nederland']
ECLI:NL:RBAMS:2017:3949
metamfetamine
From: ['Delft'], To: ['Nederland']
ECLI:NL:RBGEL:2014:2341
heroïne
heroïne
From: ['Nederland'], To: ['buitenland']
ECLI:NL:RBDHA:2013:CA1921
cannabis
cannabis
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBAMS:2010:BN1279
cocaine
cocaïne
cocaïne
From: ['Amsterdam'], To: ['Nederland']
ECLI:NL:PHR:2010:BK0972
cocaïne
From: ['Nederland'], To: ['Aruba']
EC

harddrugs
weed
weed
harddrugs
From: ['Nederland'], To: ['Europa', 'Duitsland', 'buitenland']
ECLI:NL:RBLIM:2019:1674
MDMA
GHB
MDMA
MDMA
From: ['Nederland'], To: ['Duitsland', 'Geleen']
ECLI:NL:RBLIM:2021:5489
amfetamine
wiet
From: ['Nederland'], To: ['België']
ECLI:NL:PHR:2021:277
amfetamine
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBROT:2017:5125
cocaïne
From: ['Brazilië'], To: ['Nederland']
ECLI:NL:GHSHE:2014:955
hennep
hasjiesj
hennep
hasjiesj
From: ['Pakistan', 'hennep/hasjiesj'], To: ['Italië', 'Engeland']
ECLI:NL:RBAMS:2022:935
cocaïne
From: ['Antwerpen'], To: ['Nederland']
ECLI:NL:RBROT:2021:11863
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBROT:2021:199
cocaïne
From: ['Zuid-Amerika'], To: ['Nederland']
ECLI:NL:GHSHE:2020:611
cocaïne
From: ['Nederland'], To: ['Engeland']
ECLI:NL:RBAMS:2019:5609
heroïne
From: ['Nederland'], To: ['Spanje']
ECLI:NL:RBROT:2019:4228
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBAMS:2017:8642
cocaïne
heroïne
From: ['Nederl

ECLI:NL:RBOVE:2018:5085
harddrugs
hennep
From: ['Nederland'], To: ['buitenland']
ECLI:NL:RBAMS:2016:6282
hasj
coke
Hasj
From: ['Marokko'], To: ['Spanje']
ECLI:NL:RBAMS:2016:2339
cocaïne
cocaïne
cocaïne
marihuana
From: ['Zuid-Amerika'], To: ['België', 'Europa', 'Medellin']
ECLI:NL:GHAMS:2015:255
heroïne
From: ['Nederland'], To: ['Engeland']
ECLI:NL:RBNHO:2014:12947
cocaïne
From: ['Volendam'], To: ['Nederland']
ECLI:NL:GHDHA:2014:2173
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:PHR:2014:281
cocaïne
cocaïne
harddrugs
From: ['Aken'], To: ['Duitsland', 'Zweden']
ECLI:NL:RBHAA:2011:BQ8214
cocaïne
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBARN:2011:BQ2183
hennep
From: ['Nederland'], To: ['Nederland']
ECLI:NL:RBROT:2010:BO2925
diazepam
From: ['Nederland'], To: ['China']
ECLI:NL:RBHAA:2010:BL7581
hennep
hasjiesj
hennep
hasj
hennep
hennep
From: ['Marokko'], To: ['Spanje', 'Engeland', 'buitenland']
ECLI:NL:GHAMS:2009:BI6135
heroïne
XTC-pillen
heroïne
From: ['Nederland'], To: ['En

In [146]:
print(len(correct_id_list))
print(len(corrected_from_list))
print(len(corrected_to_list))
print(len(corrected_drug_list))

1000
1000
1000
1000


In [166]:
def parse_volume(df):
    for i in range(len(df[:15])):
        correct_volume = []
        drugs_present = []
        curr = info_df.iloc[i]
        print(curr["id"])
        for chunk in curr['info']:
#             print(chunk)
            for drug in chunk.keys():
                if 'volume' in chunk[drug]:
                    volumes = chunk[drug]['volume']
                    for volume in volumes:
                        if len(volume) > 0:
                            
                    
                    
parse_volume(info_df)

ECLI:NL:RBZWB:2020:2646
{'volume': '241', 'volume_type': 'liter', 'dist': 2}
{'volume': '8', 'volume_type': 'liter', 'dist': 2}
{'volume': '15', 'volume_type': 'liter', 'dist': 2}
{'volume': '5', 'volume_type': 'liter', 'dist': 2}
{'volume': '14', 'volume_type': 'liter', 'dist': 2}
{'volume': '241', 'volume_type': 'liter', 'dist': 10}
{'volume': '8', 'volume_type': 'liter', 'dist': 10}
{'volume': '15', 'volume_type': 'liter', 'dist': 10}
{'volume': '5', 'volume_type': 'liter', 'dist': 10}
{'volume': '14', 'volume_type': 'liter', 'dist': 10}
{'volume': '241', 'volume_type': 'liter', 'dist': 3}
{'volume': '8', 'volume_type': 'liter', 'dist': 3}
{'volume': '15', 'volume_type': 'liter', 'dist': 3}
{'volume': '5', 'volume_type': 'liter', 'dist': 3}
{'volume': '14', 'volume_type': 'liter', 'dist': 3}
{'volume': '241', 'volume_type': 'liter', 'dist': 3}
{'volume': '8', 'volume_type': 'liter', 'dist': 3}
{'volume': '15', 'volume_type': 'liter', 'dist': 2}
{'volume': '15', 'volume_type': 'liter

In [174]:
def fuse_chunks(chunks): 
    # Loop through chunks
    drug_dict = {}
    for chunk in chunks:
        chunk_info = extract_chunk_info_without_log(chunk)

        # Loop through drugs
        for drug in chunk_info:
            lowerdrug = drug.lower()
            if lowerdrug not in drug_dict:
                drug_dict[lowerdrug] = {'locations': [], 'volumes': []}
            if len(chunk_info[drug]['volume']) > 0:
                drug_dict[lowerdrug]['volumes'].append(chunk_info[drug]['volume'])
            if 'locations' in chunk_info[drug]:
                drug_dict[lowerdrug]['locations'].append(chunk_info[drug]['locations'])
    return drug_dict

In [None]:
def fuse_locations(fused_chunks):
    for drug in fused_chunks:
        adjectives = {}
        for location_entry in fused_chunks[drug]['locations']:
            for adjective in location_entry:
                if adjective not in adjectives:
                    adjectives[adjective] = []
                for country in location_entry[adjective]:
                    if country not in adjectives[adjective]:
                        adjectives[adjective].append(country)
        fused_chunks[drug]['locations'] = adjectives
    return fused_chunks

In [251]:
def fuse_volumes(fused_locations):
    for drug in fused_locations:
        final_volume = {}
        dist = 100
        for volumes in fused_locations[drug]['volumes']:
            if len(volumes) > 0:
                for volume in volumes:
                    if 'dist' in volume:
                        if volume['dist'] < dist:
                            dist = volume['dist']
                            final_volume = volume
        fused_locations[drug]['volumes'] = final_volume
        return fused_locations


# for i in range(len(trafficking_df[:30])):
#     curr = trafficking_df.iloc[i]
#     print(curr['id'])
    
#     fused_chunks = fuse_chunks(curr['chunks'])
#     fused_locations = fuse_locations(fused_chunks)
# #     fuse_volumes(fused_locations)
    
#     print('\n')
    
    
for index, row in trafficking_df[150:180].iterrows():
    print(row['id'])
    fused_chunks = fuse_chunks(row['chunks'])
    fused_locations = fuse_locations(fused_chunks)
    fused_volumes = fuse_volumes(fused_locations)
    
    print(fused_volumes)
    for drug in fused_volumes:
        print(len(fused_volumes[drug]['volumes']))
    

ECLI:NL:RBOBR:2018:74
{'drugs': {'locations': {'te': ['Best', 'Rotterdam']}, 'volumes': {}}, 'xtc-tabletten': {'locations': {}, 'volumes': []}}
0
0
ECLI:NL:GHAMS:2021:1722
{'cocaïne': {'locations': {'in': ['Peru', 'Zuid-Amerika', 'Amsterdam'], 'naar': ['Zweden']}, 'volumes': {'volume': '387', 'volume_type': 'kilo', 'dist': 1}}, 'amfetamine': {'locations': {'naar': ['Zweden']}, 'volumes': [[{'volume': '47', 'volume_type': 'kilo', 'dist': 4}, {'volume': '387', 'volume_type': 'kilo', 'dist': 2}, {'volume': '387', 'volume_type': 'kilo', 'dist': 1}]]}}
3
1
ECLI:NL:RBROT:2010:BO3803
{'drugs': {'locations': {'in': ['Nederland']}, 'volumes': {}}}
0
ECLI:NL:RBDHA:2017:6512
{'harddrugs': {'locations': {'in': ['gemeente', 'Braassem'], None: ['Kaag', 'Braassem', 'gemeente']}, 'volumes': {}}}
0
ECLI:NL:RBUTR:2009:BK1429
{'drugs': {'locations': {'te': ['Maastricht', 'één']}, 'volumes': {}}}
0
ECLI:NL:RBUTR:2009:BK1499
{'drugs': {'locations': {'te': ['Maastricht', 'één']}, 'volumes': {}}}
0
ECLI:NL:R

# Sandbox

In [120]:
trafficking_df

Unnamed: 0,id,chunks
0,ECLI:NL:RBZUT:2003:AH9598,[hij op tijdstippen in de periode 6 februari 2...
1,ECLI:NL:RBZWB:2020:2646,[- een (compleet) in werking zijnde laboratori...
2,ECLI:NL:PHR:2007:BA1113,[In de maand december 1998 ontstond onder ande...
3,ECLI:NL:GHAMS:2018:2662,[Dit feit heeft betrekking op een geslaagde in...
4,ECLI:NL:RBAMS:2017:9087,[Verdachte heeft in de ochtend van 5 april 201...
...,...,...
2585,ECLI:NL:GHSHE:2020:1730,[Gezien het vorenstaande is het hof van oordee...
2586,ECLI:NL:HR:1998:ZD1191,"[""4. hij in de periode van 1 januari 1993 tot ..."
2587,ECLI:NL:RBNHO:2013:10924,[Met de officier van justitie en de raadsvrouw...
2588,ECLI:NL:GHSHE:2021:3205,[hij verdachte in of omstreeks de periode van ...


In [147]:
case = list(trafficking_df[trafficking_df['id'] == 'ECLI:NL:GHAMS:2018:2662']['chunks'])[0]

In [148]:
for chunk in case:
    displacy.render(nlp(chunk), style = 'ent')
    extract_chunk_info(chunk)

START CHUNK
 
 Extracting info for cocaïne.
    vanuit: Argentinië, dist: 2, conj: ()
[{'volume': '900', 'volume_type': 'gram'}]


In [None]:
def get_entities(txt):
    doc = nlp(txt)
    for ent in doc.ents:
        print(ent, ent.label_)

Get all loc entities

In [47]:
def list_entities(entity):
    entity_list = []
    for i in range(len(trafficking_df)):
        chunks = list(trafficking_df.iloc[i]['chunks'])
        for chunk in chunks:
            doc = nlp(chunk)
            for ent in doc.ents:
                if ent.label_ == entity:
#                     print(trafficking_df.iloc[i]['id'])
                    if ent.text not in entity_list:
                        entity_list.append(ent.text)
    return entity_list

In [187]:
loc_list = list_entities("LOC")


In [188]:
print(loc_list)

['gaswasser', 'Noordzee', 'Ettenseweg', 'Klein Horendonk', 'Maasdelta', 'Rotterdamse haven', 'medeveroordeelde', 'Prins Clausstraat', 'Westerschelde', 'Daartegenover', 'Wvmc', 'NFI-onderzoek', 'woning.2', 'inrichting/opbouw', 'Verderop', 'ZD04', 'Westelijke Havendijk', 'IJmond', 'Leuckartmethode', 'Eindhovenseweg', 'Zuid', 'koeriersauto', 'Tonsdijk', 'mengsels/substanties', 'Zuid-Limburgse', 'Industrieweg', 'Oost', 'micro-cellulose', 'Opiumwetbesluit', 'Wieringerwaardstraat', 'Luikerweg', 'verwijzingsbevel', 'Pijp', 'Lier', 'auditu-verklaringen', 'Keton', 'Rijn', 'Josemans-arrest', 'Bouchereau-arrest', 'schuur.5', 'Vogelweide', 'Ermelo4', 'Stationsweg', 'beek', 'Roteb', 'Gezinsherenigingsrichtlijn', 'Korvelseweg', 'Berkel', 'Arnhem', 'reactieketels L2', 'Gravesend', 'Sint Hubertuslaan', 'Tweede Groenedijk', 'Ermerweg', 'Diemen-Zuid', 'Noord-Beemster', 'Douanelaboratorium', 'Efedrine', 'enkelslag tabletteermachine', 'Einlassung', 'hennepkweek', 'AE-1-2017', 'Spaanse Polder', 'Nagel-onde

In [None]:
gpe_list = list_entities("GPE")
print(gpe_list)

In [49]:
for i in gpe_list:
    print(i)

Argentinië
Rotterdam
Nederland
Jamaica
Hoogeveen
gemeente Hoogeveen
Utrecht
Frankrijk
België
Duitsland
Bonn
Landgraaf
buitenland
Curaçao
Leende
drugs
Wernhout
Roden
Leek
Drachten
Tolbert
Tilburg
Roermond
Sprundel
Nederhorst
Putten
Zeewolde
Leuven
Neede/Borculo
Borculo
Baarn
MDMA‑kristallen
Langdonk
Roosendaal
Neede/
namen
Mallorca
Uden
Turnhout
Oisterwijk
Neede
Oost-Nederland
Dominicaanse
Medeveroordeelde
Zuid-Amerika
Ecuador
Italië
Schuinesloot
Zeeland
Esch
Rilland
Oosterhout
Montfoort
Ahaus
Moergestel
Arnhem
Brazilië
Oss
Antwerpen
gemeente Kaag
Braassem
Harderwijk
Mono
Peru
Amsterdam
Zweden
Lienden
Ommen
Eindhoven
gemeente Heerlen
Suriname
gemeente Venlo
Europa
gemeente Haarlem
Liessel
Alkmaar
Breda
shal
Colombia
fenetylline
Nuenen gemeente Nuenen
Seville 1
Polen
Knegsel
Noord-Nederland
Leeuwarden
Pakistan
Azië
Vriezenveen
Liempde gemeente Boxtel
Belgie
Hechtel-Eksel
Akersloot
Amstelveen
Gastel
Malaga
Marokko
Spanje
Tanger
gemeente Nieuwkoop
Houten
Engeland
Denemarken
Finland
Bergen
