# Create dataset

In [1]:
import pandas as pd
import os

In [2]:
dataPath = os.getcwd() + '/court case data/testdata/'
caseCount = len(os.listdir(dataPath))
data = []
try:
    os.remove(dataPath + ".DS_Store")
except:
    print("No file DS_Store")
for filename in os.listdir(dataPath):
    f = open(os.path.join(dataPath, filename), encoding='utf-8')
    data.append([filename.replace('.txt', ''), f.read()])

verdict_df = pd.DataFrame(data, columns=["id", "case text"])
cases_df = pd.read_csv('./court case data/testdata.csv')
merged_df = cases_df.join(verdict_df.set_index('id'), on='id', how='left')

merged_df["verdict_date"] = pd.to_datetime(merged_df["verdict_date"])
merged_df["publication_date"] = pd.to_datetime(merged_df["publication_date"])

No file DS_Store


In [3]:
merged_df

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],\n\nafdeling strafrecht\nparketnummer: 23-0017...
...,...,...,...,...,...,...
18119,ECLI-NL-RBAMS-2013-1294,2013-01-29,2013-08-10,uitspraak,['Strafrecht'],\n\nRECHTBANK AMSTERDAM\n\n\nVONNIS\n\n \n\n13...
18120,ECLI-NL-PHR-2020-1106,2020-11-24,2020-11-24,conclusie,['Strafrecht'],\n\nPROCUREUR-GENERAAL\n\n\nBIJ DE\n\n\nHOGE R...
18121,ECLI-NL-GHAMS-2017-2618,2017-06-29,2017-05-07,uitspraak,['Strafrecht'],\n\n\nparketnummer: 23-001217-13\ndatum uitspr...
18122,ECLI-NL-RBAMS-2013-BZ0392,2013-01-29,2013-01-02,uitspraak,['Strafrecht'],\nRECHTBANK AMSTERDAM \nVONNIS \n\n13/529144-...


# Split documents

### Old

In [None]:
# -*- coding: utf-8 -*-
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr|mr|mevr|mvr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|nl)"
articles = "[artikel ][0-9][.][0-9]"

def split_into_sentences2(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(articles,"[artikelnummer]",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

### New

In [4]:
import re

In [5]:
def split_into_sentences(text):
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n', text)
    sentences = [x for x in sentences if len(x) > 1]
    return sentences


Two ways of splitting the documents:
    For Word2Vec, we need sentences to be an array of words.
    For the rest, just the sentence is enough.

In [6]:
tenlastelegging_words = ['tenlastelegging', 'telastelegging', 'tenlasteleggingen', 'telastlegging']

def trim_by_tenlastelegging(doc):
    trimmed_doc = doc
    stop = False
    for keyword in tenlastelegging_words:
        if not stop and keyword in doc:
            keyword_index = doc.find(keyword)
            trimmed_doc = doc[keyword_index:]
            stop = True
    return trimmed_doc

In [7]:
sentence_list_by_word = []
sentence_list = []

for i in range(len(merged_df)):
    doc = merged_df.iloc[i]['case text']
    trimmed_doc = trim_by_tenlastelegging(doc)
    sentences = split_into_sentences(trimmed_doc)
    sentence_list.append(sentences)
    for j in sentences:
        word_list = [x for x in j.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
        sentence_list_by_word.append(word_list)
        
print(len(sentence_list))
print(len(sentence_list_by_word))

18124
4714882


Testers:

In [None]:
for i in sentence_list[:1]:
    for j in i:
        print(j, '\n')

In [71]:
for i in sentence_list_by_word[:1]:
    for j in i:
        print(j, '\n')

rechtbank 

noord-nederland 



In [82]:
print(len(sentence_list))

18124


In [83]:
print(len(sentence_list_by_word))

4714882


# Create Word2Vec model

In [8]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

Create and save model

In [None]:
dutch_word2vec_model = Word2Vec(sentences=sentence_list_by_word, vector_size=100, window=5, min_count=1, workers=4)
dutch_word2vec_model.save("word2vec_dutch_court_cases.model")

Load model

In [9]:
dutch_word2vec_model = Word2Vec.load("word2vec_dutch_court_cases.model")

In [10]:
sims = dutch_word2vec_model.wv.most_similar('iraaks', topn=100)
print([i[0] for i in sims])
# print(sims)

['oost-maarland', 'hydroxypropylme-thylcelluloseftalaat', 'gelingen', 'siegburg', '17:48:35', 'cheng', 'ecli:nl:hr:2012:bw5879', '4982738', '4988208', 'ecli:nl:hr:2018:854', "dient'", 'opschrift?', 'r-2199a', '(fn:', 'spoor\t\t\t:', '12:40:22', 'uitspraak\t\t:', 'r-2230', 'a131#10', 'ecli:nl:rbams:2021:3243', 'tijdstip:\t\t15-03-2008', '2020-04-29', 'yep', 'aktiebolaget', '18-094602-18:', 'buen', '‘smerige', '(tolknummer', '80437\t€', 'ecli:nl:hr:2014:1174', 'ta006\t\t\t[telefoonnummer', 'aaji4890nl', 'jurgens\t', '(pag:', 'ibn-6', 'tewari)', 'paramethadion', 'kjccf40900264', 'ecli:nl:rbdha:2018:14330', 'kauwgoms', 'mr\thm', 'gram\tparacetamol', '16/661746-13:', '2703', 'broier', '(zaak:', 'oooohhh', 'wouters-debougnoux', "'treasurer'", '\tking', '({})', 'ecli:ep:ba:1994:t02969319940728', 'r-2195', 'aaiy5728nl', '\tvwet', 'r-2206', 'klok-', 'handtekening)"', '4982963', 'día', 'kutweek', '17679\t', '6:6:26', '1]3\t\t[medeverdachte', 'heerlijk!"', '1308\t\t\t€', '5505206', 'gemiddeld’', 

### Create list of drugs, smuggle, quantity keywords with Word2Vec model

In [11]:
def create_word2vec_relevant_words(words, matches):
    word2vec_list = []
    for word in words:
        results = dutch_word2vec_model.wv.most_similar(word, topn=100)
        for i in results:
            word2vec_list.append(i[0])
            
    word2vec_list = list(set([i for i in word2vec_list if word2vec_list.count(i)>matches]))
    return word2vec_list

Drugs list:

In [12]:
list_of_drugs = ['xtc', 'mdma', 'cocaine', 'wiet', 'speed', 'bmk', 'pmk']
word2vec_drug_list = create_word2vec_relevant_words(list_of_drugs, 2)

print(len(word2vec_drug_list))
print(word2vec_drug_list)

77
['2c-b', 'amfetaminen', 'pep', 'weed', '(met)amfetamine', 'mdma', 'pil', 'mdma/xtc', 'mapa', 'amfetamine’', 'tabletten', 'cafeïne', 'mdma-poeder', 'crack', 'heroïne', 'speed', 'cocaïne;', 'mdma)', 'ecstasy', 'paracetamol', 'hashish', 'amfetamine', 'lsd', 'xtc-pillen', 'mdma-pillen', 'marihuana', 'apaan', 'hasjiesj', 'hasj', 'azijnzuuranhydride', 'cocaïne', 'xtc', 'methadon', 'n-formylamfetamine', 'gbl', 'manitol', 'harddrugs', 'cocaïne?', 'cannabis', 'methamfetamine', 'ghb', 'amfetaminesulfaat', 'xtc-tabletten', 'eindproduct', 'safrol', 'hash', 'mefedron', 'pillen', 'lidocaïne', 'd-metamfetamine', 'coke', 'fenacetine', 'amfetaminebase', 'opium', 'mdma-kristallen', 'amfetamine)', '(mdma)', 'heroïne)', 'kristallen', 'amfetamineolie', 'diazepam', 'pseudo-efedrine', 'heroïne;', 'hennep', 'ketamine', 'amfetaminepasta', 'apaa', 'mdma;', 'cocaïne28', 'morfine', 'metamfetamine', 'methanol', 'crystal', 'cocaine', '34-methyleendioxymethamfetamine', 'drugs', 'cocaïne)']


In [13]:
drugs_to_exclude = ['pasta', 'kristallen', 'poedervorm']
word2vec_drug_list = [drug for drug in word2vec_drug_list if drug not in drugs_to_exclude]
print(len(word2vec_drug_list))

76


Smuggle keyword list:

In [14]:
list_of_smuggle_words = ['smokkel', 'invoer', 'uitvoer', 'import', 'export', 'transport']
word2vec_smuggle_list = create_word2vec_relevant_words(list_of_smuggle_words, 3)

word2vec_smuggle_list = list(set(word2vec_smuggle_list + list_of_smuggle_words ))
print(len(word2vec_smuggle_list))
print(word2vec_smuggle_list)

43
['handel', 'uithalen', 'produceren', 'verhandeling', 'doorverkoop', 'kweek', 'in-/uitvoer', 'import', 'invoer', 'produktie', 'doorvoer', 'smokkel', 'export', 'invoeren', 'vervaardiging', 'bewerking', 'verkoop', 'verwerking', 'transport', 'importeren', 'bronlanden', 'uitvoer', 'afzet', 'productie', 'doorlevering', 'exporteren', 'transporteren', 'levering', 'leverancier', 'versnijding', 'hennepteelt', 'transsport', 'hennephandel', '(invoer', 'cocaïnehandel', 'versnijden', 'fabricage', 'binnensmokkelen', 'gesmokkelde', 'aanvoer', 'productie/bewerking', 'bronland', 'leverantie']


In [15]:
# drugs_to_exclude = ['pasta', 'kristallen', 'poedervorm']
# word2vec_drug_list = [drug for drug in word2vec_drug_list if drug not in drugs_to_exclude]
# print(len(word2vec_drug_list))

Quantity keyword list:

In [16]:
list_of_quantity_words = ['tabletten', 'kilo', 'gram', 'pakketten']
word2vec_quantity_list = create_word2vec_relevant_words(list_of_quantity_words, 2)

word2vec_quantity_list = list(set(word2vec_quantity_list + list_of_quantity_words))
print(len(word2vec_quantity_list))
print(word2vec_quantity_list)

26
['pakjes', 'zakjes', 'milliliter', 'wikkels', 'gripzakjes', 'xtc-pillen', 'kg)', 'kilogram)', 'kilo', 'gram;', 'mdma-pillen', 'stuks', 'joints', 'tabletten', 'gram', 'gram)', 'bollen', 'pillen', 'ton', 'mg)', 'slikkersbollen', 'bolletjes', 'kilo)', 'blokken', 'pakketten', 'ponypacks']


Country list:

In [17]:
list_of_countries = ['duitsland', 'colombia', 'alicante']
word2vec_country_list = create_word2vec_relevant_words(list_of_countries, 1)

print(len(word2vec_country_list))
print(word2vec_country_list)

89
['polen', 'zwitserland', 'argentinië', 'sevilla', 'china', 'caracas', 'madrid', 'marokko', 'turkije', 'thailand', 'curaçao', 'bosnië', 'brussel', 'syrië', 'europa', 'portugal', 'warschau', 'jamaica', 'denemarken', 'roemenië', 'trinidad', 'antwerpen', 'buitenland', 'peru', 'parijs', 'natal', 'griekenland', 'sydney', 'iran', 'mexico', 'lissabon', 'kinshasa', 'dominicaanse', 'pakistan', 'lima', 'rica', 'servië', 'luxemburg', 'barcelona', 'duitsland)', 'istanbul', 'kenia', 'noorwegen', 'republiek', 'bulgarije', 'bogota', '[land]', 'canada', 'hongarije', 'oostenrijk', 'venezuela', 'congo', 'italië', 'engeland', 'tsjechië', 'tanger', 'spanje', 'paramaribo', 'berlijn', 'slowakije', 'guayaquil', 'malaga', 'ierland', 'gevlogen', 'kroatië', 'chili', 'singapore', 'panama', 'finland', 'londen', 'oekraïne', 'ghana', 'belgië', 'zuid-amerika', 'zweden', 'hamburg', 'belgië)', 'australië', 'groot-brittannië', 'costa', 'ecuador', 'nederland', 'brazilië', 'frankrijk', 'afrika', 'suriname', 'guatemala'

# Create SpaCy model

In [18]:
import spacy
from spacy import displacy
345# !python -m spacy download nl_core_news_md
nlp = spacy.load('nl_core_news_md')


In [19]:
my_file = open("drugs list.txt", "r", encoding='utf-8')
my_file = my_file.readlines()
drugs_list = []
for i in my_file:
    drugs_list.append(i.replace('\n', ''))
drugs_list = list(set(drugs_list + word2vec_drug_list))
    
my_file = open("countries list.txt", "r", encoding='utf-8')
my_file = my_file.readlines()
countries_list = []
for i in my_file:
    countries_list.append(i.replace('\n', ''))
countries_list = list(set(countries_list + word2vec_country_list))

my_file = open("countries_to_exclude.txt", "r", encoding='utf-8')
my_file = my_file.readlines()
countries_to_exclude = []
for i in my_file:
    countries_to_exclude.append(i.replace('\n', ''))


In [20]:
def configure_spacy_model():
    # Create dict of drug pattern and quantity pattern
    pattern_list = []
    
    for i in countries_to_exclude:
        pattern_list.append({"label": "EXCL", "pattern": [{"lower": i.lower()}]})
    
    for i in drugs_list:
        pattern_list.append({"label": "DRUG", "pattern": [{"lower": i.lower()}]})
    
#     quantity_rule = {"label": "QUANTITY", "pattern": [{"IS_DIGIT": True}, {"LOWER": "gram"}]}
#     pattern_list.append(quantity_rule)
    for i in word2vec_quantity_list:
        pattern_list.append({"label": "QUANTITY", "pattern": [{"IS_DIGIT": True}, {"LOWER": i}]})
        pattern_list.append({"label": "QUANTITY", "pattern": [{"ENT_TYPE": "CARDINAL"}, {"LOWER": i}]})
    
    for i in countries_list:
        pattern_list.append({"label": "GPE", "pattern": [{"lower": i.replace(' ', '').lower()}]})
    
    # Add drug and quantity rules to the model
    config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": True,
   "ent_id_sep": "||",
    }
    ruler = nlp.add_pipe("entity_ruler", config=config)

    #List of Entities and Patterns
#     patterns = drugs_ent_list
    ruler.add_patterns(pattern_list)

In [21]:
configure_spacy_model()

# Select cases and chunks to keep

In [22]:
print(len(sentence_list) == len(merged_df))

True


For every case, split the sentences. If a sentence in a case contains a drug, a smuggle word, and a location: keep chunk and save to trafficking_df

In [51]:
country_translation_dict = {}
countries_that_give_error = []

In [52]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "geoapiExercises")

def google_approves(loc):
    if loc in country_translation_dict:
        return True
    if loc in countries_that_give_error:
        return False
    else:
        try:
            location = geolocator.geocode(loc, language='en')
            country_name = location.raw['display_name'].split(',')[-1].replace(' ', '')
            country_translation_dict[loc] = country_name
            return True
        except:
            print(f"{loc} is not a location.")
            countries_that_give_error.append(loc)
            return False


In [53]:
print(len(sentence_list))

18124


In [54]:
%%time

relevant_chunk_list = []
ecli_list = []

for index, case in enumerate(sentence_list):
    chunk_list = []
    trafficking_related = False
    for chunk in case:
        word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
        if any(drug in word_list for drug in word2vec_drug_list) and any(smuggle_word in word_list for smuggle_word in word2vec_smuggle_list):
            ents = nlp(chunk).ents
            if any(ent.label_ == "GPE" for ent in ents):
                stop = False
                for ent in ents:
                    if not stop and ent.label_ == "GPE" and google_approves(ent.text):
                        trafficking_related = True
                        chunk_list.append(chunk)
                        stop = True
    if trafficking_related:
        relevant_chunk_list.append(chunk_list)
        ecli_list.append(merged_df.iloc[index]['id'].replace('-', ':'))

trafficking_df = pd.DataFrame({'id': pd.Series(ecli_list), 'chunks': pd.Series(relevant_chunk_list)})  

Veluwerand is not a location.
gemeente Valkenswaard is not a location.
Medeveroordeelde is not a location.
schuur/schuren is not a location.
fenetylline is not a location.
Liempde gemeente Boxtel is not a location.
Deugdelijke is not a location.
gemeente Nieuwkoop is not a location.
hennephandel is not a location.
gronddelict is not a location.
tenaamgestelde is not a location.
overheidscontroles is not a location.
Fosforzuur is not a location.
Duitland is not a location.
gemeente Maassluis is not a location.
gemeente Zandvoort is not a location.
Rappange is not a location.
gemeente Oude-IJsselstreek is not a location.
gemeente Eindhoven is not a location.
gemeente Mill is not a location.
luchtafzuigingen is not a location.
soortgevallen is not a location.
Gokoun is not a location.
Handhavingsprotocol is not a location.
huisartsenjournaal is not a location.
gemeente Alkmaar is not a location.
rechtbank Roermond is not a location.
Verenigd Koninkrijk.2Dat is not a location.
stoffen/chem

In [55]:
print(f"{len(trafficking_df)} cases kept from original {len(merged_df)} cases.")


2223 cases kept from original 18124 cases.


In [None]:
relevant_chunk_list = []
ecli_list = []

for index, case in enumerate(sentence_list):
    chunk_list = []
    trafficking_related = False
    for chunk in case:
        word_list = [x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
        if any(drug in word_list for drug in word2vec_drug_list) and any(smuggle_word in word_list for smuggle_word in word2vec_smuggle_list):
            ents = nlp(chunk).ents
            if any(ent.label_ == "GPE" or ent.label_ == "LOC" for ent in ents):
                trafficking_related = True
                chunk_list.append(chunk)
    if trafficking_related:
        relevant_chunk_list.append(chunk_list)
        ecli_list.append(merged_df.iloc[index]['id'].replace('-', ':'))

trafficking_df = pd.DataFrame({'id': pd.Series(ecli_list), 'chunks': pd.Series(relevant_chunk_list)})  

In [24]:
print(f"{len(trafficking_df)} cases kept from original {len(merged_df)} cases.")


2355 cases kept from original 18124 cases.


# Create rule-based NER & POS tagging model

### Old

In [249]:
def extract_chunk_info(txt):
    source_country = None
    total_info = []
    for token in nlp(txt):
        info = {}
        drug_info = {}
        countries = []
        
        if token.ent_type_ == "DRUG":
            info = {"drug": token.text}
            
            ## Get source and destination
            for ancestor in token.ancestors:
                for nephew in ancestor.children:
                    if nephew.ent_type_ == "GPE" or nephew.ent_type_ == "LOC":
                        countries.append(nephew)
                        for child in nephew.children:
                            if child.dep_ == "conj" and child.ent_type_ == "GPE" or child.ent_type_ == "LOC":
                                countries.append(child.text)
                            elif child.pos_ == "ADP" and child.dep_ == "case":
                                adj = child.text
            if len(countries) > 0 :
                try:
                    info[adj] = countries
                except:
                    info['land'] = countries
                        
            ## Get volume
            for ancestors in token.ancestors:
                for nephew in ancestors.children:
                    if nephew.ent_type_ == "QUANTITY" or nephew.ent_type_ == "CARDINAL":
                        for second_nephew in nephew.children:
                            if second_nephew.is_digit != nephew.is_digit:
                                if second_nephew.is_digit:
                                    info['volume'] = second_nephew.text
                                    info['volume_type'] = nephew.text
                                else:
                                    info['volume'] = nephew.text
                                    info['volume_type'] = second_nephew.text
            if 'volume' not in info:
                for child in token.children:
                    if (child.dep_ == "det" and child.like_num) or (child.dep_ == "nummod"):
                        info['volume'] = child.text
                                
        if len(info) > 1:
#             print(info)
            total_info.append(info)
    return total_info

### New

In [56]:
# Get linguistic distance between token a and token b. After iter 10 it is deemed a too far distance.
def get_linguistic_distance(a, b):
    tokens_to_consider = [b]
    found = False
    iters = 0
    while not found:
        for token in tokens_to_consider:
            tokens_to_add = []
            for ancestor in token.ancestors:
                if ancestor not in tokens_to_add and ancestor not in tokens_to_consider:
                    tokens_to_add.append(ancestor)
            for child in token.children:
                if child not in tokens_to_add and child not in tokens_to_consider:
                    tokens_to_add.append(child)
            tokens_to_consider = tokens_to_consider + tokens_to_add
        for x in tokens_to_consider:
            if a.orth == x.orth:
                found = True
        iters += 1
        if iters == 10:
            found = True
    return iters

In [57]:
def get_adposition_from_loc(token):
    for child in token.children:
        if child.pos_ == "ADP" and child.dep_ == "case":
            return child.text

In [58]:
def extract_chunk_info(txt):
    result = {}
    print('START CHUNK')
    for token in nlp(txt):
        if token.ent_type_ == "DRUG":
            print(f" \n Extracting info for {token.text}.")
            drug_info = extract_info_from_drug(token, txt)
            result[token.text] = drug_info
    return result
            
adj_list = []          
def extract_info_from_drug(drug, txt):
    volumes = []
    locations = {}
    irrelevant_locations = []
    for token in nlp(txt):
        
        # Extract countries
        if token.ent_type_ == "GPE":
            dist = get_linguistic_distance(drug, token)
#             if dist < 15:
            adj = get_adposition_from_loc(token)
            print(f"    {adj}: {token.text}, dist: {dist}, conj: {token.conjuncts}")
            locs = [token.text]
            for loc in token.conjuncts:
                locs.append(loc.text)
            if adj not in locations:
                locations[adj] = locs
                if adj not in adj_list:
                    adj_list.append(adj)
            else:
                for loc in locs:
                    if loc not in locations[adj]:
                        locations[adj].append(loc)
#             else:
#                 irrelevant_locations.append(token.text)
#                 print(f"{token.text} is irrelevant.")
        
        # Extract volume
        if token.ent_type_ == "QUANTITY":
            volume = {}
            dist = get_linguistic_distance(drug, token)
            second_token = ""
#             if dist < 10:
            quantity = {}
            for ancestor in token.ancestors:
                if ancestor.ent_type_ == "QUANTITY":
                    second_token = ancestor
            for child in token.children:
                if child.ent_type_ == "QUANTITY":
                    second_token = child

            ## Decide volume and volume_type
            if not isinstance(second_token, str):
                if nlp(token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = token.text
                    volume['volume_type'] = second_token.text
                    volume['dist'] = dist
                elif nlp(second_token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = second_token.text
                    volume['volume_type'] = token.text
                    volume['dist'] = dist

                #Only append when not already in volumes
                if volume not in volumes:
                    volumes.append(volume)
    #             else:
    #                 print(f"{token.text} is irrelevant.")
                
        
        
            
    print(volumes)

    result = {}
    if bool(locations):
        result['locations'] = locations
#     if len(volumes) > 0:
    result["volume"] = volumes
    if len(irrelevant_locations) > 0:
        result["irrelevant_locations"] = irrelevant_locations
    
    return result
    
        


### Without logging

In [59]:
def extract_chunk_info_without_log(txt):
    result = {}
    for token in nlp(txt):
        if token.ent_type_ == "DRUG":
            drug_info = extract_info_from_drug_without_log(token, txt)
            result[token.text] = drug_info
    return result
            
adj_list = []          
def extract_info_from_drug_without_log(drug, txt):
    volumes = []
    locations = {}
    irrelevant_locations = []
    for token in nlp(txt):
        
        # Extract countries
        if token.ent_type_ == "GPE":
            dist = get_linguistic_distance(drug, token)
#             if dist < 15:
            adj = get_adposition_from_loc(token)
            locs = [token.text]
            for loc in token.conjuncts:
                locs.append(loc.text)
            if adj not in locations:
                locations[adj] = locs
                if adj not in adj_list:
                    adj_list.append(adj)
            else:
                for loc in locs:
                    if loc not in locations[adj]:
                        locations[adj].append(loc)
#             else:
#                 irrelevant_locations.append(token.text)
#                 print(f"{token.text} is irrelevant.")
        
        # Extract volume
        if token.ent_type_ == "QUANTITY":
            volume = {}
            dist = get_linguistic_distance(drug, token)
            second_token = ""
#             if dist < 10:
            quantity = {}
            for ancestor in token.ancestors:
                if ancestor.ent_type_ == "QUANTITY":
                    second_token = ancestor
            for child in token.children:
                if child.ent_type_ == "QUANTITY":
                    second_token = child

            ## Decide volume and volume_type
            if not isinstance(second_token, str):
                if nlp(token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = token.text
                    volume['volume_type'] = second_token.text
                    volume['dist'] = dist
                elif nlp(second_token.text)[0].ent_type_ == "CARDINAL":
                    volume['volume'] = second_token.text
                    volume['volume_type'] = token.text
                    volume['dist'] = dist

                #Only append when not already in volumes
                if volume not in volumes:
                    volumes.append(volume)
    #             else:
    #                 print(f"{token.text} is irrelevant.")
                
        
        
            

    result = {}
    if bool(locations):
        result['locations'] = locations
#     if len(volumes) > 0:
    result["volume"] = volumes
    if len(irrelevant_locations) > 0:
        result["irrelevant_locations"] = irrelevant_locations
    
    return result
    
        


# Results

In [60]:
def fuse_chunks(chunks): 
    # Loop through chunks
    drug_dict = {}
    for chunk in chunks:
        chunk_info = extract_chunk_info_without_log(chunk)

        # Loop through drugs
        if chunk_info is not None:
            for drug in chunk_info:
                lowerdrug = drug.lower()
                if lowerdrug not in drug_dict:
                    drug_dict[lowerdrug] = {'locations': [], 'volumes': []}
                if len(chunk_info[drug]['volume']) > 0:
                    drug_dict[lowerdrug]['volumes'].append(chunk_info[drug]['volume'])
                if 'locations' in chunk_info[drug]:
                    drug_dict[lowerdrug]['locations'].append(chunk_info[drug]['locations'])
    return drug_dict

In [61]:
def fuse_locations(fused_chunks):
    for drug in fused_chunks:
        adjectives = {}
        for location_entry in fused_chunks[drug]['locations']:
            for adjective in location_entry:
                if adjective not in adjectives:
                    adjectives[adjective] = []
                for country in location_entry[adjective]:
                    if country not in adjectives[adjective]:
                        adjectives[adjective].append(country)
        fused_chunks[drug]['locations'] = adjectives
    return fused_chunks

In [62]:
def fuse_volumes(fused_locations):
    for drug in fused_locations:
        final_volume = {}
        dist = 100
        for volumes in fused_locations[drug]['volumes']:
            if len(volumes) > 0:
                for volume in volumes:
                    if 'dist' in volume:
                        if volume['dist'] < dist:
                            dist = volume['dist']
                            final_volume = volume
        fused_locations[drug]['volumes'] = final_volume
        return fused_locations

In [63]:
# country_translation_dict = {}
# countries_that_give_error = []

In [66]:
print(country_translation_dict)

{'Haarlem': 'Netherlands', 'Nederland': 'Netherlands', 'Argentinië': 'Argentina', 'Rotterdam': 'Netherlands', 'Jamaica': 'Jamaica', 'Utrecht': 'Netherlands', 'gemeente Tytsjerksteradiel': 'Netherlands', 'Duitsland': 'Germany', 'Bonn': 'Germany', 'Landgraaf': 'Netherlands', 'Oud-Gastel': 'Netherlands', 'Curaçao': 'Netherlands', 'Leende': 'Netherlands', 'Brabant': 'UnitedStates', 'Rijsbergen': 'Netherlands', 'Wernhout': 'Netherlands', 'Hoogeveen': 'Netherlands', 'Den Haag': 'Netherlands', 'Tilburg': 'Netherlands', 'Schiphol': 'Netherlands', 'Roermond': 'Netherlands', 'Sprundel': 'Netherlands', 'Essen': 'Germany', 'België': 'Belgium', 'Neede': 'Netherlands', 'Borculo': 'Netherlands', 'Kockengen': 'Netherlands', 'Nederhorst': 'Netherlands', 'Putten': 'Netherlands', 'Zeewolde': 'Netherlands', 'Leuven': 'Belgium', 'Baarn': 'Netherlands', 'Caldic': 'Netherlands', 'Maassluis': 'Netherlands', 'Roosendaal': 'Netherlands', 'Langdonk': 'Netherlands', 'Reek': 'Netherlands', 'gemeente Ermelo': 'Neth

In [67]:
def translate_locations(data):
    geolocator = Nominatim(user_agent = "geoapiExercises")
    if data is not None:
        for drug in data:
            if 'locations' in data[drug]:
                data[drug]['original_locations'] = data[drug]['locations'].copy()
                for adjective in data[drug]['locations']:
                    country_list = []
                    locations = data[drug]['locations'][adjective]
                    for loc in locations:
                        if loc not in country_translation_dict and loc not in countries_that_give_error:    
                            try:
                                location = geolocator.geocode(loc, language='en')
                                country_name = location.raw['display_name'].split(',')[-1].replace(' ', '')
                                country_translation_dict[loc] = country_name
                                if country_name not in country_list:
                                    country_list.append(country_name)
                            except Exception as e:
                                print(f"{loc} is not a location.")
                                countries_that_give_error.append(loc)
                        else:
                            if loc in country_translation_dict:
                                if country_translation_dict[loc] not in country_list:
                                    country_list.append(country_translation_dict[loc])
                    data[drug]['locations'][adjective] = country_list
    return data

In [68]:
def get_location_directions(data):
    from_adjectives = ['uit', 'vanuit', 'van']
    to_adjectives = ['naar']
    via_adjectives = ['via']
    if data is not None:
        for drug in data:
            fromlocs = []
            tolocs = []
            vialocs = []
            locations = data[drug]['locations']
            for adj in locations:
                if adj in from_adjectives:
                    for loc in locations[adj]:
                        if loc not in fromlocs:
                            fromlocs.append(loc)
                elif adj in to_adjectives:
                    for loc in locations[adj]:
                        if loc not in tolocs:
                            tolocs.append(loc)
                elif adj in via_adjectives:
                    for loc in locations[adj]:
                        if loc not in vialocs:
                            vialocs.append(loc)

            data[drug]['locations'] = {'from': fromlocs, 'to': tolocs, 'via': vialocs}
    return data

In [69]:
final_df = pd.DataFrame(columns=['ecli', 'drug', 'from', 'to', 'via'])    

    
    
from geopy.geocoders import Nominatim
                        
for index, row in trafficking_df.iterrows():
    id = row['id']
#     print(id)
    
    fused_chunks = fuse_chunks(row['chunks'])
    fused_locations = fuse_locations(fused_chunks)
    fused_volumes = fuse_volumes(fused_locations)
    translated_locations = translate_locations(fused_volumes)
    location_directions = get_location_directions(translated_locations)
    
#     print(location_directions)
    if location_directions is not None:
        for drug in location_directions:
            curr = location_directions[drug]
            fromloc = curr['locations']['from']
            toloc = curr['locations']['to']
            vialoc = curr['locations']['via']
            
            row = {'ecli': id, 'drug': drug, 'from': fromloc, 'to': toloc, 'via': vialoc}
            final_df = final_df.append(row, ignore_index=True)
        
final_df


doorzoekingen is not a location.
zoutzuur is not a location.
zwavelzuur is not a location.
mierenzuur is not a location.
diepvriezers is not a location.
vacuümmachine is not a location.
Neede/Borculo is not a location.
voorhanden is not a location.
uitblijft is not a location.
locaties is not a location.
medeverdachten is not a location.
gefaciliteerd is not a location.
gemaakt is not a location.
vervoerd is not a location.
1 is not a location.
4 is not a location.
  is not a location.
[ is not a location.
voorbereidingshandelingen is not a location.
opslaglocaties is not a location.
Verenigde is not a location.
voorbereidings-handelingen is not a location.
Nederlnd is not a location.
	 is not a location.
undercoveragenten is not a location.
] is not a location.
2 is not a location.
Uitleveringsbesluit is not a location.
Groot-Britannië is not a location.
uitvoer is not a location.
5 is not a location.
Beerzeveld is not a location.
telefoonnummers is not a location.
Nederlandtezamen is

Unnamed: 0,ecli,drug,from,to,via
0,ECLI:NL:RBZUT:2003:AH9598,drugs,[],[],[]
1,ECLI:NL:PHR:2007:BA1113,drugs,[],[],[]
2,ECLI:NL:GHAMS:2018:2662,cocaïne,[Argentina],[],[]
3,ECLI:NL:RBAMS:2017:9087,cocaïne,[],[],[]
4,ECLI:NL:RBAMS:2017:9085,cocaïne,[],[],[]
...,...,...,...,...,...
3847,ECLI:NL:HR:1998:ZD1191,hennep,[],[],[]
3848,ECLI:NL:HR:1998:ZD1191,marihuana,[],[],[]
3849,ECLI:NL:RBNHO:2013:10924,cocaïne,[],[],[]
3850,ECLI:NL:GHSHE:2021:3205,cocaïne,"[Netherlands, DominicanRepublic]",[DominicanRepublic],[]


In [144]:
final_df[final_df['ecli'] == 'ECLI:NL:RBOVE:2018:4873']

Unnamed: 0,ecli,drug,from,to,via


In [142]:
total_from_country_dict = {}
total_to_country_dict = {}

for index, row in final_df.iterrows():
    for i in row['from']:
        if i not in total_from_country_dict:
            total_from_country_dict[i] = 0
        total_from_country_dict[i] = total_from_country_dict[i] + 1
        
    for i in row['to']:
        if i not in total_to_country_dict:
            total_to_country_dict[i] = 0
        total_to_country_dict[i] = total_to_country_dict[i] + 1




print(sorted(total_from_country_dict.items(), key = lambda kv: kv[1]))
print('\n')
print(sorted(total_to_country_dict.items(), key = lambda kv: kv[1]))

[('Benin', 1), ('Australia', 1), ('Bolivia', 1), ('UnitedArabEmirates', 1), ('Czechia', 1), ('Liberia', 1), ('Angola', 1), ('Liechtenstein', 1), ('Greece', 1), ('Senegal', 1), ('Nigeria', 1), ('Asia', 2), ('Norway', 2), ('Ghana', 2), ('Iran', 2), ('Laos', 2), ('Ireland', 2), ('Panama', 2), ('Argentina', 3), ('Iceland', 3), ('Albania', 3), ('India', 4), ('Kenya', 4), ('Israel', 4), ('Denmark', 4), ('Mexico', 5), ('Romania', 5), ('Italy', 5), ('Chile', 5), ('Sweden', 6), ('Ukraine', 6), ('Thailand', 7), ('TrinidadandTobago', 8), ('Switzerland', 8), ('DemocraticRepublicoftheCongo', 9), ('Ethiopia', 9), ('UnitedKingdom', 12), ('Peru', 14), ('Venezuela', 14), ('Turkey', 16), ('Spain', 16), ('Ecuador', 16), ('China', 18), ('Suriname', 18), ('Germany', 20), ('Pakistan', 23), ('France', 23), ('UnitedStates', 25), ('Brazil', 26), ('Morocco', 39), ('DominicanRepublic', 53), ('Poland', 54), ('Colombia', 55), ('SouthAmerica', 67), ('Belgium', 121), ('Netherlands', 616)]


[('Jamaica', 1), ('Hungar

In [None]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent = "geoapiExercises")
location = geolocator.geocode("colombia")
country_name = location.raw['display_name'].split(',')[-1].replace(' ', '')
print("Country Name: ", country_name)

#### Get complete adjective list and country list

In [133]:
complete_adjective_list = []

In [134]:
for index, row in trafficking_df.iterrows():
    fused_chunks = fuse_chunks(row['chunks'])
    for drug in fused_chunks:
        locations = fused_chunks[drug]['locations']
        for adjective in locations:
            for adj in adjective.keys():
                complete_adjective_list.append(adj)
            
                
            
    

In [135]:
len(complete_adjective_list)

8049

In [136]:
list(set(complete_adjective_list))

['ten',
 'tussen',
 'Op',
 'inzake',
 'buiten',
 'aan',
 'onder',
 'nabij',
 'langs',
 'over',
 'vanaf',
 'van',
 'naar',
 'volgens',
 None,
 'rondom',
 'jegens',
 'per',
 'binnen',
 'te',
 'middels',
 'met',
 'omtrent',
 'achter',
 'door',
 'ter',
 'uit',
 'vanuit',
 'via',
 'tegen',
 'op',
 'bij',
 'in',
 'In',
 'om',
 'voor']

# Sandbox

In [120]:
trafficking_df

Unnamed: 0,id,chunks
0,ECLI:NL:RBZUT:2003:AH9598,[hij op tijdstippen in de periode 6 februari 2...
1,ECLI:NL:RBZWB:2020:2646,[- een (compleet) in werking zijnde laboratori...
2,ECLI:NL:PHR:2007:BA1113,[In de maand december 1998 ontstond onder ande...
3,ECLI:NL:GHAMS:2018:2662,[Dit feit heeft betrekking op een geslaagde in...
4,ECLI:NL:RBAMS:2017:9087,[Verdachte heeft in de ochtend van 5 april 201...
...,...,...
2585,ECLI:NL:GHSHE:2020:1730,[Gezien het vorenstaande is het hof van oordee...
2586,ECLI:NL:HR:1998:ZD1191,"[""4. hij in de periode van 1 januari 1993 tot ..."
2587,ECLI:NL:RBNHO:2013:10924,[Met de officier van justitie en de raadsvrouw...
2588,ECLI:NL:GHSHE:2021:3205,[hij verdachte in of omstreeks de periode van ...


In [147]:
case = list(trafficking_df[trafficking_df['id'] == 'ECLI:NL:GHAMS:2018:2662']['chunks'])[0]

In [148]:
for chunk in case:
    displacy.render(nlp(chunk), style = 'ent')
    extract_chunk_info(chunk)

START CHUNK
 
 Extracting info for cocaïne.
    vanuit: Argentinië, dist: 2, conj: ()
[{'volume': '900', 'volume_type': 'gram'}]


In [None]:
def get_entities(txt):
    doc = nlp(txt)
    for ent in doc.ents:
        print(ent, ent.label_)

Get all loc entities

In [47]:
def list_entities(entity):
    entity_list = []
    for i in range(len(trafficking_df)):
        chunks = list(trafficking_df.iloc[i]['chunks'])
        for chunk in chunks:
            doc = nlp(chunk)
            for ent in doc.ents:
                if ent.label_ == entity:
#                     print(trafficking_df.iloc[i]['id'])
                    if ent.text not in entity_list:
                        entity_list.append(ent.text)
    return entity_list

In [187]:
loc_list = list_entities("LOC")


In [188]:
print(loc_list)

['gaswasser', 'Noordzee', 'Ettenseweg', 'Klein Horendonk', 'Maasdelta', 'Rotterdamse haven', 'medeveroordeelde', 'Prins Clausstraat', 'Westerschelde', 'Daartegenover', 'Wvmc', 'NFI-onderzoek', 'woning.2', 'inrichting/opbouw', 'Verderop', 'ZD04', 'Westelijke Havendijk', 'IJmond', 'Leuckartmethode', 'Eindhovenseweg', 'Zuid', 'koeriersauto', 'Tonsdijk', 'mengsels/substanties', 'Zuid-Limburgse', 'Industrieweg', 'Oost', 'micro-cellulose', 'Opiumwetbesluit', 'Wieringerwaardstraat', 'Luikerweg', 'verwijzingsbevel', 'Pijp', 'Lier', 'auditu-verklaringen', 'Keton', 'Rijn', 'Josemans-arrest', 'Bouchereau-arrest', 'schuur.5', 'Vogelweide', 'Ermelo4', 'Stationsweg', 'beek', 'Roteb', 'Gezinsherenigingsrichtlijn', 'Korvelseweg', 'Berkel', 'Arnhem', 'reactieketels L2', 'Gravesend', 'Sint Hubertuslaan', 'Tweede Groenedijk', 'Ermerweg', 'Diemen-Zuid', 'Noord-Beemster', 'Douanelaboratorium', 'Efedrine', 'enkelslag tabletteermachine', 'Einlassung', 'hennepkweek', 'AE-1-2017', 'Spaanse Polder', 'Nagel-onde

In [None]:
gpe_list = list_entities("GPE")
print(gpe_list)

In [49]:
for i in gpe_list:
    print(i)

Argentinië
Rotterdam
Nederland
Jamaica
Hoogeveen
gemeente Hoogeveen
Utrecht
Frankrijk
België
Duitsland
Bonn
Landgraaf
buitenland
Curaçao
Leende
drugs
Wernhout
Roden
Leek
Drachten
Tolbert
Tilburg
Roermond
Sprundel
Nederhorst
Putten
Zeewolde
Leuven
Neede/Borculo
Borculo
Baarn
MDMA‑kristallen
Langdonk
Roosendaal
Neede/
namen
Mallorca
Uden
Turnhout
Oisterwijk
Neede
Oost-Nederland
Dominicaanse
Medeveroordeelde
Zuid-Amerika
Ecuador
Italië
Schuinesloot
Zeeland
Esch
Rilland
Oosterhout
Montfoort
Ahaus
Moergestel
Arnhem
Brazilië
Oss
Antwerpen
gemeente Kaag
Braassem
Harderwijk
Mono
Peru
Amsterdam
Zweden
Lienden
Ommen
Eindhoven
gemeente Heerlen
Suriname
gemeente Venlo
Europa
gemeente Haarlem
Liessel
Alkmaar
Breda
shal
Colombia
fenetylline
Nuenen gemeente Nuenen
Seville 1
Polen
Knegsel
Noord-Nederland
Leeuwarden
Pakistan
Azië
Vriezenveen
Liempde gemeente Boxtel
Belgie
Hechtel-Eksel
Akersloot
Amstelveen
Gastel
Malaga
Marokko
Spanje
Tanger
gemeente Nieuwkoop
Houten
Engeland
Denemarken
Finland
Bergen
