# Create dataset

In [None]:
import pandas as pd
import os

In [None]:
# Folder with one plain-text verdict per case; the file name (minus ".txt")
# is used as the case id.
dataPath = os.getcwd() + '/court case data/testdata/'

# Remove the ".DS_Store" entry if present so it is not read as a verdict.
# Only a missing file is expected here; any other error should surface.
try:
    os.remove(dataPath + ".DS_Store")
except FileNotFoundError:
    print("No file DS_Store")

# Counted after the cleanup so the number covers verdict files only.
caseCount = len(os.listdir(dataPath))

data = []
for filename in os.listdir(dataPath):
    # The context manager closes each handle right after reading.
    with open(os.path.join(dataPath, filename), encoding='utf-8') as f:
        data.append([filename.replace('.txt', ''), f.read()])

verdict_df = pd.DataFrame(data, columns=["id", "case text"])
cases_df = pd.read_csv('./court case data/testdata.csv')
# Left join: every row of the CSV is kept; rows without a matching text file
# end up with a missing "case text".
merged_df = cases_df.join(verdict_df.set_index('id'), on='id', how='left')

merged_df["verdict_date"] = pd.to_datetime(merged_df["verdict_date"])
merged_df["publication_date"] = pd.to_datetime(merged_df["publication_date"])

In [None]:
# Display the merged case table to sanity-check the join.
merged_df

# Split documents

### Old

In [None]:
# -*- coding: utf-8 -*-
import re

# Regex building blocks for the rule-based splitter. Raw strings keep the
# backslash escapes (e.g. ``\s``) from being treated as invalid string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr|mr|mevr|mvr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|nl)"
# NOTE(review): "[artikel ]" is a character class (one of a/r/t/i/k/e/l/space),
# not the literal word "artikel " -- presumably unintended; confirm before changing.
articles = "[artikel ][0-9][.][0-9]"


def split_into_sentences2(text):
    """Split *text* into sentences with hand-written abbreviation rules.

    Periods that belong to prefixes, suffixes, acronyms, single letters and
    website endings are protected with a ``<prd>`` placeholder; the remaining
    ``.``, ``?`` and ``!`` mark sentence ends.

    Args:
        text: The raw text to split.

    Returns:
        A list of stripped sentence strings. Trailing text without a final
        terminator is kept as the last sentence.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(articles, "[artikelnummer]", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move the terminator outside closing quotes so the quote stays attached
    # to its sentence.
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # Drop only an empty tail; an unconditional [:-1] would discard a final
    # sentence that has no terminator.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences

### New

In [None]:
import re

In [None]:
def split_into_sentences(text):
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n', text)
    sentences = [x for x in sentences if len(x) > 1]
    return sentences


The documents are split in two ways:
    For Word2Vec, each sentence must be a list of words.
    For everything else, the plain sentence string is enough.

In [None]:
# Spellings of "tenlastelegging" to look for, in priority order.
tenlastelegging_words = ['tenlastelegging', 'telastelegging', 'tenlasteleggingen', 'telastlegging']


def trim_by_tenlastelegging(doc):
    """Return *doc* starting at the first listed keyword it contains.

    Keywords are tried in list order; the first one present in *doc* decides
    the cut, made at that keyword's first occurrence. If none is present,
    *doc* is returned unchanged.
    """
    for keyword in tenlastelegging_words:
        position = doc.find(keyword)
        if position != -1:
            return doc[position:]
    return doc

In [None]:
# sentence_list: one list of sentence strings per case, in merged_df row order.
# sentence_list_by_word: one lower-cased word list per sentence, across all cases.
sentence_list = []
sentence_list_by_word = []

for doc in merged_df['case text']:
    sentences = split_into_sentences(trim_by_tenlastelegging(doc))
    sentence_list.append(sentences)
    sentence_list_by_word.extend(
        [word for word in sentence.lower().rstrip().replace('.', '').split(' ') if len(word) > 0]
        for sentence in sentences
    )

print(len(sentence_list))
print(len(sentence_list_by_word))

In [None]:
# Number of entries in sentence_list.
print(len(sentence_list))

In [None]:
# Number of entries in sentence_list_by_word.
print(len(sentence_list_by_word))

# Create Word2Vec model

In [None]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

Create and save model

In [None]:
# dutch_word2vec_model = Word2Vec(sentences=sentence_list_by_word, vector_size=100, window=5, min_count=1, workers=4)
# dutch_word2vec_model.save("word2vec_dutch_court_cases.model")

Load model

In [None]:
# Load the saved model from the working directory.
dutch_word2vec_model = Word2Vec.load("word2vec_dutch_court_cases.model")

In [None]:
# Probe the model: print the 100 words most similar to 'xtc'
# (only the first element of each result entry).
sims = dutch_word2vec_model.wv.most_similar('xtc', topn=100)
print([i[0] for i in sims])
# print(sims)

### Create list of drugs, smuggle, quantity keywords with Word2Vec model

In [None]:
def create_word2vec_relevant_words(words, matches):
    """Collect words that are similar to several of the seed *words*.

    For every seed word the 100 most similar words are requested from
    ``dutch_word2vec_model``. A candidate is kept when it appears in the
    result lists of more than *matches* seed words.

    Args:
        words: Seed words to query the model with.
        matches: Threshold; a candidate needs strictly more hits than this.

    Returns:
        A list of unique candidate words (order is not significant).
    """
    from collections import Counter

    # One counting pass instead of calling list.count() per element.
    hit_counts = Counter(
        result[0]
        for word in words
        for result in dutch_word2vec_model.wv.most_similar(word, topn=100)
    )
    return [candidate for candidate, hits in hit_counts.items() if hits > matches]

Drugs list:

In [None]:
# Seed drug names; keep neighbours that show up for more than 2 of the seeds.
# NOTE(review): unlike the smuggle and quantity lists, the seeds themselves are
# not merged back into the result here -- confirm this is intended.
list_of_drugs = ['xtc', 'mdma', 'cocaine', 'wiet', 'speed', 'bmk', 'pmk']
word2vec_drug_list = create_word2vec_relevant_words(list_of_drugs, 2)

print(len(word2vec_drug_list))
print(word2vec_drug_list)

In [None]:
# Remove hand-picked words from the drug list.
drugs_to_exclude = ['pasta', 'kristallen', 'poedervorm']
word2vec_drug_list = [drug for drug in word2vec_drug_list if drug not in drugs_to_exclude]
print(len(word2vec_drug_list))

Smuggle keyword list:

In [None]:
# Seed smuggling terms; keep neighbours that show up for more than 3 of the seeds.
list_of_smuggle_words = ['smokkel', 'invoer', 'uitvoer', 'import', 'export', 'transport']
word2vec_smuggle_list = create_word2vec_relevant_words(list_of_smuggle_words, 3)

# Add the seeds themselves and de-duplicate.
word2vec_smuggle_list = list(set(word2vec_smuggle_list + list_of_smuggle_words ))
print(len(word2vec_smuggle_list))
print(word2vec_smuggle_list)

In [None]:
# drugs_to_exclude = ['pasta', 'kristallen', 'poedervorm']
# word2vec_drug_list = [drug for drug in word2vec_drug_list if drug not in drugs_to_exclude]
# print(len(word2vec_drug_list))

Quantity keyword list:

In [None]:
# Seed quantity units; keep neighbours that show up for more than 2 of the seeds.
list_of_quantity_words = ['tabletten', 'kilo', 'gram', 'pakketten']
word2vec_quantity_list = create_word2vec_relevant_words(list_of_quantity_words, 2)

# Add the seeds themselves and de-duplicate.
word2vec_quantity_list = list(set(word2vec_quantity_list + list_of_quantity_words))
print(len(word2vec_quantity_list))
print(word2vec_quantity_list)

Country list:

In [None]:
# Seed place names; keep neighbours that show up for more than 1 of the seeds.
list_of_countries = ['duitsland', 'colombia', 'alicante']
word2vec_country_list = create_word2vec_relevant_words(list_of_countries, 1)

print(len(word2vec_country_list))
print(word2vec_country_list)

# Create SpaCy model

In [None]:
import spacy
from spacy import displacy
# Run the line below once if the model is not installed yet.
# !python -m spacy download nl_core_news_md
nlp = spacy.load('nl_core_news_md')


In [None]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at *path*, newline characters removed."""
    # The context manager closes the handle once all lines are read.
    with open(path, "r", encoding='utf-8') as handle:
        return [line.replace('\n', '') for line in handle]


# Curated lists from disk, extended with the Word2Vec-derived lists and de-duplicated.
drugs_list = list(set(_read_lines("drugs list.txt") + word2vec_drug_list))
countries_list = list(set(_read_lines("countries list.txt") + word2vec_country_list))

# Names to label "EXCL" in the spaCy patterns; used as read, no de-duplication.
countries_to_exclude = _read_lines("countries_to_exclude.txt")


In [None]:
# Save the names that could not be geocoded, one per line.
# NOTE: `countries_that_give_error` is created and filled by cells further
# down in this notebook, so those have to run before this one.
# UTF-8 matches the encoding used when reading the other word-list files.
with open('countries_that_give_error.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in countries_that_give_error)

In [None]:
def configure_spacy_model():
    """Add an ``entity_ruler`` pipe to ``nlp`` with the project's patterns.

    Patterns are registered in this order: EXCL (``countries_to_exclude``),
    DRUG (``drugs_list``), QUANTITY (a digit token or a CARDINAL-typed token
    followed by a unit from ``word2vec_quantity_list``) and GPE
    (``countries_list`` with spaces removed).
    """
    excluded_patterns = [
        {"label": "EXCL", "pattern": [{"lower": name.lower()}]}
        for name in countries_to_exclude
    ]
    drug_patterns = [
        {"label": "DRUG", "pattern": [{"lower": name.lower()}]}
        for name in drugs_list
    ]

    # Two patterns per unit, kept interleaved: digit + unit, then CARDINAL + unit.
    quantity_patterns = []
    for unit in word2vec_quantity_list:
        for number_token in ({"IS_DIGIT": True}, {"ENT_TYPE": "CARDINAL"}):
            quantity_patterns.append(
                {"label": "QUANTITY", "pattern": [number_token, {"LOWER": unit}]}
            )

    place_patterns = [
        {"label": "GPE", "pattern": [{"lower": name.replace(' ', '').lower()}]}
        for name in countries_list
    ]

    ruler_config = {
        "phrase_matcher_attr": None,
        "validate": True,
        "overwrite_ents": True,
        "ent_id_sep": "||",
    }
    ruler = nlp.add_pipe("entity_ruler", config=ruler_config)
    ruler.add_patterns(
        excluded_patterns + drug_patterns + quantity_patterns + place_patterns
    )

In [None]:
# Register the rule-based patterns on the loaded pipeline.
configure_spacy_model()

# Select cases and chunks to keep

In [None]:
# Sanity check: the selection loop looks up merged_df rows by the position of
# a case in sentence_list, so both must have the same length.
print(len(sentence_list) == len(merged_df))

For every case, go through its sentences (chunks). If a chunk contains a drug keyword, a smuggle keyword, and a location outside the Netherlands, keep the chunk. Cases with at least one kept chunk are saved to trafficking_df.

In [None]:
# Caches shared by the geocoding helpers:
# location text -> resolved country name, and texts that failed to geocode.
country_translation_dict = {}
countries_that_give_error = []

In [None]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "geoapiExercises")

def get_google_country(loc):
    """Resolve the place name *loc* to an English country name.

    Successful results are cached in ``country_translation_dict`` and failed
    names in ``countries_that_give_error``, so each name is geocoded at most
    once. Despite the function name, the lookup goes through the module-level
    ``geolocator``.

    Args:
        loc: Place name to look up.

    Returns:
        The last comma-separated part of the result's display name with
        surrounding whitespace removed, or the string ``"None"`` when the
        lookup fails.
    """
    if loc in country_translation_dict:
        return country_translation_dict[loc]
    if loc in countries_that_give_error:
        return "None"
    try:
        # geocode() yields None when nothing matches; the attribute access
        # below then raises and is handled like any other failed lookup.
        location = geolocator.geocode(loc, language='en')
        # strip(): the parts of display_name are separated by ", ", so the
        # last part would otherwise be " Netherlands" and never compare equal
        # to "Netherlands".
        country_name = location.raw['display_name'].split(',')[-1].strip()
    except Exception:
        print(f"{loc} is not a location.")
        countries_that_give_error.append(loc)
        return "None"
    country_translation_dict[loc] = country_name
    return country_name


In [None]:
%%time

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "geoapiExercises")

relevant_chunk_list = []
ecli_list = []

# Sets give constant-time keyword lookups inside the loop.
drug_keywords = set(word2vec_drug_list)
smuggle_keywords = set(word2vec_smuggle_list)

for index, case in enumerate(sentence_list):
    chunk_list = []
    for chunk in case:
        words = {x for x in chunk.lower().rstrip().replace('.', '').split(' ') if len(x) > 0}
        # A chunk needs at least one drug keyword and one smuggle keyword.
        if words.isdisjoint(drug_keywords) or words.isdisjoint(smuggle_keywords):
            continue
        for ent in nlp(chunk).ents:
            if ent.label_ != "GPE":
                continue
            # strip(): guards against a leading space in the resolved name,
            # which would make " Netherlands" pass the comparison below.
            country = get_google_country(ent.text).strip()
            if country != "Netherlands" and country != "None":
                # One foreign location is enough; keep the chunk once.
                chunk_list.append(chunk)
                break
    if chunk_list:
        relevant_chunk_list.append(chunk_list)
        ecli_list.append(merged_df.iloc[index]['id'].replace('-', ':'))

trafficking_df = pd.DataFrame({'id': pd.Series(ecli_list), 'chunks': pd.Series(relevant_chunk_list)})

In [None]:
# Report how many cases survived the selection.
print(f"{len(trafficking_df)} cases kept from original {len(merged_df)} cases.")


# Create rule-based NER & POS tagging model

### Old

In [None]:
def extract_chunk_info(txt):
    """Extract drug, location and volume info from *txt* via the dependency tree.

    NOTE: a later cell in this notebook defines another function with the
    same name, which replaces this one once that cell has run.

    Args:
        txt: The chunk text to analyse with ``nlp``.

    Returns:
        A list with one dict per DRUG token that has at least one extra piece
        of information. Keys: ``"drug"``, optionally an adposition (or
        ``"land"`` when none was found) mapping to the locations, and
        optionally ``"volume"`` / ``"volume_type"``.
    """
    total_info = []
    for token in nlp(txt):
        if token.ent_type_ != "DRUG":
            continue

        info = {"drug": token.text}
        countries = []
        # Reset per drug token so an adposition found for an earlier drug is
        # never reused.
        adj = None

        ## Get source and destination
        for ancestor in token.ancestors:
            for nephew in ancestor.children:
                if nephew.ent_type_ == "GPE" or nephew.ent_type_ == "LOC":
                    # NOTE(review): the token itself is stored here while
                    # conjuncts below are stored as text -- confirm which one
                    # is wanted.
                    countries.append(nephew)
                    for child in nephew.children:
                        # Parenthesised: "conj" must hold for both entity types.
                        if child.dep_ == "conj" and (child.ent_type_ == "GPE" or child.ent_type_ == "LOC"):
                            countries.append(child.text)
                        elif child.pos_ == "ADP" and child.dep_ == "case":
                            adj = child.text
        if len(countries) > 0:
            info[adj if adj is not None else 'land'] = countries

        ## Get volume
        for ancestor in token.ancestors:
            for nephew in ancestor.children:
                if nephew.ent_type_ == "QUANTITY" or nephew.ent_type_ == "CARDINAL":
                    for second_nephew in nephew.children:
                        # Exactly one of the pair is a digit: that one is the
                        # amount, the other the unit.
                        if second_nephew.is_digit != nephew.is_digit:
                            if second_nephew.is_digit:
                                info['volume'] = second_nephew.text
                                info['volume_type'] = nephew.text
                            else:
                                info['volume'] = nephew.text
                                info['volume_type'] = second_nephew.text
        if 'volume' not in info:
            # Fall back to a numeric modifier attached to the drug token.
            for child in token.children:
                if (child.dep_ == "det" and child.like_num) or (child.dep_ == "nummod"):
                    info['volume'] = child.text

        # Only report drugs for which something besides the name was found.
        if len(info) > 1:
            total_info.append(info)
    return total_info

### New

In [None]:
# Linguistic distance between token a and token b, capped at 10 rounds.
def get_linguistic_distance(a, b):
    """Count the expansion rounds from *b* until a token matching *a* is reached.

    Starting from ``[b]``, every round adds the ancestors and children of the
    tokens collected so far. After each round the collection is checked for a
    token whose ``orth`` equals ``a.orth``. The round number (starting at 1)
    is returned; 10 is returned when no match is found within 10 rounds.
    """
    reached = [b]
    for round_number in range(1, 11):
        # Iterate over this round's snapshot; `reached` is rebound to a new,
        # longer list while the snapshot is being walked.
        snapshot = reached
        for token in snapshot:
            fresh = []
            for neighbour in (*token.ancestors, *token.children):
                if neighbour not in fresh and neighbour not in reached:
                    fresh.append(neighbour)
            reached = reached + fresh
        if any(a.orth == candidate.orth for candidate in reached):
            return round_number
    return 10

In [None]:
def get_adposition_from_loc(token):
    """Return the text of the first child of *token* tagged ADP with dep "case", else None."""
    return next(
        (
            child.text
            for child in token.children
            if child.pos_ == "ADP" and child.dep_ == "case"
        ),
        None,
    )

In [None]:
def extract_chunk_info(txt):
    """Collect location/volume info for every DRUG token in *txt*, with logging.

    Returns a dict keyed by the drug token's text; a drug mentioned more than
    once keeps the info computed for its last mention.
    """
    print('START CHUNK')
    result = {}
    for token in nlp(txt):
        if token.ent_type_ != "DRUG":
            continue
        print(f" \n Extracting info for {token.text}.")
        result[token.text] = extract_info_from_drug(token, txt)
    return result
            
# Every adposition seen in front of a location so far (filled as a side effect).
adj_list = []


def extract_info_from_drug(drug, txt):
    """Gather the locations and volumes mentioned in *txt* for one drug token.

    Prints progress information while running.

    Args:
        drug: The DRUG token the distances are measured against.
        txt: The chunk text; it is parsed again with ``nlp``.

    Returns:
        A dict with ``"volume"`` (list of dicts with ``volume``,
        ``volume_type`` and ``dist``) and, when any GPE was found,
        ``"locations"`` (adposition -> list of location texts).
    """
    volumes = []
    locations = {}
    for token in nlp(txt):

        # Extract countries
        if token.ent_type_ == "GPE":
            dist = get_linguistic_distance(drug, token)
            adj = get_adposition_from_loc(token)
            print(f"    {adj}: {token.text}, dist: {dist}, conj: {token.conjuncts}")
            if adj not in adj_list:
                adj_list.append(adj)
            # Group the location and its conjuncts under the adposition,
            # without duplicates.
            known = locations.setdefault(adj, [])
            for name in [token.text] + [conjunct.text for conjunct in token.conjuncts]:
                if name not in known:
                    known.append(name)

        # Extract volume
        if token.ent_type_ == "QUANTITY":
            dist = get_linguistic_distance(drug, token)

            # Find the QUANTITY partner of this token; a child takes
            # precedence over an ancestor, and the last match wins.
            second_token = None
            for ancestor in token.ancestors:
                if ancestor.ent_type_ == "QUANTITY":
                    second_token = ancestor
            for child in token.children:
                if child.ent_type_ == "QUANTITY":
                    second_token = child
            if second_token is None:
                continue

            ## Decide volume and volume_type: the token tagged CARDINAL when
            ## parsed on its own is the amount, the other one the unit.
            if nlp(token.text)[0].ent_type_ == "CARDINAL":
                volume = {'volume': token.text, 'volume_type': second_token.text, 'dist': dist}
            elif nlp(second_token.text)[0].ent_type_ == "CARDINAL":
                volume = {'volume': second_token.text, 'volume_type': token.text, 'dist': dist}
            else:
                # Neither part is a number: nothing to record (previously an
                # empty dict was appended here).
                continue

            # Only append when not already in volumes
            if volume not in volumes:
                volumes.append(volume)

    print(volumes)

    result = {}
    if locations:
        result['locations'] = locations
    result["volume"] = volumes
    return result
    
        


### Without logging

In [None]:
def extract_chunk_info_without_log(txt):
    """Collect location/volume info for every DRUG token in *txt*.

    Returns a dict keyed by the drug token's text; a drug mentioned more than
    once keeps the info computed for its last mention.
    """
    drug_tokens = [token for token in nlp(txt) if token.ent_type_ == "DRUG"]
    result = {}
    for drug_token in drug_tokens:
        result[drug_token.text] = extract_info_from_drug_without_log(drug_token, txt)
    return result
            
# Every adposition seen in front of a location so far (filled as a side effect).
adj_list = []


def extract_info_from_drug_without_log(drug, txt):
    """Gather the locations and volumes mentioned in *txt* for one drug token.

    Args:
        drug: The DRUG token the volume distances are measured against.
        txt: The chunk text; it is parsed again with ``nlp``.

    Returns:
        A dict with ``"volume"`` (list of dicts with ``volume``,
        ``volume_type`` and ``dist``) and, when any GPE was found,
        ``"locations"`` (adposition -> list of location texts).
    """
    volumes = []
    locations = {}
    for token in nlp(txt):

        # Extract countries
        if token.ent_type_ == "GPE":
            adj = get_adposition_from_loc(token)
            if adj not in adj_list:
                adj_list.append(adj)
            # Group the location and its conjuncts under the adposition,
            # without duplicates.
            known = locations.setdefault(adj, [])
            for name in [token.text] + [conjunct.text for conjunct in token.conjuncts]:
                if name not in known:
                    known.append(name)

        # Extract volume
        if token.ent_type_ == "QUANTITY":
            dist = get_linguistic_distance(drug, token)

            # Find the QUANTITY partner of this token; a child takes
            # precedence over an ancestor, and the last match wins.
            second_token = None
            for ancestor in token.ancestors:
                if ancestor.ent_type_ == "QUANTITY":
                    second_token = ancestor
            for child in token.children:
                if child.ent_type_ == "QUANTITY":
                    second_token = child
            if second_token is None:
                continue

            ## Decide volume and volume_type: the token tagged CARDINAL when
            ## parsed on its own is the amount, the other one the unit.
            if nlp(token.text)[0].ent_type_ == "CARDINAL":
                volume = {'volume': token.text, 'volume_type': second_token.text, 'dist': dist}
            elif nlp(second_token.text)[0].ent_type_ == "CARDINAL":
                volume = {'volume': second_token.text, 'volume_type': token.text, 'dist': dist}
            else:
                # Neither part is a number: nothing to record (previously an
                # empty dict was appended here).
                continue

            # Only append when not already in volumes
            if volume not in volumes:
                volumes.append(volume)

    result = {}
    if locations:
        result['locations'] = locations
    result["volume"] = volumes
    return result
    
        


# Results

In [None]:
def fuse_chunks(chunks):
    """Merge the per-chunk extraction results of one case by lower-cased drug name.

    Returns a dict: drug -> {'locations': [one dict per chunk],
    'volumes': [one list per chunk]}. Chunks without volumes or locations
    contribute nothing to the respective list.
    """
    drug_dict = {}
    for chunk in chunks:
        chunk_info = extract_chunk_info_without_log(chunk)
        if chunk_info is None:
            continue
        for drug, details in chunk_info.items():
            entry = drug_dict.setdefault(drug.lower(), {'locations': [], 'volumes': []})
            if len(details['volume']) > 0:
                entry['volumes'].append(details['volume'])
            if 'locations' in details:
                entry['locations'].append(details['locations'])
    return drug_dict

In [None]:
def fuse_locations(fused_chunks):
    """Collapse each drug's list of location dicts into one dict, in place.

    After the call ``fused_chunks[drug]['locations']`` maps every key seen in
    the entries to the de-duplicated list of its values, in first-seen order.
    The same (mutated) dict is returned.
    """
    for info in fused_chunks.values():
        merged = {}
        for entry in info['locations']:
            for adposition, countries in entry.items():
                bucket = merged.setdefault(adposition, [])
                for country in countries:
                    if country not in bucket:
                        bucket.append(country)
        info['locations'] = merged
    return fused_chunks

In [None]:
def fuse_volumes(fused_locations):
    """Reduce each drug's volume candidates to the single closest one, in place.

    For every drug, ``['volumes']`` (a list of lists of volume dicts) is
    replaced by the candidate with the smallest ``'dist'`` below 100; the
    first one wins on ties. Candidates without a ``'dist'`` key are ignored.
    When nothing qualifies the value becomes ``{}``.

    Args:
        fused_locations: Dict drug -> {'locations': ..., 'volumes': [[...], ...]}.

    Returns:
        The same dict, after every drug has been processed.
    """
    for drug in fused_locations:
        final_volume = {}
        dist = 100
        for volumes in fused_locations[drug]['volumes']:
            for volume in volumes:
                if 'dist' in volume and volume['dist'] < dist:
                    dist = volume['dist']
                    final_volume = volume
        fused_locations[drug]['volumes'] = final_volume
    # Return after the loop: returning inside it stopped after the first drug
    # and produced None for an empty input.
    return fused_locations

In [None]:
# country_translation_dict = {}
# countries_that_give_error = []

In [None]:
def translate_locations(data):
    """Replace the location texts of every drug by English country names, in place.

    The untranslated mapping is kept under ``'original_locations'``. Lookups
    share the ``country_translation_dict`` / ``countries_that_give_error``
    caches, so each text is geocoded at most once. Texts that cannot be
    resolved are dropped from the translated lists.

    Args:
        data: Dict drug -> info dict with an optional ``'locations'`` mapping
            (adposition -> list of location texts), or None.

    Returns:
        *data* itself (None stays None).
    """
    if data is None:
        return data
    geolocator = Nominatim(user_agent = "geoapiExercises")
    for drug in data:
        if 'locations' not in data[drug]:
            continue
        data[drug]['original_locations'] = data[drug]['locations'].copy()
        # list(): the values are replaced while walking over the mapping.
        for adjective, locations in list(data[drug]['locations'].items()):
            country_list = []
            for loc in locations:
                if loc in country_translation_dict:
                    # strip(): cached names may still carry a leading space.
                    country_name = country_translation_dict[loc].strip()
                elif loc in countries_that_give_error:
                    continue
                else:
                    try:
                        # geocode() yields None when nothing matches; the
                        # attribute access then raises and lands below.
                        location = geolocator.geocode(loc, language='en')
                        # strip(): the parts of display_name are separated by
                        # ", ", so the last part would otherwise keep a
                        # leading space.
                        country_name = location.raw['display_name'].split(',')[-1].strip()
                    except Exception:
                        print(f"{loc} is not a location.")
                        countries_that_give_error.append(loc)
                        continue
                    country_translation_dict[loc] = country_name
                if country_name not in country_list:
                    country_list.append(country_name)
            data[drug]['locations'][adjective] = country_list
    return data

In [None]:
def get_location_directions(data):
    """Regroup each drug's locations by direction, in place.

    The adpositions 'uit', 'vanuit' and 'van' map to ``'from'``, 'naar' to
    ``'to'`` and 'via' to ``'via'``; locations under any other key are
    dropped. Each drug's ``'locations'`` becomes
    ``{'from': [...], 'to': [...], 'via': [...]}`` without duplicates.
    Returns *data* (None stays None).
    """
    direction_of = {
        'uit': 'from',
        'vanuit': 'from',
        'van': 'from',
        'naar': 'to',
        'via': 'via',
    }
    if data is None:
        return data
    for info in data.values():
        grouped = {'from': [], 'to': [], 'via': []}
        for adposition, places in info['locations'].items():
            direction = direction_of.get(adposition)
            if direction is None:
                continue
            bucket = grouped[direction]
            for place in places:
                if place not in bucket:
                    bucket.append(place)
        info['locations'] = grouped
    return data

In [None]:
# One output row per (case, drug) with every country mentioned for that drug.
# Rows are collected in a list and turned into a frame once:
# DataFrame.append no longer exists in pandas 2.x and copied the frame on
# every call.
final_rows = []

for index, case_row in trafficking_df.iterrows():
    ecli = case_row['id']

    fused_chunks = fuse_chunks(case_row['chunks'])
    fused_locations = fuse_locations(fused_chunks)
    fused_volumes = fuse_volumes(fused_locations)
    translated_locations = translate_locations(fused_volumes)

    if translated_locations is None:
        continue
    for drug, curr in translated_locations.items():
        # Union of the countries under all adpositions, in first-seen order.
        relevant_countries = []
        for locs in curr['locations'].values():
            for loc in locs:
                if loc not in relevant_countries:
                    relevant_countries.append(loc)
        final_rows.append({'ecli': ecli, 'drug': drug, 'relevant_countries': relevant_countries})

final_df = pd.DataFrame(final_rows, columns=['ecli', 'drug', 'relevant_countries'])

final_df

In [None]:
# Count how often each country other than "Netherlands" occurs in final_df.
# Each count sits in a one-element list.
vectorcounts = {}

for locs in final_df['relevant_countries']:
    for loc in locs:
        if loc == "Netherlands":
            continue
        counter = vectorcounts.setdefault(loc, [0])
        counter[0] = counter[0] + 1

print(vectorcounts)

In [63]:
# Save the result table without the index column.
final_df.to_csv('final_df.csv', index=False)

In [62]:
import plotly.express as px
import numpy as np
import pandas as pd

# Draw a world map coloured by how often each country was counted.
np.random.seed(12)
# The gapminder sample (year 2007) supplies the country -> ISO code table.
gapminder = px.data.gapminder().query("year==2007")
#gapminder['counts'] = np.nan

d = vectorcounts

# Transpose so every country becomes a row with its count.
yourdata = pd.DataFrame(d).T.reset_index()
yourdata.columns=['country', 'count']

# Left merge: countries without a count stay on the map with a missing value.
# NOTE(review): the merge matches on the exact country string; names that
# differ from gapminder's spelling (or carry whitespace) will not match.
df=pd.merge(gapminder, yourdata, how='left', on='country')

fig = px.choropleth(df, locations="iso_alpha",
                    color="count", 
                    hover_name="country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)

fig.show()

In [None]:
from geopy.geocoders import Nominatim

# One output row per (case, drug) with its from / to / via countries.
# Rows are collected in a list and turned into a frame once:
# DataFrame.append no longer exists in pandas 2.x and copied the frame on
# every call.
final_rows = []

for index, case_row in trafficking_df.iterrows():
    ecli = case_row['id']

    fused_chunks = fuse_chunks(case_row['chunks'])
    fused_locations = fuse_locations(fused_chunks)
    fused_volumes = fuse_volumes(fused_locations)
    translated_locations = translate_locations(fused_volumes)
    location_directions = get_location_directions(translated_locations)

    if location_directions is None:
        continue
    for drug, curr in location_directions.items():
        directions = curr['locations']
        final_rows.append({
            'ecli': ecli,
            'drug': drug,
            'from': directions['from'],
            'to': directions['to'],
            'via': directions['via'],
        })

final_df = pd.DataFrame(final_rows, columns=['ecli', 'drug', 'from', 'to', 'via'])

final_df


In [None]:
from geopy.geocoders import Nominatim

# Probe: geocode a Dutch country name and inspect the extracted country string.
geolocator = Nominatim(user_agent = "geoapiExercises")
location = geolocator.geocode("verenigde staten")
# Last comma-separated part of the display name.
country_name = location.raw['display_name'].split(',')[-1]
print(country_name)
# The length reveals leading/trailing whitespace that print() does not show.
print(len(country_name))

#### Get complete adjective list and country list

In [None]:
# Collects every adposition key found in the fused chunks (duplicates included).
complete_adjective_list = []

In [None]:
# Gather the adposition keys of every per-chunk location dict of every drug.
for case_chunks in trafficking_df['chunks']:
    fused_chunks = fuse_chunks(case_chunks)
    for drug_info in fused_chunks.values():
        for location_entry in drug_info['locations']:
            complete_adjective_list.extend(location_entry.keys())
            
                
            
    

In [None]:
# Total number of collected adpositions, duplicates included.
len(complete_adjective_list)

In [None]:
# Distinct adpositions.
list(set(complete_adjective_list))

# Sandbox

In [None]:
# Display the selected cases and their kept chunks.
trafficking_df

In [None]:
# Chunks of one specific case; the [0] raises IndexError if the id is not in trafficking_df.
case = list(trafficking_df[trafficking_df['id'] == 'ECLI:NL:GHAMS:2018:2662']['chunks'])[0]

In [None]:
# Show the recognised entities of each chunk and run the extraction on it.
# extract_chunk_info resolves to whichever definition was executed last.
for chunk in case:
    displacy.render(nlp(chunk), style = 'ent')
    extract_chunk_info(chunk)

In [None]:
def get_entities(txt):
    """Print every entity spaCy finds in *txt* together with its label."""
    for entity in nlp(txt).ents:
        print(entity, entity.label_)

Get all loc entities

In [None]:
def list_entities(entity):
    """Return the distinct texts of all entities labelled *entity* in trafficking_df.

    Every chunk of every case is parsed with ``nlp``; texts are listed in
    first-seen order.
    """
    entity_list = []
    for case_chunks in trafficking_df['chunks']:
        for chunk in list(case_chunks):
            for ent in nlp(chunk).ents:
                if ent.label_ == entity and ent.text not in entity_list:
                    entity_list.append(ent.text)
    return entity_list

In [None]:
# All distinct entity texts labelled LOC.
loc_list = list_entities("LOC")


In [None]:
# Show the collected LOC texts.
print(loc_list)

In [None]:
# All distinct entity texts labelled GPE.
gpe_list = list_entities("GPE")
print(gpe_list)

In [None]:
# Print the GPE texts one per line.
for i in gpe_list:
    print(i)