### Config

In [None]:
import csv
import requests
import json
import urllib.request
import os
import re
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import clear_output
import matplotlib.pyplot as plt
from threading import Thread
from nltk.corpus import stopwords
# Dutch stop-word list from NLTK; used further down to strip stop words from the case texts.
dutch_stopwords = stopwords.words("dutch")
# Base URL of a single verdict page; a case ID is appended to it when a case is fetched.
complete_case_url = 'https://uitspraken.rechtspraak.nl/inziendocument?id='


In [None]:
!python --version
!jupyter --version

In [None]:
# NOTE(review): an identical config cell appears twice in this notebook; keep
# the two copies in sync or remove one of them.

# --- Search settings ---
query = 'drugs'
# Number of cases to request. The backend rejects values such as 'max', so use a
# very high number to get everything; it stops once all cases are returned.
case_count = 10_000_000

# --- Endpoints ---
base_verdicts_url = 'https://uitspraken.rechtspraak.nl/api/zoek'
complete_case_url = 'https://uitspraken.rechtspraak.nl/inziendocument?id='

# --- Local storage (the folder must already exist) ---
save_text_location = f"{os.getcwd()}/court case data/testdata/"
cases_df = pd.DataFrame()

# Column names of the case-overview dataframe.
index = [
    "id",
    "verdict_date",
    "publication_date",
    "verdict_type",
    "jurisdiction_type",
]

# Request payload sent to the search endpoint.
files = dict(
    StartRow=0,
    PageSize=case_count,
    ShouldReturnHighlights='true',
    ShouldCountFacets='true',
    SortOrder="Relevance",
    SearchTerms=[{"Term": query, "Field": "AlleVelden"}],
    Contentsoorten=[],
    Rechtsgebieden=[],
    Instanties=[],
    DatumPublicatie=[],
    DatumUitspraak=[],
    Advanced={"PublicatieStatus": "AlleenGepubliceerd"},
    CorrelationId="9abc658b0ce64f8786992af6965aabc4",
    Proceduresoorten=[],
)

# Court Case Scraper

### Config

In [None]:
# NOTE(review): an identical config cell appears twice in this notebook; keep
# the two copies in sync or remove one of them.

# --- Search settings ---
query = 'drugs'
# Number of cases to request. The backend rejects values such as 'max', so use a
# very high number to get everything; it stops once all cases are returned.
case_count = 10_000_000

# --- Endpoints ---
base_verdicts_url = 'https://uitspraken.rechtspraak.nl/api/zoek'
complete_case_url = 'https://uitspraken.rechtspraak.nl/inziendocument?id='

# --- Local storage (the folder must already exist) ---
save_text_location = f"{os.getcwd()}/court case data/testdata/"
cases_df = pd.DataFrame()

# Column names of the case-overview dataframe.
index = [
    "id",
    "verdict_date",
    "publication_date",
    "verdict_type",
    "jurisdiction_type",
]

# Request payload sent to the search endpoint.
files = dict(
    StartRow=0,
    PageSize=case_count,
    ShouldReturnHighlights='true',
    ShouldCountFacets='true',
    SortOrder="Relevance",
    SearchTerms=[{"Term": query, "Field": "AlleVelden"}],
    Contentsoorten=[],
    Rechtsgebieden=[],
    Instanties=[],
    DatumPublicatie=[],
    DatumUitspraak=[],
    Advanced={"PublicatieStatus": "AlleenGepubliceerd"},
    CorrelationId="9abc658b0ce64f8786992af6965aabc4",
    Proceduresoorten=[],
)


### Old Methods

Method to extract the text from a case file. It requests the case page (base URL + case ID) to get the HTML.
It then uses BeautifulSoup to look for a `div` with class `uitspraak` or, failing that, `conclusie`, because the markup differs between cases.
The HTML is converted to plain text and commas are removed (newlines are left in place).

In [None]:
def getCaseText(caseId):
    """Download one verdict page and return its plain text with commas removed.

    Parameters
    ----------
    caseId : str
        Case identifier; it is appended to ``complete_case_url``.

    Returns
    -------
    str
        Text of the ``div`` with class ``uitspraak`` or, when that is absent,
        of the ``div`` with class ``conclusie``, with every comma removed.

    Raises
    ------
    ValueError
        If the page contains neither container (for example an error page).
    """
    # The original referenced `uitsprakenBaseUrl`, which is not defined anywhere
    # in the visible notebook; `complete_case_url` is the config value holding
    # the verdict-page base URL.
    response = requests.get(complete_case_url + caseId)
    soup = BeautifulSoup(response.text, 'html.parser')
    soup_content = soup.find("div", {"id": 'content'})

    uitspraak_html = None
    if soup_content is not None:
        # The markup switches between these two classes from case to case.
        for css_class in ('uitspraak', 'conclusie'):
            uitspraak_html = soup_content.find('div', {'class': css_class})
            if uitspraak_html is not None:
                break
    if uitspraak_html is None:
        raise ValueError(f"No verdict text found for case {caseId}")

    return uitspraak_html.get_text().replace(',', '')

Method to parse each case individually. It saves the case attributes to `cases3.csv` and writes each case text to a `.txt` file in `save_text_location`, named after the case ID (with `:` replaced by `-`).

In [None]:
def parseCaseInfo(results):
    """Download the text of every case in ``results`` and record its attributes.

    For each case dictionary the verdict text is fetched with ``getCaseText``
    and written to ``<save_text_location><case id>.txt``. The dictionary gets an
    extra ``'Case ID'`` key and is added to the global ``cases_df``, which is
    finally written to ``cases3.csv``.

    Parameters
    ----------
    results : iterable of dict
        Case dictionaries; each must contain the key ``'TitelEmphasis'``.
    """
    global cases_df
    processed = []
    for case in results:
        caseText = getCaseText(case['TitelEmphasis'])
        # Case IDs use ':' which is not allowed in file names, so replace it with '-'.
        parsedId = case['TitelEmphasis'].replace(':', '-')
        # An explicit encoding is required; the context manager closes the file
        # even if the write fails.
        with open(save_text_location + parsedId + ".txt", "w+", encoding='utf-8') as f:
            f.write(caseText)
        case['Case ID'] = parsedId
        processed.append(case)
        print('Processed case', case['Case ID'])

    # Build the new rows once instead of growing the frame row by row with
    # DataFrame.append, which is quadratic and was removed in pandas 2.0.
    if processed:
        new_rows = pd.DataFrame(processed)
        if cases_df.empty:
            cases_df = new_rows
        else:
            cases_df = pd.concat([cases_df, new_rows], ignore_index=True)
    cases_df.to_csv('cases3.csv', index=False)

Method to retrieve raw data from the backend. The backend URL is defined in the Config section at the top. `files` is the complete request payload the backend expects to receive.


In [None]:
def queryUitspraak():
    """Query the search backend and hand the returned cases to ``parseCaseInfo``.

    The request payload is built from the config values ``query`` and
    ``case_count`` and posted to ``base_verdicts_url``. Request failures are
    printed and end the call without parsing anything.
    """
    print("Querying")
    # The original used `searchQuery` and `baseUrl`, neither of which is defined
    # in the visible notebook; `query` and `base_verdicts_url` are the config
    # names holding those values.
    files = {
        "StartRow": 0,
        "PageSize": case_count,
        "ShouldReturnHighlights":'true',
        "ShouldCountFacets":'true',
        "SortOrder":"Relevance",
        "SearchTerms":[{"Term":query,"Field":"AlleVelden"}],
        "Contentsoorten":[],
        "Rechtsgebieden":[],
        "Instanties":[],
        "DatumPublicatie":[],
        "DatumUitspraak":[],
        "Advanced":{"PublicatieStatus":"AlleenGepubliceerd"},
        "CorrelationId":"9abc658b0ce64f8786992af6965aabc4",
        "Proceduresoorten":[]
    }
    try:
        response = requests.post(base_verdicts_url, json=files)
        # requests does not raise on HTTP error statuses by itself.
        response.raise_for_status()
        results = response.json()['Results']
    except requests.exceptions.RequestException as err:
        # requests never raises urllib.error.HTTPError, so the original handler
        # could not catch anything.
        print(err)
        return
    print(len(results), "records!")
    parseCaseInfo(results)

queryUitspraak()

### Methods

Get cases from https://uitspraken.rechtspraak.nl/api/zoek.
Returns a collection of dictionaries.

In [None]:
def get_cases(config):
    """Post ``config`` to the search endpoint and return the matching cases.

    Parameters
    ----------
    config : dict
        JSON-serialisable request payload for ``base_verdicts_url``.

    Returns
    -------
    list
        The ``'Results'`` entry of the JSON response. An empty list is returned
        when the request fails, so callers can iterate over the result safely.
    """
    try:
        response = requests.post(base_verdicts_url, json=config)
        # requests does not raise on HTTP error statuses by itself.
        response.raise_for_status()
        results = response.json()['Results']
    except requests.exceptions.RequestException as err:
        # requests never raises urllib.error.HTTPError, so the original handler
        # could not catch anything.
        print(err)
        return []
    print(len(results), "records found.")
    return results

Create a dataframe of all queried cases with the following columns: 
- Id
- Verdict date
- Publication date
- Verdict type
- Jurisdiction type

In [None]:
def create_cases_df(cases):
    """Build the case-overview dataframe and write it to disk.

    Parameters
    ----------
    cases : iterable of dict
        Case dictionaries with the keys ``TitelEmphasis``, ``Uitspraakdatum``,
        ``Publicatiedatum``, ``UitspraakdatumType`` and ``Rechtsgebieden``.

    Returns
    -------
    pandas.DataFrame
        One row per case with the columns listed in the module-level ``index``.
        The same frame is written to ``./court case data/testdata.csv``.
    """
    # Collect every row first and build the frame once: growing it with
    # DataFrame.append is quadratic and that method was removed in pandas 2.0.
    records = [
        dict(zip(index, (
            case["TitelEmphasis"].replace(":", "-"),  # ':' is not allowed in file names
            case["Uitspraakdatum"],
            case["Publicatiedatum"],
            case["UitspraakdatumType"],
            case["Rechtsgebieden"],
        )))
        for case in cases
    ]
    cases_df = pd.DataFrame(records, columns=index)
    cases_df.to_csv('./court case data/testdata.csv', index=False)
    return cases_df
            

Save the text of every case in the dataframe to its own .txt file in the `save_text_location` folder.

In [None]:
def save_case_text(df):
    """Download the verdict text of every case in ``df`` and save it as a .txt file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``id`` column holding case IDs with ``-`` in place of ``:``.

    Each text is written to ``<save_text_location><id>.txt`` with commas
    removed. A failure for one case is printed and the loop continues with the
    next case.
    """
    for case in df["id"]:
        try:
            # The website expects ':' in the case ID; the dataframe stores '-'.
            response = requests.get(complete_case_url + case.replace("-", ":"))
            soup = BeautifulSoup(response.text, 'html.parser')
            soup_content = soup.find("div", {"id": 'content'})
            if soup_content is None:
                raise ValueError(f"no content container for case {case}")

            # The markup switches between these two classes from case to case.
            uitspraak_html = None
            for css_class in ('uitspraak', 'conclusie'):
                uitspraak_html = soup_content.find('div', {'class': css_class})
                if uitspraak_html is not None:
                    break
            if uitspraak_html is None:
                raise ValueError(f"no verdict text found for case {case}")

            uitspraak = uitspraak_html.get_text().replace(',', '')
            parsed_id = case.replace(":", "-")
            # An explicit encoding is required; the context manager makes sure
            # the handle is closed instead of leaking one per case.
            with open(save_text_location + parsed_id + ".txt", "w+", encoding='utf-8') as f:
                f.write(uitspraak)
        except Exception as e:
            print(f"Error in saving. {e}")
    print("Finished")
            

### Create dataset

In [None]:
cases = get_cases(files)
cases_df = create_cases_df(cases)
# save_case_text(cases_df)


In [None]:
def clean_string(dirty):
    """Return ``dirty`` with surrounding whitespace removed and lower-cased."""
    return dirty.strip().lower()

In [None]:
def save_case_text(df):
    """Fetch every case in ``df`` and print the titles of its sections.

    NOTE(review): this definition replaces the earlier ``save_case_text`` and,
    despite its name, writes nothing to disk; it only collects the page's
    ``section`` elements per case and prints their headings.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id`` (case ID with ``-`` in place of ``:``) and
        ``verdict_type`` (``'uitspraak'`` or ``'conclusie'``).

    Errors for a single case are printed and the loop continues.
    """
    for i in range(len(df)):
        row = df.iloc[i]
        # Use the row's own ID. The original overwrote it with one hard-coded
        # debug ID, so every iteration fetched the same case.
        case_id = row['id'].replace('-', ':')
        print(f"\n CASE: {case_id} \n")
        try:
            response = requests.get(complete_case_url + case_id)
            soup = BeautifulSoup(response.text, 'html.parser')
            soup_content = soup.find("div", {"id": 'content'})

            verdict_type = row['verdict_type']
            # Reject other values explicitly: previously `statement_content`
            # stayed unbound, or kept the previous case's content.
            if verdict_type not in ('uitspraak', 'conclusie'):
                raise ValueError(f"unsupported verdict_type {verdict_type!r}")
            statement_content = soup_content.find('div', {'class': verdict_type})

            # Section title -> section element.
            sections = {}
            for el in statement_content.find_all(attrs={'class': 'section'}):
                heading = el.find('h2')
                if heading is None:
                    # A section without a heading cannot be keyed; skip it
                    # rather than abort the whole case.
                    continue
                section_title = clean_string(heading.text)
                print(section_title)
                sections[section_title] = el
        except Exception as e:
            print(f"Error in saving: {e}")

            
save_case_text(cases_df[:1])

In [None]:
%%time
save_case_text(cases_df[:1])

In [None]:
%%time
# Split the overview into four fixed row ranges, one per worker thread.
cases_df_1, cases_df_2, cases_df_3, cases_df_4 = (
    cases_df[:5000],
    cases_df[5000:10000],
    cases_df[10000:15000],
    cases_df[15000:],
)

for part in (cases_df_1, cases_df_2):
    print(len(part))

# One thread per slice, each running save_case_text on its own part.
t1, t2, t3, t4 = (
    Thread(target=save_case_text, args=(part,))
    for part in (cases_df_1, cases_df_2, cases_df_3, cases_df_4)
)
workers = (t1, t2, t3, t4)

for worker in workers:
    worker.start()

# Wait for all four downloads before reporting completion.
for worker in workers:
    worker.join()

print("finished")

# Create / Filter dataset

### Update Dataset

Create updated csv with all relevant cases

In [None]:
cases = get_cases(files)
cases_df = create_cases_df(cases)


In [None]:
print(cases_df)

In [None]:
dataPath = os.getcwd() + '/court case data/testdata/'

# IDs returned by the current search versus IDs that already have a text file.
list_of_cases = cases_df['id'].tolist()
list_of_created_cases = [case.replace('.txt', '') for case in os.listdir(dataPath)]
# Use a set for the membership test: comparing two lists is O(n*m), which gets
# slow once thousands of cases are on disk.
created_lookup = set(list_of_created_cases)
list_to_create = [case for case in list_of_cases if case not in created_lookup]
if len(list_to_create) > 0:
    print(f"Creating {len(list_to_create)} new cases.")
    df_to_save = cases_df[cases_df['id'].isin(list_to_create)]
    save_case_text(df_to_save)
    print("Done")

### Create and clean full dataframe

Create:

In [None]:
dataPath = os.getcwd() + '/court case data/testdata/'
caseCount = len(os.listdir(dataPath))
data = []
# Remove the macOS folder-metadata file so it is not read as a case text.
# Only file-system errors are treated as "nothing to remove"; a bare `except`
# would also hide KeyboardInterrupt and programming errors.
try:
    os.remove(dataPath + ".DS_Store")
except OSError:
    print("No file DS_Store")
for filename in os.listdir(dataPath):
    # Close each file right after reading instead of leaking one handle per case.
    with open(os.path.join(dataPath, filename), encoding='utf-8') as f:
        data.append([filename.replace('.txt', ''), f.read()])

verdict_df = pd.DataFrame(data, columns=["id", "case text"])
cases_df = pd.read_csv('./court case data/testdata.csv')
# Left join: cases without a saved text file keep a missing "case text".
merged_df = cases_df.join(verdict_df.set_index('id'), on='id', how='left')

merged_df["verdict_date"] = pd.to_datetime(merged_df["verdict_date"])
merged_df["publication_date"] = pd.to_datetime(merged_df["publication_date"])


In [None]:
print(merged_df.columns)

Clean:

In [None]:
# Count the words of all case texts before stop-word removal.
total_len_uncleaned = 0
for currtext in merged_df["case text"]:
    if not isinstance(currtext, str):
        # Missing text (the case has no saved file). Skip it: the original
        # fell through and added an undefined or stale `length` for these rows.
        print('Please update the dataset')
        continue
    total_len_uncleaned += len(currtext.split())
print(f"Total words in uncleaned dataset: {total_len_uncleaned}")

In [None]:
cleaned_df = merged_df.copy()
# A set gives constant-time lookups; testing every word against the list is
# needlessly slow on full case texts.
stopword_lookup = set(dutch_stopwords)
# Cases without a saved text (missing after the left join) become an empty
# string instead of raising AttributeError on `.split()`.
# NOTE(review): the match is case-sensitive, so capitalised stop words are kept
# if the NLTK list is lower-case only; confirm that is intended.
cleaned_df['case text'] = cleaned_df['case text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopword_lookup)
    if isinstance(x, str) else ''
)
print("Done")

In [None]:
# Word count after stop-word removal, compared with the uncleaned total.
total_len_cleaned = sum(len(text.split()) for text in cleaned_df["case text"])

removed_words = total_len_uncleaned - total_len_cleaned
print(f"Original count: {total_len_uncleaned} \n Cleaned count: {total_len_cleaned} \n Difference: {removed_words}")
print(f"{round(removed_words / total_len_uncleaned * 100, 2)}% decrease")


### Creating some plots and metrics

In [None]:
# One counter per jurisdiction label of interest. "Civiel recht" was listed
# twice in the original literal; a dict keeps only one entry per key anyway.
jurisdiction_types_dict = {
    "Strafrecht": 0,
    "Civiel recht": 0,
    "Verbintenissenrecht": 0,
    "Bestuursrecht": 0,
    "Belastingrecht": 0,
    "Materieel strafrecht": 0,
    "Personen- en familierecht": 0,
    "Ambtenarenrecht": 0,
    "Socialezekerheidsrecht": 0,
    "Insolventierecht": 0,
    "Penitentiair strafrecht": 0,
    "Bestuursprocesrecht": 0,
    "Arbeidsrecht": 0,
    "Burgerlijk procesrecht": 0,
    "Europees strafrecht": 0,
    "Omgevingsrecht": 0,
    "Internationaal strafrecht": 0,
    "Internationaal publiekrecht": 0,
    "Strafprocesrecht": 0
}

# Containment test per label; a case can count towards several labels.
# NOTE(review): presumably jurisdiction_type is the text form of a list of
# labels read back from the CSV, so this is a substring match; confirm.
for jurisdiction_field in cleaned_df['jurisdiction_type']:
    for jurisdiction in jurisdiction_types_dict:
        if jurisdiction in jurisdiction_field:
            jurisdiction_types_dict[jurisdiction] += 1

bar_positions = range(len(jurisdiction_types_dict))
plt.bar(bar_positions, list(jurisdiction_types_dict.values()), align='center')
plt.xticks(bar_positions, list(jurisdiction_types_dict.keys()), rotation=90)
plt.ylabel('Number of cases')
plt.title('Cases per jurisdiction type')
plt.show()

In [None]:
# Take labels and counts from the same value_counts() result. The original
# paired unique() (order of first appearance) with value_counts() (sorted by
# frequency), so a bar could get the wrong label, and it assumed exactly two
# verdict types.
verdict_type_tally = cleaned_df['verdict_type'].value_counts()
verdict_types = verdict_type_tally.index.tolist()
verdict_types_counts = verdict_type_tally.tolist()

plt.bar(verdict_types, verdict_types_counts)
plt.title('Verdict type counts')


In [None]:
plt.rcParams["figure.figsize"] = (20,5)

cleaned_df_date_plot = cleaned_df["verdict_date"].value_counts().resample("1M").sum()
plt.plot(cleaned_df_date_plot)

In [None]:
print(cleaned_df["verdict_date"].value_counts())

### Filter for trafficking keywords

In [None]:
print(f"Original case count: {len(cleaned_df)}")

# Substrings that mark a case text as trafficking-related.
trafficking_words = ["smokkel", "transport", "invoer", "import", "export"]

# Select the matching rows with a boolean mask instead of appending them one by
# one: DataFrame.append is quadratic and was removed in pandas 2.0.
mentions_trafficking = cleaned_df["case text"].apply(
    lambda text: isinstance(text, str) and any(word in text for word in trafficking_words)
).astype(bool)
# .copy() so that columns added to trafficking_df later do not write into a
# view of cleaned_df.
trafficking_df = cleaned_df[mentions_trafficking].copy()

print(f"Filtered for trafficking:  {len(trafficking_df)} articles.")

In [None]:
def get_links(df):
    """Print the list of verdict-page URLs for the cases in ``df``.

    Each URL is ``complete_case_url`` followed by the value of the ``id``
    column with ``-`` turned back into ``:``. Nothing is returned.
    """
    link_list = [
        complete_case_url + case_id.replace('-', ':')
        for case_id in df['id']
    ]
    print(link_list)

In [None]:
get_links(trafficking_df)

In [None]:
print(trafficking_df[-20:])

In [None]:
countries_df = pd.read_csv('dutch-countries.csv', encoding = "ISO-8859-1")
countries_list = list(countries_df["Countries"])
# countries_list.remove('Nederland')

In [None]:
# For every trafficking case, list the country names that occur in its text
# (plain substring match, in the order of countries_list).
countries = [
    [country for country in countries_list if country in case_text]
    for case_text in trafficking_df['case text']
]

trafficking_df['countries'] = countries

In [None]:
print(trafficking_df[-20:]['countries'])

In [None]:
from collections import Counter

resample = "1Y"
# Countries with this many mentions or fewer are left out of the plot.
MIN_MENTIONS = 20

# Deliberately not called `index`: that module-level name holds the column list
# used by create_cases_df, and rebinding it here broke any later call to that
# function on the same kernel.
yearly_index = cleaned_df["verdict_date"].value_counts().resample(resample).sum().index

complete_country_count_df = pd.DataFrame(index=yearly_index)
for country in countries_list:
    # Verdict dates of all trafficking cases that mention this country.
    country_mention_date = [
        verdict_date
        for mentioned, verdict_date in zip(trafficking_df['countries'], trafficking_df['verdict_date'])
        if country in mentioned
    ]
    if len(country_mention_date) > MIN_MENTIONS:
        counted = Counter(country_mention_date)
        sorted_dict = dict(sorted(counted.items(), key=lambda item: item[0]))
        dates = list(sorted_dict.keys())
        values = list(sorted_dict.values())
        # Mentions per date, aggregated into the resample period.
        country_df = pd.DataFrame(values, index=dates).resample(resample).sum()
        complete_country_count_df[country] = country_df
        plt.plot(country_df, label=country)

plt.legend()
plt.show()

        
    
    

In [None]:
plt.rcParams["figure.figsize"] = (20,10)

complete_country_count_df = complete_country_count_df.fillna(0)
plt.plot(complete_country_count_df[20:])
plt.legend(complete_country_count_df.columns.tolist())
plt.show()

### TF-IDF / K means

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.decomposition import PCA
import numpy as np

documents = trafficking_df["case text"]
total_stop_words = stopwords.words('dutch') + stopwords.words('english')
# vectorizer = TfidfVectorizer(max_df=0.20, min_df=10, stop_words=total_stop_words , use_idf=True,  lowercase=True)
vectorizer = TfidfVectorizer(min_df=10, stop_words=total_stop_words , use_idf=True,  lowercase=True)


X = vectorizer.fit_transform(documents)
X = X.astype('float32')

Create elbow curve to determine best k value

In [None]:
# Fit k-means for k = 1..19 and record the inertia of each fit.
distortions = []
K = range(1,20)
for k in K:
    print(k)
    # Fixed seed: k-means starts from random centroids, so without it the
    # curve changes on every run.
    kmeanModel = KMeans(n_clusters=k, random_state=42)
    kmeanModel.fit(X)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
true_k = 8
# Fixed seed so the cluster numbers written to trafficking_df, and the terms
# reported for them afterwards, are the same on every run.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, random_state=42)

labels = model.fit_predict(X)
trafficking_df["K means cluster"] = labels

# Project the TF-IDF vectors onto two principal components for plotting.
X_array = X.toarray()
pca = PCA(n_components=2).fit(X_array)
XPCA = pca.transform(X_array)

u_labels = np.unique(labels)
for i in u_labels:
    plt.scatter(XPCA[labels == i , 0] , XPCA[labels == i , 1] , label = i)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('K-means clusters of trafficking cases (PCA projection)')
plt.legend()
plt.show()

In [None]:
print("Top terms per cluster:")
# Feature indices of every centroid, highest weight first.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# `get_feature_names` was removed in scikit-learn 1.2. Use its replacement and
# fall back to the old name on installations that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
# One column per cluster holding its 19 highest-weighted terms.
terms_df = pd.DataFrame({
    cluster: [terms[ind] for ind in order_centroids[cluster, :19]]
    for cluster in range(true_k)
})

print(terms_df)

print(trafficking_df["K means cluster"].value_counts())

### 

### 

# NLP

### BERTje

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, AutoModel
import torch


#### Fill Mask

In [None]:
pipe = pipeline('fill-mask', model='GroNLP/bert-base-dutch-cased')

In [None]:
for res in pipe('Mijn naam is [MASK].'):
    print(res['sequence'])

#### NER

In [None]:
nlp = pipeline('ner', model='wietsedv/bert-base-dutch-cased-finetuned-conll2002-ner')


In [None]:
example = "Mijn naam is Cas en mijn moeder heet Susanne en samen wonen wij in Griekenland"
example = "- voornoemde (hoeveelheid) cocaïne aan boord van voornoemd luchtvaartuig (op Curaçao) verstopt in en/of gebracht naar en/of getracht te verstoppen in een of meer zogenaamde vuilnistrolley('s) en/of cateringtrolley('s) in het cateringgedeelte van voornoemd luchtvaartuig. "
ner_results = nlp(example)
ner_results

In [None]:
trafficking_df.iloc[4]

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")
model = AutoModel.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [None]:
nlp = pipeline('ner', model=model, tokenizer=tokenizer)

In [None]:
example = "hallo mijn naam is cas"

ner_results = nlp(example)
ner_results

### robBert

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [None]:
pipe = pipeline('fill-mask', model='pdelobelle/robbert-v2-dutch-base')

In [None]:
for res in pipe('Hallo, ik ben RobBERT, een <mask> taalmodel van de KU Leuven.'):
    print(res['sequence'])

In [None]:
ner = pipeline('ner', model='pdelobelle/robbert-v2-dutch-ner')

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-ner")

# NOTE(review): `model` also names the KMeans estimator earlier in the notebook;
# re-running the "Top terms per cluster" cell after this one will fail.
# Consider a distinct name.
model = AutoModelForTokenClassification.from_pretrained("pdelobelle/robbert-v2-dutch-ner")
# Name the task explicitly: pipeline() can only infer it when the model is
# passed as a hub ID string, not as an already-loaded model object.
ner = pipeline('ner', model=model, tokenizer=tokenizer)

In [None]:
ner("Hallo mijn naam is Cas.")

### SpaCy

In [None]:
!python -m spacy download nl_core_news_md

In [None]:
import spacy
from spacy.lang.nl.examples import sentences 

nlp = spacy.load("nl_core_news_md")
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)

In [None]:
nlp = spacy.load('nl_core_news_md')

In [None]:
def show_ents(doc):
    """Print every GPE entity of ``doc`` as ``text - label - explanation``.

    Prints ``No named entities found.`` when ``doc`` has no entities at all.
    Nothing is printed when it has entities but none of them is a GPE.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        if ent.label_ != "GPE":
            continue
        print(' - '.join([ent.text, ent.label_, str(spacy.explain(ent.label_))]))
        
def check_place(chunk):
    """Return True when ``chunk`` contains at least one GPE entity.

    Parameters
    ----------
    chunk : object with an ``ents`` iterable (for example a spaCy ``Doc``)
        Each entity must expose a ``label_`` attribute.

    Returns
    -------
    bool
        False when there are no entities or none of them is a GPE.
    """
    # Inspect the argument itself. The original read the global `doc` and left
    # `has_GPE` unbound when there were no entities.
    return any(ent.label_ == "GPE" for ent in chunk.ents)

In [None]:
# NOTE(review): `case_text_1` is not defined anywhere in the visible part of
# this notebook, so this cell raises NameError on a fresh kernel.
# TODO: confirm which case text was intended.
doc = nlp(case_text_1)
show_ents(doc)


# SpaCy Dependency & NER analysis

In [None]:
from spacy import displacy


In [None]:
selected_case = merged_df[merged_df['id'] == "ECLI-NL-GHARN-2004-AO7555"].iloc[0]

In [None]:
case_text = selected_case['case text']

In [None]:
example_text = "Op 9 december 2013 is door douanemedewerkers 300 kg cocaïne in de haven van Rotterdam onderschept. De container waarin de cocaïne zat, was afkomstig uit Costa Rica en moest worden vervoerd naar de [adres] te Rotterdam om daar te worden gekeurd."




In [None]:
import re
split_case_text = re.split('\n', example_text)

def check_place(chunk):
    """Return True when ``chunk`` contains at least one GPE entity.

    Parameters
    ----------
    chunk : object with an ``ents`` iterable (for example a spaCy ``Doc``)
        Each entity must expose a ``label_`` attribute.

    Returns
    -------
    bool
        False when there are no entities or none of them is a GPE.
    """
    # Inspect the argument itself; the original ignored `chunk` and read the
    # global `doc` instead.
    return any(ent.label_ == "GPE" for ent in chunk.ents)

def show_GPE(doc):
    """Print every entity of ``doc`` as ``text - label - explanation``.

    Despite the name, no GPE filter is applied: all entity labels are printed.
    Prints ``No named entities found.`` when ``doc`` has no entities.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        print(f"{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}")
        
        
# Keep only the non-empty chunks in which a place (GPE) is recognised.
filtered_case_text = []
for line in split_case_text:
    if line == '':
        continue
    doc = nlp(line)
    if check_place(doc):
        filtered_case_text.append(line)



In [None]:

displacy.render(nlp(example_text), style="ent")

In [292]:
bb = '''Het hof heeft bij de straftoemeting in het bijzonder in aanmerking genomen -en vindt daarin de redenen die tot de keuze van een onvoorwaardelijke vrijheidsstraf van na te melden duur leiden- dat verdachte zich op grote schaal heeft schuldig gemaakt aan smokkel van cocaïne en XTC-pillen van Nederland naar Duitsland, Italië, de Verenigde Staten van Amerika, Canada en Zweden. De verdovende middelen waren kennelijk bestemd om verder te worden verspreid onder personen die aan bovengenoemde drugs verslaafd zijn. Door aldus te handelen heeft verdachte de volksgezondheid in gevaar gebracht.

Daarnaast heeft verdachte een vervalst paspoort in zijn bezit gehad, waardoor hij het maatschappelijk vertrouwen in de echtheid van reisdocumenten heeft geschaad.

Tevens neemt het hof bij de straftoemeting in aanmerking dat verdachte in het verleden bij herhaling is veroordeeld wegens het plegen van strafbare feiten.''' 

displacy.render(nlp(bb), style='ent')



In [293]:
for ent in nlp(bb).ents:
    if ent.label_ == "GPE":
        print(ent)


Nederland
Duitsland
Italië
Verenigde Staten van Amerika
Canada
Zweden


In [294]:
displacy.render(nlp(bb),style='dep')

In [295]:
type_1 = ["van", "uit", "vanuit"]
type_2 = ["naar"]

In [317]:

def extract_details(text):
    """Summarise source, destination and drug type found in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. A GPE token with
    an adposition child from ``type_1`` becomes the source, one with an
    adposition child from ``type_2`` the destination; when several tokens
    qualify, the last one wins. Conjuncts of the destination are appended to
    it, and a DRUG noun among the source's ancestors (plus its noun conjuncts)
    becomes the drug type.

    Returns
    -------
    str
        ``"Source: ...\\n Destination: ...\\n Drug: ..."``; parts that were not
        found read ``None``.
    """
    source = destination = drug_type = None

    # Pass 1: classify places by the adposition attached to them.
    for token in nlp(text):
        if token.ent_type_ != "GPE":
            continue
        for child in token.children:
            if child.pos_ != "ADP":
                continue
            if child.text in type_1:
                source = token
            elif child.text in type_2:
                destination = token
            # Any other adposition is ignored.

    # Pass 2: extend the destination with coordinated places ("X en Y").
    if destination is not None:
        for child in destination.children:
            is_nominal = child.pos_ in ("NOUN", "PROPN")
            if not (is_nominal and child.dep_ == "conj"):
                continue
            country = child.text
            # Multi-word names hang off the head as "flat" proper nouns.
            for grandchild in child.children:
                if grandchild.pos_ == "PROPN" and grandchild.dep_ == "flat":
                    country = f"{country} {grandchild.text}"
            destination = f"{destination}, {country}"

    # Pass 3: find the drug noun the source place modifies.
    if source is not None:
        for ancestor in source.ancestors:
            is_drug_noun = (
                ancestor.pos_ == "NOUN"
                and ancestor.dep_ == "nmod"
                and ancestor.ent_type_ == "DRUG"
            )
            if not is_drug_noun:
                continue
            drug_type = ancestor.text
            for child in ancestor.children:
                if child.pos_ == "NOUN" and child.dep_ == "conj":
                    drug_type = f"{drug_type}, {child}"

    return f"Source: {source}\n Destination: {destination}\n Drug: {drug_type}"

In [318]:
print(extract_details(bb))

Source: None
 Destination: Duitsland, Italië
 Drug: None


In [None]:
print(extract_details(case_text))

In [None]:
for token in nlp(example_text):
    if token.ent_type_ == "GPE":
        # Check if related to an adposition:
        for child in token.children:
            if child.pos_ == "ADP":
                print(child, token)

In [None]:
for token in nlp(bb):
    if token.head.pos_ == "VERB":
        print(token)

In [None]:
displacy.serve(nlp(bb), style='dep')


In [311]:
bb = "Het hof heeft bij de straftoemeting in het bijzonder in aanmerking genomen -en vindt daarin de redenen die tot de keuze van een onvoorwaardelijke vrijheidsstraf van na te melden duur leiden- dat verdachte zich uit winstbejag aan smokkel van cocaïne naar Duitsland en Italië heeft schuldig gemaakt."
displacy.render(nlp(bb), style="ent")



In [312]:
displacy.render(nlp(bb), style="dep")

# Pipeline

## Split chunks by \n

In [585]:
import re

selected_case = merged_df[merged_df['id'] == "ECLI-NL-GHARN-2004-AO7555"].iloc[0]
case_text = selected_case['case text']
split_case_text = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n', case_text)


In [586]:
# Read the drug vocabulary, one name per line; the context manager closes the
# file, which the original never did.
with open("drugs list.txt", "r", encoding='utf-8') as drugs_file:
    my_file = drugs_file.readlines()

# Strip the line breaks and drop blank lines: an empty string is a substring of
# every text, so a single blank line would mark every chunk as drug-related.
drugs_list = [line.replace('\n', '') for line in my_file]
drugs_list = [drug for drug in drugs_list if drug.strip()]

## Keep chunk if contains drug

In [587]:
# Keep the chunks that mention at least one drug name (substring match).
relevant_chunks = [
    chunk
    for chunk in split_case_text
    if any(drug in chunk for drug in drugs_list)
]

for chunk in relevant_chunks:
    print(chunk, "\n")

hij in of omstreeks de periode van 1 januari 2001 tot en met 19 januari 2002 te Hengelo (O) en/althans elders in Nederland tezamen en in vereniging met een ander althans alleen meermalen althans eenmaal (telkens) opzettelijk buiten het grondgebied van Nederland heeft gebracht hoeveelheden/een hoeveelheid van een materiaal bevattende heroïne (diacetylmorfine) en/of een hoeveelheid van een materiaal bevattende cocaïne en/of een hoeveelheid XTC-pillen in elk geval een hoeveelheid van een materiaal bevattende MDA en/of MDMA en/of MDEA zijnde heroïne en/of cocaïne en/of MDA/MDMA/MDEA een middel vermeld op de bij de Opiumwet behorende lijst I dan wel aangewezen krachtens het tweede of derde lid van artikel 2 van die wet te weten: 

- op of omstreeks 18 januari 2002 te Hengelo (O) althans in arrondissement (ongeveer) een hoeveelheid (ongeveer 3 kilogram althans 1953 gram) cocaïne en/of 

- op of omstreeks 27 juli 2001 te Hengelo (O) althans in het arrondissement Almelo een hoeveelheid (ongeve

In [588]:
def get_chunks(case):
    """Return the chunks of one case text that mention a drug from ``drugs_list``.

    Parameters
    ----------
    case : str
        Case ID; ``:`` is replaced by ``-`` before it is looked up in the
        ``id`` column of ``merged_df``.

    Returns
    -------
    list of str
        The case text split on line breaks and on the sentence-boundary
        pattern below, keeping only chunks that contain a drug name as a
        substring.
    """
    parsed_id = case.replace(':', '-')
    matching_rows = merged_df[merged_df['id'] == parsed_id]
    case_text = matching_rows.iloc[0]['case text']

    chunk_boundary = r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n'
    return [
        chunk
        for chunk in re.split(chunk_boundary, case_text)
        if any(drug in chunk for drug in drugs_list)
    ]

## Analyze individual sentences

In [589]:
import spacy
from spacy import displacy
# !python -m spacy download nl_core_news_md
nlp = spacy.load('nl_core_news_md')

In [591]:
# One single-token DRUG pattern per entry of the drug list.
drugs_ent_list = [
    {"label": "DRUG", "pattern": [{"lower": drug}]}
    for drug in drugs_list
]

# A number followed by the token "gram" is tagged as a QUANTITY.
quantity_rule = {"label": "QUANTITY", "pattern": [{"IS_DIGIT": True}, {"LOWER": "gram"}]}

drugs_ent_list.append(quantity_rule)


Add drugs to the model

In [592]:
config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": True,
   "ent_id_sep": "||",
}
# Make the cell re-runnable: adding a second component named "entity_ruler" to
# the same pipeline raises, so drop a ruler left over from an earlier run.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
ruler = nlp.add_pipe("entity_ruler", config=config)

# Entity patterns: the DRUG and QUANTITY rules built in the previous cell.
# (Two earlier hard-coded `patterns` lists were overwritten before use and
# have been removed.)
patterns = drugs_ent_list

ruler.add_patterns(patterns)


In [593]:
def render_entities(txt):
    """Parse ``txt`` with the module-level ``nlp`` and render its entities with displaCy."""
    displacy.render(nlp(txt), style="ent")
    
def get_entities(txt):
    """Parse ``txt`` with the module-level ``nlp`` and print each entity with its label."""
    for entity in nlp(txt).ents:
        print(entity, entity.label_)

In [594]:
for i in relevant_chunks:
#     render_entities(i)
    get_entities(i)
    print("\n")

1 januari 2001 DATE
19 januari 2002 DATE
Hengelo GPE
Nederland GPE
Nederland GPE
heroïne DRUG
diacetylmorfine DRUG
cocaïne DRUG
XTC-pillen DRUG
MDMA DRUG
MDEA DRUG
heroïne DRUG
cocaïne DRUG
tweede ORDINAL
derde ORDINAL
2 CARDINAL


18 januari 2002 DATE
Hengelo GPE
3 kilogram QUANTITY
1953 gram QUANTITY
cocaïne DRUG


27 juli 2001 DATE
Hengelo GPE
Almelo GPE
62.000 CARDINAL
XTC-pillen DRUG
MDMA DRUG
MDEA DRUG


18 december 2001 DATE
Hengelo GPE
Almelo GPE
XTC-pillen DRUG
MDMA DRUG
MDEA DRUG
tweede ORDINAL
derde ORDINAL
2 CARDINAL


14 december 2001 DATE
december 2001 DATE
Hengelo GPE
Almelo GPE
zeven CARDINAL
7 CARDINAL
cocaïne DRUG
tweede ORDINAL
derde ORDINAL
2 CARDINAL


XTC-pillen DRUG
MDMA DRUG
MDEA DRUG
Nederland GPE


1 januari 2001 DATE
19 januari 2002 DATE
Hengelo GPE
Nederland GPE
Nederland GPE
cocaïne DRUG
I PERSON


18 januari 2002 DATE
Hengelo GPE
Almelo GPE
1953 gram QUANTITY
cocaïne DRUG


14 december 2001 DATE
december 2001 DATE
Hengelo GPE
Almelo GPE
zeven CARDINAL
coca

In [595]:

def extract_info(txt):
    """Print one record for every DRUG token found in ``txt``.

    The text is processed with the global ``nlp`` pipeline. For each token whose
    entity type is ``DRUG`` a dict is built and printed with these entries:

    * ``'drug'``: the drug token itself.
    * a location entry: the GPE/LOC tokens found among the children of the drug
      token's ancestors (plus their GPE/LOC conjuncts). The key is the
      adposition token attached to such a location (dependency ``case``), or
      the string ``'location'`` when no adposition was found. Only present
      when at least one location was found.
    * ``'volume'`` / ``'volume_type'``: taken from QUANTITY/CARDINAL tokens among
      the same relatives. Of the pair (entity token, one of its children) the
      all-digit one becomes the volume and the other the volume type. Only
      present when such a pair exists.

    Args:
        txt: Text to analyse.

    Returns:
        None. The records are written to stdout.
    """
    location_labels = ("GPE", "LOC")
    quantity_labels = ("QUANTITY", "CARDINAL")

    for token in nlp(txt):
        if token.ent_type_ != "DRUG":
            continue

        info = {"drug": token}

        # Every token that hangs directly off one of the drug token's ancestors.
        # Collected once because both searches below walk the same set.
        relatives = [
            child for ancestor in token.ancestors for child in ancestor.children
        ]

        ## Get source and destination
        countries = []
        # Reset per drug token: otherwise the key is either undefined (first
        # token without an adposition) or silently inherited from an earlier token.
        adposition = None
        for relative in relatives:
            if relative.ent_type_ not in location_labels:
                continue
            countries.append(relative)
            for child in relative.children:
                # The conj requirement applies to both labels; without the
                # grouping, `and` binds tighter than `or` and any LOC child
                # would be accepted regardless of its dependency.
                if child.dep_ == "conj" and child.ent_type_ in location_labels:
                    countries.append(child)
                elif child.pos_ == "ADP" and child.dep_ == "case":
                    adposition = child
        if countries:
            # NOTE(review): all locations are filed under the last adposition
            # seen, as before; confirm whether they should be grouped per adposition.
            key = adposition if adposition is not None else "location"
            info[key] = countries

        ## Get volume
        for relative in relatives:
            if relative.ent_type_ not in quantity_labels:
                continue
            for child in relative.children:
                # Only a digit/non-digit pair can be split into amount and unit.
                if child.is_digit == relative.is_digit:
                    continue
                if child.is_digit:
                    info["volume"], info["volume_type"] = child, relative
                else:
                    info["volume"], info["volume_type"] = relative, child

        print(info)
    
# Try the extractor on a single chunk (index 1 of relevant_chunks).
extract_info(relevant_chunks[1])

{'drug': cocaïne, te: [Hengelo], 'volume': 1953, 'volume_type': gram}


In [596]:
# Run the extractor over each of the relevant chunks in turn.
for chunk in relevant_chunks:
    extract_info(chunk)

{'drug': heroïne, in: [Nederland]}
{'drug': diacetylmorfine, in: [Nederland]}
{'drug': cocaïne, in: [Nederland]}
{'drug': XTC-pillen, in: [Nederland]}
{'drug': MDMA, in: [Nederland]}
{'drug': MDEA, in: [Nederland]}
{'drug': heroïne, in: [Nederland]}
{'drug': cocaïne, in: [Nederland]}
{'drug': cocaïne, te: [Hengelo], 'volume': 1953, 'volume_type': gram}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': cocaïne}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': cocaïne}
{'drug': cocaïne, 'volume': 1953, 'volume_type': gram}
{'drug': cocaïne}
{'drug': cocaïne, naar: [Duitsland, Italië]}
{'drug': cocaïne}


In [597]:
# Draw the dependency parse (style="dep") of chunk 6 to inspect how its tokens attach to each other.
displacy.render(nlp(relevant_chunks[6]), style="dep")

In [598]:

# Run the extractor over every chunk get_chunks returns for this ECLI identifier.
for chunk in get_chunks("ECLI:NL:GHARN:2004:AO7558"):
    extract_info(chunk)

{'drug': heroïne, in: [Nederland]}
{'drug': diacetylmorfine, in: [Nederland]}
{'drug': cocaïne, in: [Nederland]}
{'drug': XTC-pillen, in: [Nederland]}
{'drug': MDMA, in: [Nederland]}
{'drug': MDEA, in: [Nederland]}
{'drug': cocaïne, in: [Nederland]}
{'drug': cocaïne, te: [Hengelo], 'volume': 1953, 'volume_type': gram}
{'drug': XTC-pillen, te: [Hengelo]}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': cocaïne, 'volume': 7, 'volume_type': zeven}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': XTC-pillen}
{'drug': MDMA}
{'drug': MDEA}
{'drug': cocaïne}
{'drug': XTC-pillen}
{'drug': MDM

In [599]:
# Run the extractor over every chunk get_chunks returns for this ECLI identifier.
for chunk in get_chunks("ECLI:NL:RBOBR:2014:5570"):
    extract_info(chunk)

{'drug': hennep}
{'drug': hasjiesj}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': XTC-pillen}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': XTC-pillen}
{'drug': hasjiesj}
{'drug': hennep, 'volume': 550, 'volume_type': gram}
{'drug': XTC-pillen}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': hennep, 'volume': 550, 'volume_type': gram}
{'drug': hasjiesj}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': XTC-pillen}
{'drug': hennep}
{'drug': hasjiesj}
{'drug': hennep}
{'drug': hasjiesj}


In [600]:
# Show the chunks get_chunks returns for this ECLI identifier (the bare expression is the cell's output).
get_chunks("ECLI:NL:RBOBR:2014:5570")

['Opiumwet ongeveer 550 gram in elk geval een hoeveelheid hennep en/of ',
 'ongeveer 6531 gram in elk geval een hoeveelheid hasjiesj zijnde hennep ',
 'en/of hasjiesj (telkens) een middel als bedoeld in de bij die wet behorende ',
 'waarin die hennep en/of hasjiesj (telkens) heimelijk in de bodem waren ',
 'Verdachte bekent dat hij de twee koffers waarin de aangetroffen hennep en hasjiesj die hij op de luchthaven bij zich had wilde uitvoeren naar het buitenland.',
 'Verdachte ontkent dat hij wist dat in de koffers naast XTC-pillen ook hennep en hasjiesj was verstopt.',
 'De verdediging heeft bepleit dat verdachte dan ook geen opzet ook niet in voorwaardelijke zin had op het uitvoeren van de hennep en hasjiesj. ',
 'Onder die bodembekleding werden 3 geplastificeerde pakketten aangetroffen met daarin totaal ongeveer 56.702 XTC-pillen 6531 gram hasjiesj en 550 gram hennep.',
 'Gelet op deze omstandigheden concludeert de rechtbank dat verdachte toen hij de koffers meenam om uit te voeren w

In [601]:
# List the entities of each chunk for this ECLI identifier, leaving a gap between chunks.
for chunk in get_chunks("ECLI:NL:RBOBR:2014:5570"):
    get_entities(chunk)
    print("\n")

550 gram QUANTITY
hennep DRUG


6531 gram QUANTITY
hasjiesj DRUG
hennep DRUG


hasjiesj DRUG


hennep DRUG
hasjiesj DRUG


twee CARDINAL
hennep DRUG
hasjiesj DRUG


XTC-pillen DRUG
hennep DRUG
hasjiesj DRUG


hennep DRUG
hasjiesj DRUG


3 CARDINAL
56.702 CARDINAL
XTC-pillen DRUG
6531 gram QUANTITY
hasjiesj DRUG
550 gram QUANTITY
hennep DRUG


XTC-pillen DRUG
hennep DRUG
hasjiesj DRUG


16 april 2014 DATE
Eindhoven GPE
Nederland GPE
1 CARDINAL
5 CARDINAL
550 gram QUANTITY
hennep DRUG


6531 gram QUANTITY
hasjiesj DRUG
hennep DRUG
hasjiesj DRUG
twee CARDINAL
hennep DRUG
hasjiesj DRUG
Portugal GPE


XTC-pillen DRUG
hennep DRUG
hasjiesj DRUG


hennep DRUG
hasjiesj DRUG


De hennephandel LOC




In [602]:
# Draw the dependency parse (style="dep") of chunk 7 that get_chunks returns for this ECLI identifier.
displacy.render(nlp(get_chunks("ECLI:NL:RBOBR:2014:5570")[7]), style="dep")