# Create dataset

### Create

In [None]:
import pandas as pd
import os

In [None]:
dataPath = os.getcwd() + '/court case data/testdata/'
caseCount = len(os.listdir(dataPath))
data = []
try:
    os.remove(dataPath + ".DS_Store")
except:
    print("No file DS_Store")
for filename in os.listdir(dataPath):
    f = open(os.path.join(dataPath, filename), encoding='utf-8')
    data.append([filename.replace('.txt', ''), f.read()])

verdict_df = pd.DataFrame(data, columns=["id", "case text"])
cases_df = pd.read_csv('./court case data/testdata.csv')
merged_df = cases_df.join(verdict_df.set_index('id'), on='id', how='left')

merged_df["verdict_date"] = pd.to_datetime(merged_df["verdict_date"])
merged_df["publication_date"] = pd.to_datetime(merged_df["publication_date"])

In [None]:
def add_inhoudsindicatie_to_case_text(df):
    for index, row in df.iterrows():
        fulltext = row['case text'] + row['inhoudsindicatie']
        df.at[index,'case text']= fulltext
    return df

merged_df = add_inhoudsindicatie_to_case_text(merged_df)

In [None]:
merged_df.to_pickle("merged_df.pkl")

In [None]:
merged_df

### Load

In [2]:
import pandas as pd
import os

In [3]:
merged_df = pd.read_pickle("merged_df.pkl")
merged_df

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,inhoudsindicatie,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],"Productie synthetische drugs, medeplegen, prod...",\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],Leveren grondstoffen synthetische drugs en sto...,\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],plegen van voorbereidingshandelingen ten behoe...,\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
...,...,...,...,...,...,...,...
18457,ECLI-NL-RBAMS-2013-1294,2013-01-29,2013-08-10,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat er sprake is van...,\n\nRECHTBANK AMSTERDAM\n\n\nVONNIS\n\n \n\n13...
18458,ECLI-NL-PHR-2020-1106,2020-11-24,2020-11-24,conclusie,['Strafrecht'],Conclusie AG. Vervolging van een politieagent ...,\n\nPROCUREUR-GENERAAL\n\n\nBIJ DE\n\n\nHOGE R...
18459,ECLI-NL-GHAMS-2017-2618,2017-06-29,2017-05-07,uitspraak,['Strafrecht'],Liquidatieproces Passage\n\n ...,\n\n\nparketnummer: 23-001217-13\ndatum uitspr...
18460,ECLI-NL-RBAMS-2013-BZ0392,2013-01-29,2013-01-02,uitspraak,['Strafrecht'],De rechtbank acht bewezen dat verdachte zich s...,\nRECHTBANK AMSTERDAM \nVONNIS \n\n13/529144-...


# Split sentences

In [4]:
import re

In [5]:
def split_into_sentences(text):
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n', text)
    sentences = [x for x in sentences if len(x) > 1]
    return sentences

In [8]:
sentence_list_by_word = []
sentence_list = []

for i in range(len(merged_df)):
    doc = merged_df.iloc[i]['case text']
    sentences = split_into_sentences(doc)
    sentence_list.append(sentences)
    for j in sentences:
        word_list = [x for x in j.lower().rstrip().replace('.', '').split(' ') if len(x)>0]
        sentence_list_by_word.append(word_list)
        
print(len(sentence_list))
print(len(sentence_list_by_word))

18462
5301875


# Create Word2Vec model

In [10]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

In [11]:
dutch_word2vec_model = Word2Vec(sentences=sentence_list_by_word, vector_size=100, window=5, min_count=1, workers=4)
dutch_word2vec_model.save("word2vec_dutch_court_cases.model")

In [12]:
dutch_word2vec_model = Word2Vec.load("word2vec_dutch_court_cases.model")

In [13]:
sims = dutch_word2vec_model.wv.most_similar('xtc', topn=100)
print([i[0] for i in sims])


['mdma', 'speed', 'lsd', 'heroïne', 'amfetamine', 'ketamine', 'xtc-pillen', 'mdma)', 'ghb', 'amfetaminen', 'diazepam', 'methadon', 'xtc-tabletten', 'oxazepam', 'ecstasy', '(mdma)', 'morfine', 'xtc)', 'cocaïne', 'crack', 'cafeïne', 'pillen', 'methamfetamine', 'metamfetamine', 'marihuana', 'a-pvp', 'lidocaïne', 'steroïden', 'anabole', 'pep', 'xtc/mdma', 'paracetamol', 'amfetamines', 'fenacetine', 'pillen)', 'temazepam', 'crystal', 'gbl', 'kamagra', '(xtc)', 'methylfenidaat', 'opiaten', 'inositol', 'mdma,', 'cannabis', '(amfetamine)', 'procaïne', 'weed', '(cocaïne', 'ritalin', '(speed)', '(mdma', 'harddrugs', 'amfetamine)', 'viagra', '(met)amfetamine', 'xtc-', 'versnijdingsmiddel', 'tabletten', 'cocaine', 'capsules', 'hasj', 'coke', 'oxycodon', '2c-b', 'mdma-poeder', 'xtc,', 'cocaïne;', 'xtc-pil', 'pure', 'dexamfetamine', 'valium', '5793847)', 'wiet', '‘vieze’', '34-methyleendioxymethamfetamine', 'meth', 'amfetamine;', 'pmma', 'drugs', 'xtc-pillen,', 'mdma;', 'mdma/xtc', 'speed/amfetamine

# Sandbox