In [1]:
# Analysis of HUDOC documents
# Corpus from 31/01/2019, scraped using https://github.com/ajbarker93/caseScrape

In [2]:
# Import libraries
import json
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from price_parser import Price
from sklearn.linear_model import LinearRegression
from forex_python.converter import CurrencyRates
import seaborn as sns
import datefinder

In [3]:
# Check drive contents
rootdir="/users/adambarker/case_scrape/"
os.chdir(rootdir)
os.listdir()

['docToText',
 'results.pkl',
 '.DS_Store',
 'results0.pkl',
 'README.md',
 'utilities',
 '.gitignore',
 'docToText2',
 'summaries',
 '.ipynb_checkpoints',
 'documents',
 '.git',
 'Analysis.ipynb']

In [4]:
# Import pickle file of results content
results = pd.read_pickle("./results0.pkl")

# Remove all rows where the content failed
results = results.loc[results['content']!='']

In [5]:
# Append violations

reload = 1
if reload:
    # Add additional fields
    results['violation'] = ''
    results['conclusion'] = ''
    results['importance'] = ''
    results['originatingbody'] = '' 
    results['case_date'] = ''

    # Load summaries
    onlyfiles = [f for f in listdir(rootdir+"/summaries/") if isfile(join(rootdir+"/summaries/", f))]
    onlyjsons = [f for f in onlyfiles if f.endswith('.json')]

    # Examine summaries and choose the list of P1-1 relevant cases
    for onlyjson in onlyjsons:
        
        try:
            
            with open(rootdir+"/summaries/"+onlyjson) as json_file: # first open the batches of summaries

                contents = json.load(json_file)
                contents2 = contents['results']

                for i in range(0,len(contents2)): # loop through each case in the batch

                    field_data = contents2[i]['columns']

                    if field_data['itemid'] in results.case_id.values:

                        results.loc[results['case_id'] == field_data['itemid'],['violation']] = field_data['violation']
                        results.loc[results['case_id'] == field_data['itemid'],['conclusion']] = field_data['conclusion']
                        results.loc[results['case_id'] == field_data['itemid'],['importance']] = field_data['importance']
                        results.loc[results['case_id'] == field_data['itemid'],['originatingbody']] = field_data['originatingbody']

                        
        except:
            print("Error")
            
    # Save as pickle again for easy recall
    results.to_pickle("./results.pkl")    

Error


In [6]:
# Inspect some content
results.head()

Unnamed: 0,case,case_id,content,claimed,awarded,num_awarded,num_claimed,violation,conclusion,importance,originatingbody,case_date
6,CASE OF ORAL AND OTHERS v. TURKEY,001-60396,FIRST SECTION CASE OF ORAL AND OTHERS ...,,,,,,Struck out of the list (friendly settlement),4,4.0,
7,CASE OF ULGER v. TURKEY,001-60397,THIRD SECTION CASE OF ÜLGER v. TURKEY...,,,,,,Struck out of the list (friendly settlement),4,4.0,
18,Wessels-Bergervoet c. Pays-Bas,002-5323,Note d’information sur la jurisprudence de la ...,,,,,,Violation de l'art. 14+P1-1;Satisfaction équit...,1,,
19,Olivieira c. Pays-Bas,002-5335,Note d’information sur la jurisprudence de la ...,,,,,,Non-violation de P4-2;Aucune question distinct...,1,,
20,Ali Erol c. Turquie,002-5317,Note d’information sur la jurisprudence de la ...,,,,,,Radiation du rôle (règlement amiable),2,,


In [21]:
# This is a loading script which does all of the io on the directory. It contains:
# Strings to define start and end of relevant sections

reload = 1
if reload:

    # Define a simple set of start and end wildcards for the claimed and awarded sections
    cl_start_str='SECTION'
    cl_end_str='President'
    aw_start_str='SECTION'
    aw_end_str='President'

    # First do English cases
    for i in range(0,len(results)):

        result = results['content'].iloc[i]

        if len(result.split(cl_start_str))>1:

            # Choose text from start wildcard onwards
            to_end = result.split(cl_start_str)[1]
            # Choose text before end wildcard
            from_start = to_end.split(cl_end_str)[0]
            # Add to resuls array
            results['claimed'].iloc[i] = from_start

        if len(result.split(aw_start_str))>1:

            # Repeat for awarded section
            to_end = result.split(aw_start_str)[1]
            from_start = to_end.split(aw_end_str)[0]
            results['awarded'].iloc[i] = from_start
                
    # Then do French cases
    cl_start_str='Note d’information'
    cl_end_str='Cliquez ici pour accéder aux'
    aw_start_str='Note d’information'
    aw_end_str='Cliquez ici pour accéder aux'

    for i in range(0,len(results)):

        if results['claimed'].iloc[i] =='':

            result = results['content'].iloc[i]

            if len(result.split(cl_start_str))>1:

                # Choose text from start wildcard onwards
                to_end = result.split(cl_start_str)[1]
                # Choose text before end wildcard
                from_start = to_end.split(cl_end_str)[0]
                # Add to resuls array
                results['claimed'].iloc[i] = from_start

            if len(result.split(aw_start_str))>1:

                # Repeat for awarded section
                to_end = result.split(aw_start_str)[1]
                from_start = to_end.split(aw_end_str)[0]
                results['awarded'].iloc[i] = from_start         
            
    # Save as pickle again for easy recall
    results.to_pickle("./results.pkl")

In [22]:
# Get case dates

try: 
    for i in range(0,len(results)):

        # Get dates while we're here look at first 40 words
        headline = result.split("JUDGMENT")
        if len(headline)>1:

            specific_split = headline[1].split()
            split_depth = min(20,len(specific_split))
            date_str = ' '.join(specific_split[:split_depth])
            matches = list(datefinder.find_dates(date_str))

            if len(matches) > 1:
                # date returned will be a datetime.datetime object. here we are only using the first match.
                results['case_date'].iloc[i] = matches[1]   
            elif len(matches)==1:
                results['case_date'].iloc[i] = matches[0]


        # Get dates while we're here look at first 40 words
        headline = result.split("Arrêt")
        if len(headline)>1:
            specific_split = headline[1].split()
            split_depth = min(20,len(specific_split))
            date_str = ' '.join(specific_split[:split_depth])
            matches = list(datefinder.find_dates(date_str))
            if len(matches) > 1:
                # date returned will be a datetime.datetime object. here we are only using the first match.
                results['case_date'].iloc[i] = matches[1]   
            elif len(matches)==1:
                results['case_date'].iloc[i] = matches[0]  
                
except:
    print("Error")

In [23]:
#results['case_date'] = ''
results.head()

Unnamed: 0,case,case_id,content,claimed,awarded,num_awarded,num_claimed,violation,conclusion,importance,originatingbody,case_date
6,CASE OF ORAL AND OTHERS v. TURKEY,001-60396,FIRST SECTION CASE OF ORAL AND OTHERS ...,CASE OF ORAL AND OTHERS v. TURKEY (Appl...,CASE OF ORAL AND OTHERS v. TURKEY (Appl...,,,,Struck out of the list (friendly settlement),4,4.0,2002-03-28 00:00:00
7,CASE OF ULGER v. TURKEY,001-60397,THIRD SECTION CASE OF ÜLGER v. TURKEY...,CASE OF ÜLGER v. TURKEY (Application n...,CASE OF ÜLGER v. TURKEY (Application n...,,,,Struck out of the list (friendly settlement),4,4.0,2002-03-28 00:00:00
18,Wessels-Bergervoet c. Pays-Bas,002-5323,Note d’information sur la jurisprudence de la ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,,,,Violation de l'art. 14+P1-1;Satisfaction équit...,1,,
19,Olivieira c. Pays-Bas,002-5335,Note d’information sur la jurisprudence de la ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,,,,Non-violation de P4-2;Aucune question distinct...,1,,
20,Ali Erol c. Turquie,002-5317,Note d’information sur la jurisprudence de la ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,,,,Radiation du rôle (règlement amiable),2,,


In [24]:
# Results

# Load pickle if required
reload = 0
if reload:
    results = pd.read_pickle("./results.pkl") 

results.head()

Unnamed: 0,case,case_id,content,claimed,awarded,num_awarded,num_claimed,violation,conclusion,importance,originatingbody,case_date
6,CASE OF ORAL AND OTHERS v. TURKEY,001-60396,FIRST SECTION CASE OF ORAL AND OTHERS ...,CASE OF ORAL AND OTHERS v. TURKEY (Appl...,CASE OF ORAL AND OTHERS v. TURKEY (Appl...,,,,Struck out of the list (friendly settlement),4,4.0,2002-03-28 00:00:00
7,CASE OF ULGER v. TURKEY,001-60397,THIRD SECTION CASE OF ÜLGER v. TURKEY...,CASE OF ÜLGER v. TURKEY (Application n...,CASE OF ÜLGER v. TURKEY (Application n...,,,,Struck out of the list (friendly settlement),4,4.0,2002-03-28 00:00:00
18,Wessels-Bergervoet c. Pays-Bas,002-5323,Note d’information sur la jurisprudence de la ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,,,,Violation de l'art. 14+P1-1;Satisfaction équit...,1,,
19,Olivieira c. Pays-Bas,002-5335,Note d’information sur la jurisprudence de la ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,,,,Non-violation de P4-2;Aucune question distinct...,1,,
20,Ali Erol c. Turquie,002-5317,Note d’information sur la jurisprudence de la ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,sur la jurisprudence de la Cour 43 Juin 2002 ...,,,,Radiation du rôle (règlement amiable),2,,


In [25]:
# Now find numerical content and try to arrive at a claimed figure

# Define list of allowed currencies
curr = ['EUR','EURO','GBP','USD','Euros','Pounds','Dollars','French','Turkish','FRF','Lira','Turkish Lira']

# Remove all rows where the content failed
results = results.loc[results['claimed']!='']
results = results.loc[results['awarded']!='']

# Define function to find currencies in string
def find_currency(x):
    px = [Price.fromstring(i) for i in x.split(".")]
    px2 = [t.amount for t in px if t.currency in curr]
    return px2

# Update the num claimed col
results['num_claimed'] = results.apply(lambda x: find_currency(str(x['claimed'])),axis=1)
results['num_awarded'] = results.apply(lambda x: find_currency(str(x['awarded'])),axis=1)

# Set the fields to zero if they contain an empty list
results.num_awarded = results.num_awarded.apply(lambda y: [0] if (len(y)==0) else y)
results.num_claimed = results.num_claimed.apply(lambda y: [0] if (len(y)==0) else y)

In [26]:
# Remove all rows where the content failed
results = results.loc[results['violation']!='']

# Inspect results
results.head(10)

Unnamed: 0,case,case_id,content,claimed,awarded,num_awarded,num_claimed,violation,conclusion,importance,originatingbody,case_date
100,AFFAIRE IMMEUBLES GROUPE KOSSER c. FRANCE,001-64897,PREMIÈRE SECTION AFFAIRE IMMEUBLES GROUPE ...,AFFAIRE IMMEUBLES GROUPE KOSSER c. FRANCE ...,AFFAIRE IMMEUBLES GROUPE KOSSER c. FRANCE ...,"[26571387, 50000, 3000]","[26571387, 50000, 3000]",6;6-1,Non-violation de l'art. 6-1 en ce qui concerne...,3,4,
101,AFFAIRE APBP c. FRANCE,001-64899,PREMIÈRE SECTION AFFAIRE APBP c. FRANCE ...,AFFAIRE APBP c. FRANCE (Requête n° 38436/...,AFFAIRE APBP c. FRANCE (Requête n° 38436/...,"[10200630, 15, 3000]","[10200630, 15, 3000]",6;6-1,Non-violation de l'art. 6-1 en ce qui concerne...,3,4,
110,CASE OF VISSER v. THE NETHERLANDS,001-60112,THIRD SECTION CASE OF VISSER v. THE N...,CASE OF VISSER v. THE NETHERLANDS (App...,CASE OF VISSER v. THE NETHERLANDS (App...,[0],[0],6;6-1,Violation of Art. 6-1 and 6-3-d;Pecuniary dama...,2,6,2021-05-29 00:00:00
146,AFFAIRE VOLKWEIN c. ALLEMAGNE,001-64964,TROISIÈME SECTION AFFAIRE VOLKWEIN c. ALLEM...,AFFAIRE VOLKWEIN c. ALLEMAGNE (Requête n°...,AFFAIRE VOLKWEIN c. ALLEMAGNE (Requête n°...,"[6000, 41, 2675, 41, 3000, 1200]","[6000, 41, 2675, 41, 3000, 1200]",6;6-1,Violation de l'art. 6-1;Dommage matériel - dem...,4,6,
188,AFFAIRE ADAMOGIANNIS c. GRECE,001-64882,PREMIÈRE SECTION AFFAIRE ADAMOGIANNIS c. G...,AFFAIRE ADAMOGIANNIS c. GRÈCE (Requête n°...,AFFAIRE ADAMOGIANNIS c. GRÈCE (Requête n°...,[0],[0],6;6-1,Violation de l'art. 6-1,3,4,
193,AFFAIRE MIKULIĆ c. CROATIE,001-64592,PREMIÈRE SECTION AFFAIRE MIKULIĆ c. CROATI...,AFFAIRE MIKULIĆ c. CROATIE (Requête no 53...,AFFAIRE MIKULIĆ c. CROATIE (Requête no 53...,[0],[0],6;6-1;8;8-1;13;13+6-1,Violation de l'article 6 - Droit à un procès é...,1,4,
210,AFFAIRE ČONKA c. BELGIQUE,001-64585,TROISIÈME SECTION AFFAIRE ČONKA c. BELGIQU...,AFFAIRE ČONKA c. BELGIQUE (Requête no 515...,AFFAIRE ČONKA c. BELGIQUE (Requête no 515...,"[19850, 9000, 10000, 9000]","[19850, 9000, 10000, 9000]",5;5-2;5-4;13;13+P4-4;P4-4,Exception préliminaire rejetée (Art. 35) Condi...,1,6,
217,AFFAIRE LEONARDI c. ITALIE,001-64914,TROISIÈME SECTION AFFAIRE LEONARDI c. ITAL...,AFFAIRE LEONARDI c. ITALIE (Requête n° 54...,AFFAIRE LEONARDI c. ITALIE (Requête n° 54...,[500],[500],6;6-1,Violation de l'art. 6-1;Préjudice moral - répa...,4,6,
218,AFFAIRE PRETE c. ITALIE,001-64915,TROISIÈME SECTION AFFAIRE PRETE c. ITALIE ...,AFFAIRE PRETE c. ITALIE (Requête n° 54279...,AFFAIRE PRETE c. ITALIE (Requête n° 54279...,[2000],[2000],6;6-1,Violation de l'art. 6-1;Préjudice moral - répa...,4,6,
219,AFFAIRE ANDREOZZI c. ITALIE,001-64916,TROISIÈME SECTION AFFAIRE ANDREOZZI c. ITA...,AFFAIRE ANDREOZZI c. ITALIE (Requête n° 5...,AFFAIRE ANDREOZZI c. ITALIE (Requête n° 5...,[500],[500],6;6-1,Violation de l'art. 6-1;Préjudice moral - répa...,4,6,


In [27]:
# Set wildcards for selecting P1-1 cases
wildcards_p1 = ['1-1','P1','P1-1','P1-1-1','Protocol 1-1','Article 1','Article 1 al. 1 du Protocole','1 du Protocole n° 1','1-2']
wildcards_p2 = ['P2','2-1']
wildcards_p3 = ['P3']
wildcards_p4 = ['P4','4-2']
wildcards_p5 = ['P5','5-2','5-3','5-4','5-1','5-5']
wildcards_p6 = ['P6','6-1','6-3','6-2']
wildcards_p8 = ['P8','8-1']
wildcards_p13 = ['13-P4','13','13+3','13+7','13+5']

# Do filtering into various articles
results['article'] = ''
results['article'] = results.apply(lambda x: 'p1' if any(i in x['violation'] for i in wildcards_p1) else x['article'],axis=1)
results['article'] = results.apply(lambda x: 'p2' if any(i in x['violation'] for i in wildcards_p2) else x['article'],axis=1)
results['article'] = results.apply(lambda x: 'p4' if any(i in x['violation'] for i in wildcards_p4) else x['article'],axis=1)
results['article'] = results.apply(lambda x: 'p5' if any(i in x['violation'] for i in wildcards_p5) else x['article'],axis=1)
results['article'] = results.apply(lambda x: 'p6' if any(i in x['violation'] for i in wildcards_p6) else x['article'],axis=1)
results['article'] = results.apply(lambda x: 'p8' if any(i in x['violation'] for i in wildcards_p8) else x['article'],axis=1)
results['article'] = results.apply(lambda x: 'p13' if any(i in x['violation'] for i in wildcards_p13) else x['article'],axis=1)

In [45]:
# Define originator
results['originator'] = ''
results['defendant'] = ''

try:
    # Split to get originator and defendant
    # Source
    results['originator'] = results.apply(lambda x: x['case'].split(" v.")[0].split(" ")[-1] if len(x['case'].split(" v."))>1 else x['originator'],axis=1)
    results['originator'] = results.apply(lambda x: x['case'].split(" c.")[0].split(" ")[-1] if len(x['case'].split(" c."))>1 else x['originator'],axis=1)
    # Defendant
    results['defendant'] = results.apply(lambda x: x['case'].split(" v. ")[1].split(" ")[0] if len(x['case'].split(" v. "))>1 else x['defendant'],axis=1)
    results['defendant'] = results.apply(lambda x: x['case'].split(" c. ")[1].split(" ")[0] if len(x['case'].split(" c. "))>1 else x['defendant'],axis=1)

    # Title format the originator and defendant
    results['originator'] = results.apply(lambda x: x['originator'].title(),axis=1)
    results['defendant'] = results.apply(lambda x: x['defendant'].title(),axis=1)

    # Add the first name
    results['originator'] = results.apply(lambda x: x['content'].split(str(x['originator']))[0].split(" ")[-1] + " " + x['originator'] if len(x['originator'])>1 else x['originator'],axis=1)
    
    # Then look in text for prefix or suffix
    results['type'] = ''
    results['type'] = results.apply(lambda x: 'M' if 'Mr '+x['originator'] in x['content'] else x['type'],axis=1)
    results['type'] = results.apply(lambda x: 'F' if 'Mrs '+x['originator'] in x['content'] else x['type'],axis=1)

except:
    print("Error")

results.head(5)

Unnamed: 0,case,case_id,content,claimed,awarded,num_awarded,num_claimed,violation,conclusion,importance,originatingbody,case_date,article,originator,defendant,type
100,AFFAIRE IMMEUBLES GROUPE KOSSER c. FRANCE,001-64897,PREMIÈRE SECTION AFFAIRE IMMEUBLES GROUPE ...,AFFAIRE IMMEUBLES GROUPE KOSSER c. FRANCE ...,AFFAIRE IMMEUBLES GROUPE KOSSER c. FRANCE ...,"[26571387, 50000, 3000]","[26571387, 50000, 3000]",6;6-1,Non-violation de l'art. 6-1 en ce qui concerne...,3,4,,p6,Kosser,France,
101,AFFAIRE APBP c. FRANCE,001-64899,PREMIÈRE SECTION AFFAIRE APBP c. FRANCE ...,AFFAIRE APBP c. FRANCE (Requête n° 38436/...,AFFAIRE APBP c. FRANCE (Requête n° 38436/...,"[10200630, 15, 3000]","[10200630, 15, 3000]",6;6-1,Non-violation de l'art. 6-1 en ce qui concerne...,3,4,,p6,Rozakis\n\tGreffier\tPrésident Apbp,France,
110,CASE OF VISSER v. THE NETHERLANDS,001-60112,THIRD SECTION CASE OF VISSER v. THE N...,CASE OF VISSER v. THE NETHERLANDS (App...,CASE OF VISSER v. THE NETHERLANDS (App...,[0],[0],6;6-1,Violation of Art. 6-1 and 6-3-d;Pecuniary dama...,2,6,2021-05-29 00:00:00,p6,Visser,The,
146,AFFAIRE VOLKWEIN c. ALLEMAGNE,001-64964,TROISIÈME SECTION AFFAIRE VOLKWEIN c. ALLEM...,AFFAIRE VOLKWEIN c. ALLEMAGNE (Requête n°...,AFFAIRE VOLKWEIN c. ALLEMAGNE (Requête n°...,"[6000, 41, 2675, 41, 3000, 1200]","[6000, 41, 2675, 41, 3000, 1200]",6;6-1,Violation de l'art. 6-1;Dommage matériel - dem...,4,6,,p6,Volkwein,Allemagne,
188,AFFAIRE ADAMOGIANNIS c. GRECE,001-64882,PREMIÈRE SECTION AFFAIRE ADAMOGIANNIS c. G...,AFFAIRE ADAMOGIANNIS c. GRÈCE (Requête n°...,AFFAIRE ADAMOGIANNIS c. GRÈCE (Requête n°...,[0],[0],6;6-1,Violation de l'art. 6-1,3,4,,p6,Adamogiannis,Grece,


In [46]:
# Save as pickle again for easy recall
results.to_pickle("./results.pkl")