In [None]:
# Mount Google Drive so the corpus and the output files under drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U sentence-transformers > /dev/null

In [None]:
!ls

drive  sample_data


In [None]:
import json
import sklearn
import sklearn.model_selection
import pandas as pd

In [None]:
# Seed value for randomized steps; the train/test split further down uses 321 as well.
random_state = 321

# Parse data

In [None]:
def read_data(f):
    """Decode the corpus JSON from an open file object and return its records.

    While iterating, every record's clause ids are resolved to the text span
    ``record['text'][start:end]`` and each argument's premise/conclusion ids
    are mapped to those spans. That resolved structure is collected locally
    but is NOT returned; the function returns the decoded JSON unchanged.
    The resolution still acts as a consistency check: a premise or conclusion
    id without a matching clause raises ``KeyError``.

    Args:
        f: readable file-like object containing the JSON document.

    Returns:
        The object produced by ``json.load(f)``.
    """
    def _id_list(value):
        # A field may hold a single id or a list of ids; always work on a list.
        if type(value) == list:
            return list(value)
        return [value]

    data = json.load(f)
    resolved_per_record = []
    for record in data:
        text_name = record['name']  # fetched but not used further

        # clause id -> clause text sliced out of the record's full text
        span_by_id = {}
        for clause in record['clauses']:
            clause_id, begin, stop = clause['_id'], clause['start'], clause['end']
            span_by_id[clause_id] = record['text'][begin:stop]

        resolved = []
        for argument in record['arguments']:
            premise_ids = _id_list(argument['premises'])
            conclusion_ids = _id_list(argument['conclusion'])
            resolved.append({
                'premises': [span_by_id[pid] for pid in premise_ids],
                'conclusion': [span_by_id[cid] for cid in conclusion_ids],
            })
        resolved_per_record.append(resolved)

    return data

In [None]:
# Use a context manager so the corpus file is closed as soon as it has been parsed;
# the encoding is stated explicitly instead of relying on the platform default.
with open('drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/ECHR_Corpus.json', encoding='utf-8') as f:
    data = read_data(f)

In [None]:
data[0].keys()

dict_keys(['name', 'text', 'clauses', 'arguments'])

# Create DataFrame

In [None]:
def data_to_df(data):
  """Build a DataFrame with one row per argument premise.

  For every record in ``data`` and every premise id listed in its arguments,
  the clause with that ``_id`` is looked up and its text span
  ``entry['text'][start:end]`` becomes the ``argument`` column. Premise ids
  without a matching clause produce no row.

  Rows are collected in a list and the frame is built once:
  ``DataFrame.append`` no longer exists in pandas 2.x and copied the whole
  frame on every call.

  Args:
    data: iterable of records with ``name``, ``text``, ``clauses`` and
      ``arguments`` keys.

  Returns:
    DataFrame with columns ``name``, ``argument`` and ``clause_id``
    (empty, but with those columns, when there is nothing to emit).
  """
  columns = ['name', 'argument', 'clause_id']
  rows = []
  for entry in data:
    name = entry['name']
    # Index clauses by id once per record; setdefault keeps the first clause
    # for a repeated id, which is what a front-to-back scan would pick.
    clause_by_id = {}
    for clause in entry['clauses']:
      clause_by_id.setdefault(clause['_id'], clause)
    for arg in entry['arguments']:
      for p in arg['premises']:
        clause = clause_by_id.get(p)
        if clause is None:
          continue  # premise id with no matching clause: no row
        rows.append({
          'name': name,
          'argument': entry['text'][int(clause['start']):int(clause['end'])],
          'clause_id': clause['_id'],
        })
  return pd.DataFrame(rows, columns=columns)

In [None]:
# Flatten the corpus records into a table with one row per premise clause.
data_df = data_to_df(data)

In [None]:
data_df.shape

(1951, 3)

In [None]:
data_df.sort_values(by='argument')['argument'].iloc[2]

'   In the Court’s opinion, the balancing of the interests of the defence against arguments in favour of maintaining the anonymity of witnesses raises special problems if the witnesses in question are members of the police force of the State. '

In [None]:
# Cast all three columns to pandas' string dtype, then display the dtypes to confirm.
data_df = data_df.astype({'name': 'string','argument': 'string','clause_id': 'string'})
data_df.dtypes

name         string
argument     string
clause_id    string
dtype: object

In [None]:
# Save the argument table as a pickle, then load it back; the shape is printed
# before saving and displayed after loading as a round-trip check.
to_save_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/legal_dataset_args.pkl'
data_df.to_pickle(to_save_path)
print(data_df.shape)
df = pd.read_pickle(to_save_path)
df.shape

(1951, 3)


(1951, 3)

In [None]:
len(data_df['clause_id'].unique())

1951

In [None]:
df.dropna()

Unnamed: 0,name,argument,clause_id
0,00.txt,The Commission notes that the applicant was de...,5d4be09d3e582511aa1cda28
1,00.txt,He was released after the Court of Appeal revi...,5d4be0c33e582511aa1cda2b
2,00.txt,Article 5 para. 1 (Art. 5-1) of the Conventio...,5d4be0803e582511aa1cda22
3,00.txt,The Commission finds that the applicant was de...,5d4be0ff3e582511aa1cda37
4,00.txt,The Commission also finds no evidence in the c...,5d4be1143e582511aa1cda3a
...,...,...,...
2013,07.txt,There is little or no evidence for the Govern...,5f943a16bf181507836fa249
2014,07.txt,Nor is there a scrap of evidence for the asser...,5f943a31bf181507836fa24a
2017,07.txt,"In sum, I have not been convinced that the ap...",5f943ad3bf181507836fa24e
2020,07.txt,(see the Guzzardi v. Italy judgment of 6 Novem...,5f943c36bf181507836fa253


# Train/Test split

In [None]:
# Use the notebook-level seed instead of repeating the literal, so the split
# follows the configuration cell if the seed is ever changed.
train_df, test_df = sklearn.model_selection.train_test_split(
    data_df, test_size=0.2, random_state=random_state
)
# Every row must land in exactly one of the two partitions.
assert len(data_df) == len(train_df) + len(test_df)

In [None]:
train_df.head()

Unnamed: 0,name,argument,clause_id
737,17.txt,It recalls both its own findings and those of ...,5d4647d13e582511aa1ccc8d
1775,24.txt,I cannot see any justification for such differ...,5f916884bf181507836f9d03
78,01.txt,This is demonstrated in other cases pending be...,5d3ab9583e582511aa1cc1bf
1146,29.txt,Publication was also necessary to protect the ...,5d4a3b083e582511aa1cd365
1020,27.txt,The weight which the minority of the Commissio...,5d492edf3e582511aa1cd231


# Clustering
https://gitlab.lrz.de/lab-courses/nlp-lab-ss2022/team-list-2-tobias/kp_extraction/-/blob/master/notebooks/our_approaches/whitepapers/generate_kps_using_clustering.ipynb

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np 

# Sentence-embedding model, loaded once here and used by the clustering functions below.
embedder = SentenceTransformer('distiluse-base-multilingual-cased-v2')  

def agglomerative_clustering(corpus, n_clusters=10, distance_threshold=None):
    """Embed sentences and group them with agglomerative clustering.

    Each sentence is encoded with the module-level ``embedder``, the embedding
    rows are scaled to unit L2 norm, and ``AgglomerativeClustering`` is fitted
    with the estimator's default linkage and metric.

    Args:
        corpus: sequence of sentences to cluster.
        n_clusters: number of clusters to produce. Pass ``None`` together with
            a ``distance_threshold`` to let the threshold decide the count.
        distance_threshold: merge-distance cut-off. scikit-learn requires that
            exactly one of ``n_clusters`` and ``distance_threshold`` is
            ``None`` and raises ``ValueError`` otherwise.

    Returns:
        DataFrame with one row per cluster: ``cluster_id`` and ``sentences``
        (the list of sentences assigned to that cluster, in corpus order).
        Clusters appear in order of first occurrence in ``corpus``.
    """
    corpus_embeddings = embedder.encode(corpus)
    # Normalize the embeddings to unit length
    corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
    clustering_model = AgglomerativeClustering(n_clusters=n_clusters, distance_threshold=distance_threshold)
    cluster_assignment = clustering_model.fit_predict(corpus_embeddings)

    # Pair sentences with their labels by position rather than indexing the corpus.
    clustered_sentences = {}
    for sentence, cluster_id in zip(corpus, cluster_assignment):
        clustered_sentences.setdefault(cluster_id, []).append(sentence)

    df = pd.DataFrame({"cluster_id": clustered_sentences.keys(), "sentences": clustered_sentences.values()})
    return df

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

In [None]:
%%time
# Cluster the training arguments with the function's default cluster count.
clustered_df = agglomerative_clustering(train_df['argument'].tolist())

CPU times: user 2min 26s, sys: 1.06 s, total: 2min 27s
Wall time: 2min 30s


In [None]:
clustered_df

Unnamed: 0,cluster_id,sentences
0,2,[It recalls both its own findings and those of...
1,7,[I cannot see any justification for such diffe...
2,9,[The weight which the minority of the Commissi...
3,1,[Even if the decisions of the courts were base...
4,4,[He stated that there was a policy on the part...
5,0,[ It is noteworthy that all the members of the...
6,5,[Under the terms of Article 26 (Art. 26) of th...
7,3,"[ Concerning the first applicant, the Court re..."
8,6,"[(see, mutatis mutandis, the Belilos v. Switze..."
9,8,[The Commission recalls that in the case of\r\...


# Clustering with an unknown number of clusters

In [None]:
def agglomerative_clustering_unk(corpus):
    """Cluster sentences without fixing the number of clusters in advance.

    Sentences are encoded with the module-level ``embedder``, the embedding
    rows are scaled to unit L2 norm, and ``AgglomerativeClustering`` is run
    with ``n_clusters=None`` and ``distance_threshold=3.0``, so the threshold
    determines how many clusters come out.

    Args:
        corpus: indexable sequence of sentences.

    Returns:
        DataFrame with one row per cluster: ``cluster_id`` and ``sentences``
        (list of the sentences assigned to that cluster).
    """
    vectors = embedder.encode(corpus)
    # Divide every row by its own L2 norm.
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    vectors = vectors / row_norms

    model = AgglomerativeClustering(n_clusters=None, distance_threshold=3.0)
    model.fit_predict(vectors)

    # cluster label -> sentences carrying that label, in corpus order
    groups = {}
    for position, label in enumerate(model.labels_):
        groups.setdefault(label, []).append(corpus[position])

    return pd.DataFrame({"cluster_id": groups.keys(), "sentences": groups.values()})

In [None]:
%%time
# Cluster the training arguments with the threshold-based variant (cluster count not fixed).
clustered_unk_df = agglomerative_clustering_unk(train_df['argument'].tolist())

CPU times: user 2min 21s, sys: 1.23 s, total: 2min 22s
Wall time: 2min 21s


In [None]:
clustered_unk_df.shape

(25, 2)

# Save Data

In [None]:
# Expand each cluster's sentence list into one row per (cluster_id, sentence).
clustered_exploded_df=clustered_df.explode('sentences')

In [None]:
# Write the exploded fixed-count clustering to CSV on Drive.
to_save_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/10_clusters.csv'
clustered_exploded_df.to_csv(to_save_path)

In [None]:
# Write both partitions of the split to CSV; to_save_path is reassigned for the second file.
to_save_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/train.csv'
train_df.to_csv(to_save_path)
to_save_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/test.csv'
test_df.to_csv(to_save_path)

In [None]:
# Expand each cluster's sentence list into one row per (cluster_id, sentence).
clustered_exploded_unk_df=clustered_unk_df.explode('sentences')

In [None]:
# Write the exploded threshold-based clustering to CSV on Drive.
# NOTE: the "25" in the file name is hard-coded; it matches the 25 clusters shown
# by clustered_unk_df.shape in this run and will not follow a different result.
to_save_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/25_unk_clusters.csv'
clustered_exploded_unk_df.to_csv(to_save_path)