In [41]:
!nvidia-smi

Wed Jun 15 17:21:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    30W /  70W |   4458MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [42]:
# Mount Google Drive at /content/drive. The CSV paths used further down are
# relative ('drive/MyDrive/...'), so they resolve only while the working
# directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install transformers -q
!pip install sentencepiece -q
!pip install rouge_score -q

In [4]:
import pandas as pd
import random as rn
import numpy as np
import torch
import nltk
from collections import OrderedDict
from transformers import set_seed, PegasusTokenizer, PegasusForConditionalGeneration

In [5]:
# Seed every random number generator from a single constant so the values
# cannot drift apart when the seed is changed.
SEED = 321
set_seed(SEED)
rn.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Use the GPU when one is attached; later cells move the model and the
# tokenized inputs to this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fetch the NLTK 'punkt' data package.
nltk.download('punkt')
print(device)

cuda


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Load the tokenizer and the weights from one checkpoint name so the two can
# never point at different models.
MODEL_NAME = 'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME)

In [43]:
def summarize(args:list,delim:str=". ")->str:
  """Summarize several text fragments as a single generated sentence.

  The fragments are joined with ``delim`` into one paragraph, tokenized,
  passed through the notebook-level ``model`` and decoded back to text.

  Args:
    args: Text fragments (strings) to summarize together.
    delim: Separator placed between fragments when they are joined.

  Returns:
    The first (and only) decoded sequence produced by ``model.generate``.
  """
  paragraph = delim.join(args)
  # truncation=True clips the joined text to the tokenizer's configured maximum
  # length. Without it an over-long paragraph is handed to the model unclipped.
  tokenized_paragraph = tokenizer(paragraph, truncation=True, return_tensors="pt").to(device)
  tokenized_output = model.generate(**tokenized_paragraph)
  detokenized_output = tokenizer.batch_decode(tokenized_output, skip_special_tokens=True)
  return detokenized_output[0]

def get_topic(args:list, use_unique=False, max_count:int=5)->str:
  """Reduce a list of sentences to one topic string by repeated summarization.

  While more than ``max_count`` items remain, the list is cut into consecutive
  buckets of at most ``max_count`` items, each bucket is replaced by its
  summary, and the process repeats on the summaries. Once ``max_count`` or
  fewer items remain they are summarized together and that result is returned.
  The number of items is printed at the start of every round.

  Args:
    args: Sentences to condense.
    use_unique: When true, duplicates are dropped (first occurrence kept)
      from the input and from every intermediate round of summaries.
    max_count: Largest number of items summarized in one call.

  Returns:
    The final summary, or an empty string when there is nothing to summarize.

  Raises:
    ValueError: If ``max_count`` is below 2; buckets of one item would yield
      as many summaries as inputs, so the loop would never finish.
  """
  if max_count < 2:
    raise ValueError("max_count must be at least 2")

  def _dedupe(items):
    # OrderedDict keeps the first occurrence of each item in its original order.
    return list(OrderedDict.fromkeys(items)) if use_unique else items

  unique_args = _dedupe(args)
  if not unique_args:
    # Nothing to condense; avoid asking the model to summarize empty text.
    return ""

  while True:
    args_count = len(unique_args)
    print(args_count)
    if args_count <= max_count:
      return summarize(unique_args)
    # Slicing clamps at the end of the list, so the last bucket may be shorter.
    summaries = [
      summarize(unique_args[start:start+max_count])
      for start in range(0, args_count, max_count)
    ]
    unique_args = _dedupe(summaries)

# Read data

In [44]:
# Load the 10-cluster assignments and keep only the two columns used below.
to_read_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/10_clusters.csv'
clustered_df = pd.read_csv(to_read_path).loc[:, ['cluster_id','sentences']]
clustered_df.head()

Unnamed: 0,cluster_id,sentences
0,2,It recalls both its own findings and those of ...
1,2,This is demonstrated in other cases pending be...
2,2,Publication was also necessary to protect the ...
3,2,"In the present case,\r\nthere was a formidable..."
4,2,It was therefore not empowered to take evidenc...


# Group

In [45]:
# Group the rows by cluster id; the labeling loop pulls each cluster's
# sentences out of this object with get_group().
grouped_df = clustered_df.groupby('cluster_id')
# Per-cluster count of non-null sentences.
grouped_df.count()

Unnamed: 0_level_0,sentences
cluster_id,Unnamed: 1_level_1
0,325
1,278
2,198
3,113
4,244
5,36
6,96
7,79
8,72
9,119


# Label each cluster

In [46]:
# Move the model to the selected device. The trailing semicolon stops the
# notebook from echoing the returned module's repr.
model.to(device);




In [47]:
%%time

# topics[k] holds the generated label for cluster id k; the next cell looks
# labels up by cluster id, so the list must be filled in id order.
topics = []

# Iterate over however many clusters the grouping produced rather than a
# literal count. get_group raises KeyError if the ids are not 0..ngroups-1.
for cluster_id in range(grouped_df.ngroups):
  result = get_topic(grouped_df.get_group(cluster_id)['sentences'].tolist())
  topics.append(result)
  print(result)

325
65
13
3
A round-up of interesting court cases from the UK and around the world this week:
278
56
12
3
A complaint has been made to the European Court of Human Rights (ECHR) against the Archbishop of Canterbury, the Most Reverend Justin Welby, on the grounds that he has failed to fulfil his obligations under the European Convention on Human Rights.
198
40
8
2
Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair Aupair
113
23
5
The European Court of Human Rights (ECHR) has upheld the decision of the St Plten Regional Court to release on bail Ahmet Sadik, who had been arrested for drinking and driving.
244
49
10
2
The

In [48]:
# Attach each row's topic by looking its cluster id up in `topics`
# (position k of the list is the label for cluster k). Mapping by value does
# not depend on clustered_df having a default 0..n-1 index.
topic_by_cluster = pd.Series(topics)
clustered_topic_df = clustered_df.assign(topic=clustered_df['cluster_id'].map(topic_by_cluster))
# A cluster id with no entry in `topics` maps to NaN; fail loudly if that happens.
assert clustered_topic_df['topic'].notna().all()
clustered_topic_df = clustered_topic_df[['cluster_id','sentences','topic']]

In [49]:
clustered_topic_df.head()

Unnamed: 0,cluster_id,sentences,topic
0,2,It recalls both its own findings and those of ...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
1,2,This is demonstrated in other cases pending be...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
2,2,Publication was also necessary to protect the ...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
3,2,"In the present case,\r\nthere was a formidable...",Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
4,2,It was therefore not empowered to take evidenc...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...


### Save

In [50]:
# Persist the labelled rows. to_csv is called without index=False, so the row
# index is written as an unnamed first column; the reload cell under
# "Explore results" drops it again.
to_save_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/labeled_10_clusters_sz_5.csv'
clustered_topic_df.to_csv(to_save_path)

# Repeat for the `25_unk_clusters` dataset

In [15]:
# Load the 25-cluster assignments and keep only the two columns used below.
to_read_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/25_unk_clusters.csv'
clustered_unk_df = pd.read_csv(to_read_path).loc[:, ['cluster_id','sentences']]
clustered_unk_df.head()

Unnamed: 0,cluster_id,sentences
0,20,It recalls both its own findings and those of ...
1,20,This is demonstrated in other cases pending be...
2,20,"In these circumstances, the Commission finds t..."
3,20,The Commission finds that the suggested applic...
4,20,"In this context, the Commission had\r\nregard..."


In [16]:
# Group the rows by cluster id; the labeling loop pulls each cluster's
# sentences out of this object with get_group().
grouped_unk_df = clustered_unk_df.groupby('cluster_id')
# Per-cluster count of non-null sentences.
grouped_unk_df.count()

Unnamed: 0_level_0,sentences
cluster_id,Unnamed: 1_level_1
0,50
1,66
2,98
3,72
4,39
5,98
6,96
7,52
8,71
9,119


In [18]:
%%time

# NOTE(review): the output recorded for this cell shrinks 50 -> 13 -> 4, i.e.
# buckets of 4, and the result is saved as *_sz_4.csv, whereas get_topic in
# this notebook buckets by 5. The recorded run presumably used an edited
# get_topic, so a fresh Run All will not reproduce these labels.
# topics[k] holds the generated label for cluster id k; it overwrites the
# list built for the 10-cluster dataset.
topics = []

# Iterate over however many clusters the grouping produced rather than a
# literal count. get_group raises KeyError if the ids are not 0..ngroups-1.
for cluster_id in range(grouped_unk_df.ngroups):
  result = get_topic(grouped_unk_df.get_group(cluster_id)['sentences'].tolist())
  topics.append(result)
  print(result)

50
13
4
The decision by the Court of Appeal to uphold the decision of the High Court to uphold the reservation of a place of worship for Muslims on the M1 motorway is a mixed bag..
66
17
5
2
The Court of Justice of the European Union (CJEU) has upheld a decision by the Court of Justice of the European Union (ECHR) to dismiss an appeal against a decision by the European Court of Human Rights (ECHR) to uphold a ruling by the European Court of Human Rights (ECHR).
98
25
7
2
The European Court of Human Rights (ECHR) has ruled in favour of a Turkish man who was held in pre-trial detention for five years without trial on the grounds that he had received funds from the Turkish government...
72
18
5
2
The European Court of Justice (CJEU) has upheld the decision of the Austrian Supreme Court to reject an appeal by a Bulgarian child killer against a decision by the Austrian Parole Board to release him on parole.
39
10
3
Amy Winehouse's mother has won a court case against the Turkish authorities 

In [19]:
len(topics)

25

In [22]:
# Attach each row's topic by looking its cluster id up in `topics`
# (position k of the list is the label for cluster k). Mapping by value does
# not depend on clustered_unk_df having a default 0..n-1 index.
topic_by_cluster = pd.Series(topics)
clustered_topic_unk_df = clustered_unk_df.assign(topic=clustered_unk_df['cluster_id'].map(topic_by_cluster))
# A cluster id with no entry in `topics` maps to NaN; fail loudly if that happens.
assert clustered_topic_unk_df['topic'].notna().all()
clustered_topic_unk_df = clustered_topic_unk_df[['cluster_id','sentences','topic']]

In [23]:
clustered_topic_unk_df

Unnamed: 0,cluster_id,sentences,topic
0,20,It recalls both its own findings and those of ...,The European Court of Human Rights (ECHR) has ...
1,20,This is demonstrated in other cases pending be...,The European Court of Human Rights (ECHR) has ...
2,20,"In these circumstances, the Commission finds t...",The European Court of Human Rights (ECHR) has ...
3,20,The Commission finds that the suggested applic...,The European Court of Human Rights (ECHR) has ...
4,20,"In this context, the Commission had\r\nregard...",The European Court of Human Rights (ECHR) has ...
...,...,...,...
1555,19,"The Commission considers, in the light of the ...",The European Commission has taken cognizance o...
1556,19,These include the power to quash in all respec...,The European Commission has taken cognizance o...
1557,19,that a thorough\r\nexamination of this complai...,The European Commission has taken cognizance o...
1558,19,The Commission considers that the applicant's ...,The European Commission has taken cognizance o...


### Save

In [24]:
# Persist the labelled rows. to_csv is called without index=False, so the row
# index is written as an unnamed first column; the reload cell under
# "Explore results" drops it again.
# NOTE(review): the `sz_4` suffix presumably records the bucket size used for
# this run, while get_topic in this notebook buckets by 5 — TODO confirm.
to_save_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/labeled_25_unk_clusters_sz_4.csv'
clustered_topic_unk_df.to_csv(to_save_path)

# Explore results

In [57]:
# Reload the labelled 10-cluster file. The 'Unnamed: 0' column is the row
# index that to_csv wrote when the file was saved, so it is discarded.
to_read_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/labeled_10_clusters_sz_5.csv'
clustered_topic_df = pd.read_csv(to_read_path)
clustered_topic_df = clustered_topic_df.drop(columns=['Unnamed: 0'])
clustered_topic_df

Unnamed: 0,cluster_id,sentences,topic
0,2,It recalls both its own findings and those of ...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
1,2,This is demonstrated in other cases pending be...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
2,2,Publication was also necessary to protect the ...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
3,2,"In the present case,\r\nthere was a formidable...",Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
4,2,It was therefore not empowered to take evidenc...,Aupair Aupair Aupair Aupair Aupair Aupair Aupa...
...,...,...,...
1555,8,"However, the fact complained of relates to a p...",The European Court of Human Rights (ECHR) has ...
1556,8,The\r\napplicant refers here in particular to ...,The European Court of Human Rights (ECHR) has ...
1557,8,It is to be observed that a crucial issue in ...,The European Court of Human Rights (ECHR) has ...
1558,8,"In 1988, when considering certain amendments t...",The European Court of Human Rights (ECHR) has ...


In [58]:
# Reload the labelled 25-cluster file. The 'Unnamed: 0' column is the row
# index that to_csv wrote when the file was saved, so it is discarded.
to_read_path = 'drive/MyDrive/Colab Notebooks/nlp/legal dataset/data/labeled_25_unk_clusters_sz_4.csv'
clustered_topic_unk_df = pd.read_csv(to_read_path)
clustered_topic_unk_df = clustered_topic_unk_df.drop(columns=['Unnamed: 0'])
clustered_topic_unk_df

Unnamed: 0,cluster_id,sentences,topic
0,20,It recalls both its own findings and those of ...,The European Court of Human Rights (ECHR) has ...
1,20,This is demonstrated in other cases pending be...,The European Court of Human Rights (ECHR) has ...
2,20,"In these circumstances, the Commission finds t...",The European Court of Human Rights (ECHR) has ...
3,20,The Commission finds that the suggested applic...,The European Court of Human Rights (ECHR) has ...
4,20,"In this context, the Commission had\r\nregard...",The European Court of Human Rights (ECHR) has ...
...,...,...,...
1555,19,"The Commission considers, in the light of the ...",The European Commission has taken cognizance o...
1556,19,These include the power to quash in all respec...,The European Commission has taken cognizance o...
1557,19,that a thorough\r\nexamination of this complai...,The European Commission has taken cognizance o...
1558,19,The Commission considers that the applicant's ...,The European Commission has taken cognizance o...


In [59]:
clustered_topic_unk_df['cluster_id'].min()

0

In [60]:
clustered_topic_unk_df['cluster_id'].max()

24

In [61]:
clustered_topic_df['cluster_id'].min()

0

In [62]:
clustered_topic_df['cluster_id'].max()

9

In [71]:
def print_row(i, df=None):
  """Print the sentence text and the assigned topic of one row.

  Args:
    i: Integer position of the row (used with ``.iloc``).
    df: DataFrame with ``sentences`` and ``topic`` columns. When omitted, the
      notebook-level ``clustered_topic_unk_df`` is used, so existing
      ``print_row(i)`` calls behave as before.
  """
  if df is None:
    df = clustered_topic_unk_df
  entry = df.iloc[i]
  print('   Sentences')
  print(entry['sentences'])
  print('   Topic')
  print(entry['topic'])

In [75]:
print_row(12)

   Sentences
In this
respect, they refer to the numerous other matters related to the
proceedings at issue, and the proceedings before the Vienna Juvenile
Court.
   Topic
The European Court of Human Rights (ECHR) has ruled in a case brought by a woman that her husband's complaints against the police were not raised at least in substance before the domestic courts.


In [74]:
clustered_topic_unk_df

Unnamed: 0,cluster_id,sentences,topic
0,20,It recalls both its own findings and those of ...,The European Court of Human Rights (ECHR) has ...
1,20,This is demonstrated in other cases pending be...,The European Court of Human Rights (ECHR) has ...
2,20,"In these circumstances, the Commission finds t...",The European Court of Human Rights (ECHR) has ...
3,20,The Commission finds that the suggested applic...,The European Court of Human Rights (ECHR) has ...
4,20,"In this context, the Commission had\r\nregard...",The European Court of Human Rights (ECHR) has ...
...,...,...,...
1555,19,"The Commission considers, in the light of the ...",The European Commission has taken cognizance o...
1556,19,These include the power to quash in all respec...,The European Commission has taken cognizance o...
1557,19,that a thorough\r\nexamination of this complai...,The European Commission has taken cognizance o...
1558,19,The Commission considers that the applicant's ...,The European Commission has taken cognizance o...
