In [None]:
# Mount Google Drive at /content/gdrive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# if using a cloud service like colab
!pip install sklearn
!pip install spacy scipy fast_pagerank
!pip install nltk
!pip install sentence_transformers

In [None]:
import json
import pandas as pd
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity


from scipy import sparse
from fast_pagerank import pagerank
from fast_pagerank import pagerank_power

from itertools import product


pd.set_option('display.max_colwidth', None)

In [None]:
# Navigate to your root directory in my case kp_nlp
cd /content/gdrive/MyDrive/collab/kp_nlp


In [None]:
# Load the ECHR arguments CSV (path is relative to the current working directory).
import pandas as pd
legal_df = pd.read_csv(r'datasets/legal_dataset/data/echr_arguments.csv', index_col=None)


In [None]:
# Preview the first rows of the loaded data.
legal_df.head()

In [None]:
import nltk

# Sentence-tokenizer data used by nltk.tokenize.sent_tokenize. Recent NLTK
# releases look for "punkt_tab" instead of "punkt", so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Remove rows that have no argument text. Calling dropna(inplace=True) on the
# selected column only touches that temporary Series and leaves legal_df
# unchanged, so the filtered frame is assigned back explicitly.
legal_df = legal_df.dropna(subset=["argument"])


In [None]:
# Drop every row that has a missing value in any column.
legal_df=legal_df.dropna()

In [None]:
# Preview the data after the incomplete rows were dropped.
legal_df.head()

In [None]:
# Sanity check: count the missing "argument" values that are still present.
legal_df["argument"].isna().sum()

In [None]:
# Split every argument into a list of sentences. Mapping over the single
# column that is needed avoids the slower row-wise DataFrame.apply(axis=1).
legal_df["sents"] = legal_df["argument"].map(nltk.tokenize.sent_tokenize)

In [None]:
# Inspect the first 10 rows, now including the "sents" column.
legal_df.head(10)

In [None]:
# List the column names of the data frame.
legal_df.columns

In [None]:
# Load the sentence encoder; it is passed as `model` to the similarity helpers
# defined below, which call its encode() method.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("nlpaueb/bert-base-uncased-echr")

**PageRank**: generating the top-X potential key points (kp) on the full data

Note: the sections used here are purely positional (where the text sits in the document) and do not come from a clustering, so a matching step might be required.

In [None]:
def gen_match_matrix(model, sents, min_match_score=0):
    """Build the pairwise cosine-similarity matrix of a collection of sentences.

    Args:
        model: Encoder exposing ``encode(list_of_sentences)``; its result is
            passed straight to ``cosine_similarity``.
        sents: Iterable of sentences; it is materialised into a list first.
        min_match_score: Similarities strictly below this value are set to 0.

    Returns:
        The square matrix produced by ``cosine_similarity`` (row/column ``i``
        belongs to the ``i``-th sentence) with all sub-threshold entries zeroed.
    """
    embeddings = model.encode(list(sents))
    similarities = cosine_similarity(embeddings, embeddings)
    # Boolean-mask assignment zeroes the weak matches in place.
    similarities[similarities < min_match_score] = 0
    return similarities

In [None]:
def filter_ranked_list(ranked_sents, model, min_match=0.8, N=3):
    """Greedily pick up to ``N`` mutually dissimilar sentences from a ranking.

    Args:
        ranked_sents: Sequence of entries whose first element is the sentence
            (e.g. ``(sentence, score)`` pairs), best candidate first.
        model: Encoder forwarded to ``gen_match_matrix``.
        min_match: A candidate is kept only if its highest similarity to the
            sentences kept so far is strictly below this value.
        N: Maximum number of sentences to return.

    Returns:
        The kept sentences, in ranking order.
    """
    candidates = [entry[0] for entry in ranked_sents]
    kept = []
    for candidate in candidates:
        if len(kept) >= N:
            break
        if kept:
            # Row 0 belongs to the candidate; columns 1.. are the kept sentences.
            similarities = gen_match_matrix(model, [candidate] + kept)
            closest = np.max(similarities[0][1:])
            if not (closest < min_match):
                continue
        kept.append(candidate)
    return kept

In [None]:
def apply_page_rank(sentences, p=0.85, min_match_score=0.5, min_len=5, max_len=35, encoder=None):
    """Rank sentences with PageRank over their thresholded similarity graph.

    Args:
        sentences: Iterable of sentence strings.
        p: Forwarded to ``pagerank`` as its ``p`` argument.
        min_match_score: Similarities below this value are zeroed before ranking
            (see ``gen_match_matrix``).
        min_len: Only sentences with strictly more whitespace-separated words
            than this are considered.
        max_len: Only sentences with strictly fewer whitespace-separated words
            than this are considered.
        encoder: Sentence encoder handed to ``gen_match_matrix``. Defaults to
            the notebook-level ``model`` so existing calls keep working.

    Returns:
        A list of ``(sentence, score)`` tuples sorted by descending score, or an
        empty list when no sentence passes the length filter.
    """
    if encoder is None:
        # Fall back to the global model, like the original implementation did.
        encoder = model

    cand_sents = []
    for sentence in sentences:
        n_words = len(sentence.split())
        if min_len < n_words < max_len:
            cand_sents.append(sentence)
    if not cand_sents:
        return []

    cands_matching_mat = gen_match_matrix(encoder, cand_sents, min_match_score=min_match_score)
    # Author's note: it looks like modifying the initial probability doesn't help.
    scores = pagerank(cands_matching_mat, p=p)
    return sorted(zip(cand_sents, scores), key=lambda pair: -pair[1])

In [None]:
sections = []
ranked_sents_per_section = []
ranked_kps_per_section = []

# For every row: rank its sentences with PageRank, then keep at most two
# sentences that are not too similar to each other as key points.
for section_name, section_sents in zip(legal_df["name"], legal_df["sents"]):
    ranked_sents = apply_page_rank(section_sents, p=0.2, min_match_score=0.5)
    ranked_kps = filter_ranked_list(ranked_sents, model, min_match=0.8, N=2)
    sections.append(section_name)
    ranked_sents_per_section.append(ranked_sents)
    ranked_kps_per_section.append(ranked_kps)

# Column order matters: later cells address these columns by position.
results_df = pd.DataFrame({
    "ranked_sents": ranked_sents_per_section,
    "section": sections,
    "ranked_kps": ranked_kps_per_section,
})


In [None]:
# Preview the ranking results of the first 10 rows.
results_df.head(10)

In [None]:
# Work on a copy of the results without rows that contain missing values.
res_cleaned= results_df.dropna()

In [None]:
# Keep only the rows for which at least one sentence was ranked.
res_cleaned = res_cleaned[res_cleaned['ranked_sents'].map(len) > 0]

In [None]:
# Preview the cleaned results.
res_cleaned.head(10)

In [None]:
# Number of key points of the fourth remaining row (column 2 is "ranked_kps").
len(res_cleaned.iloc[3,2])

Explode the list columns into one row per element and then cast them to strings. This is needed so that the grouping by section works correctly.



In [None]:
# One row per element of the "ranked_sents" list, i.e. per (sentence, score) pair.
res_cleaned=res_cleaned.explode("ranked_sents")

In [None]:
# NOTE(review): "ranked_sents" is exploded a second time here. explode() also
# unpacks tuples, so this presumably splits every (sentence, score) pair into
# two separate rows and doubles the row count -- looks like an accidentally
# repeated cell; confirm whether it is intended.
res_cleaned=res_cleaned.explode("ranked_sents")

In [None]:
# One row per element of the "ranked_kps" list.
res_cleaned=res_cleaned.explode("ranked_kps")

In [None]:
# Preview the exploded frame.
res_cleaned.head()

In [None]:
# Value of column 2 ("ranked_kps") in the first row.
res_cleaned.iloc[0,2]

In [None]:
# Show the rows with distinct key points. The result is only displayed, not
# assigned, so res_cleaned itself keeps its duplicates.
res_cleaned.drop_duplicates(subset=['ranked_kps'])

In [None]:
# Number of distinct sections.
len(res_cleaned['section'].unique())

In [None]:
# Cast the three columns to str so their values can be joined as text.
res_cleaned[["ranked_sents", "section","ranked_kps"]] = res_cleaned[["ranked_sents", "section","ranked_kps"]].astype(str) 

We join all key point candidates belonging to the same section with the separator "; ", a token assumed not to be present in the initial set, so that the joined string can be split again at a later point.

In [None]:
# One row per section: all of its "ranked_kps" strings concatenated with "; ".
test_df=res_cleaned.groupby('section')['ranked_kps'].apply('; '.join).reset_index()

In [None]:
# Preview the concatenated key points per section.
test_df.head(10) 

In [None]:
# Display the complete per-row rankings (one list of (sentence, score) pairs per row).
ranked_sents_per_section

In [None]:
# Flatten the per-row sentence lists into one list covering the whole corpus.
all_text = [sentence for row_sents in legal_df["sents"] for sentence in row_sents]

In [None]:
# Print every collected sentence (the whole list, not a sample).
print(all_text)

In [None]:
# Rank the sentences of the whole corpus at once and keep up to 20 mutually
# dissimilar ones as key point candidates.
ranked_sents = apply_page_rank(all_text, p=0.2, min_match_score=0.5)
ranked_kps = filter_ranked_list(ranked_sents, model, min_match=0.8, N=20)

In [None]:
# Key point candidates selected from the ranking over all text.
ranked_kps

In [None]:
# Wrap the selected candidates in a single-column data frame for export.
ds = pd.DataFrame({"Kp candidates":ranked_kps})

In [None]:
# Write the candidates to a CSV file in the current working directory.
ds.to_csv('res_onall_20top.csv')

**Example for just one instance**


In [None]:
#apply page rank on section

print("section:\n",test_df.iloc[1,0])
# The key points were joined with '; ', so splitting on ';' leaves a leading
# space on every piece but the first. Strip the pieces before de-duplicating,
# otherwise "kp" and " kp" survive as two different entries.
# NOTE(review): a key point that itself contains ';' is still split apart here.
list_df = [piece.strip() for piece in test_df.iloc[1,1].split(';')]
# dict.fromkeys removes duplicates while keeping the first-seen order.
list_df = list(dict.fromkeys(piece for piece in list_df if piece))
print("text:\n",list_df)

In [None]:
# Both numbers are equal when list_df contains no duplicates.
print(len(set(list_df)))
print(len(list_df))

In [None]:
# PageRank on the candidates of the single example section held in list_df.
# At most 10 key points are kept. TODO (author): derive the wished number of
# key points from the length of the section instead of hard-coding it.
ranked_sents_section1 = apply_page_rank(list_df, p=0.2, min_match_score=0.8)
ranked_kps_section1 = filter_ranked_list(ranked_sents_section1, model, min_match=0.8, N=10) # decrease min_match to make the kept key points more distinct

In [None]:
# Key points selected for the example section.
ranked_kps_section1

**PageRank on all sections. Here I am trying to generate as many kp candidates as possible; for the moment we fix 10 kp per section**

In [None]:
# Joined key point string of the first section.
test_df.iloc[0,1]

In [None]:
# Map "section <name>" -> list of unique key point candidates of that section.
section_dict_kp = {}

for section_name, joined_kps in zip(test_df["section"], test_df["ranked_kps"]):
    # The candidates were joined with '; '; strip the pieces so that "kp" and
    # " kp" are recognised as the same candidate.
    # NOTE(review): a candidate that itself contains ';' is still split apart.
    pieces = [piece.strip() for piece in joined_kps.split(';')]
    # dict.fromkeys de-duplicates and keeps a deterministic first-seen order
    # (a set would reorder the candidates differently from run to run).
    unique_kps = list(dict.fromkeys(piece for piece in pieces if piece))
    # [:-4] drops the last four characters of the section name -- presumably a
    # file extension such as ".txt"; verify against the data.
    section_dict_kp["section " + section_name[:-4]] = unique_kps
print(section_dict_kp)

In [None]:
# List the generated section keys.
print(section_dict_kp.keys())

In [None]:
# Print the number of candidates of every section, in dict order.
for i in section_dict_kp.keys():
  print(len(section_dict_kp[i]))

In [None]:
# Candidates stored under the key 'section 00'.
section_dict_kp['section 00']

In [None]:
import matplotlib.pyplot as plt

# Turn the "section NN" keys into integer ids and count the candidates of each
# section. Both lists follow the insertion order of section_dict_kp, so entry k
# of one list belongs to entry k of the other.
sec_index = [int(key.split(' ')[1]) for key in section_dict_kp]
len_text = [len(candidates) for candidates in section_dict_kp.values()]

print(sec_index)
print(len_text)

# Total number of candidates over all sections.
s = sum(len_text)
print(s)

# x-extent of the horizontal guide lines.
x_line = [min(sec_index), max(sec_index)]

# Single panel: the original figure reserved a second subplot that was never
# drawn on, which only produced an empty frame.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(x=sec_index, y=len_text)
# Guide lines at 120, 80 and 40 candidates.
ax.plot(x_line, [120, 120], color='orange')
ax.plot(x_line, [80, 80], color='red')
ax.plot(x_line, [40, 40], color='green')
ax.set_xlabel('section')
ax.set_ylabel('length of section')
ax.set_title('Number of kp candidates per section');


In [None]:
# Number of key points to extract for a section, keyed by size bucket
# (size = number of unique candidates of the section).
dict_grenzwert= {
    "<40":3,
    "40_80":6,
    "80_120":9,
    ">120":12
}

# (exclusive upper bound on the section size, dict_grenzwert key, min_match).
# The smallest bucket uses a lower min_match; per the author this was decreased
# to ensure a higher contextual coverage.
_size_buckets = [
    (40, "<40", 0.6),
    (80, "40_80", 0.8),
    (120, "80_120", 0.8),
    (float("inf"), ">120", 0.8),
]

section_candidate={}
for section_key, candidates in section_dict_kp.items():
    n_candidates = len(candidates)
    # First bucket whose upper bound exceeds the size; the last one always matches.
    bucket_key, bucket_min_match = next(
        (key, min_match) for bound, key, min_match in _size_buckets if n_candidates < bound
    )
    ranked = apply_page_rank(candidates, p=0.2, min_match_score=0.5)
    section_candidate[section_key + 'kp'] = filter_ranked_list(
        ranked, model, min_match=bucket_min_match, N=dict_grenzwert[bucket_key]
    )




In [None]:
# Selected key points per section.
section_candidate

In [None]:
# Save the per-section key points as JSON in the current working directory.
import json
with open("result_kp.json", "w") as f:
    json.dump(section_candidate, f)

**Clustering**

Using the clustering results to determine the number of kp per section
 

In [None]:
# Load the CSV that, going by its file name, maps topic names to cluster amounts.
df_cluster_amount= pd.read_csv("bettopic_name_to_cluster_amount_dictionary.csv")

In [None]:
# Preview the loaded cluster table.
df_cluster_amount.head()

In [None]:
# Values later used as the number of key points per section.
# NOTE(review): they are read from the "cluster_id" column, whose name suggests
# an id rather than a count -- confirm it really holds the cluster amount.
N_kps=df_cluster_amount["cluster_id"].values

In [None]:
# Same ranking as before, but the number of key points of each section comes
# from N_kps. min_match is 0.6; per the author it was decreased to ensure a
# higher contextual coverage.
# NOTE(review): sections and N_kps are paired purely by position and zip stops
# at the shorter input -- confirm both have the same length and order.
section_candidate_cluster = {}
for (section_key, candidates), n_kps in zip(section_dict_kp.items(), N_kps):
    ranked = apply_page_rank(candidates, p=0.2, min_match_score=0.5)
    section_candidate_cluster[section_key + 'kp'] = filter_ranked_list(
        ranked, model, min_match=0.6, N=n_kps
    )
    

In [None]:
# Selected key points per section for the cluster-based setting.
section_candidate_cluster

In [None]:
# Save the cluster-based key points as JSON in the current working directory.
import json
with open("result_kp_cluser_legalbert.json", "w") as s:
    json.dump(section_candidate_cluster, s)

In [None]:
# One row per section: its key and the list of key points selected for it.
df_kp = pd.DataFrame({
    "Sections": list(section_candidate_cluster.keys()),
    "kp_cand": list(section_candidate_cluster.values()),
})

In [None]:
# Write the candidates to CSV first and end the cell with head(): only the last
# expression of a cell is rendered, so the preview was previously discarded.
df_kp.to_csv("res_kp_cand_cluster.csv")
df_kp.head(42)