In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the MiniLM sentence-embedding model from the local "models/" folder.
# NOTE(review): get_similarity() below never refers to SIM_MODEL by name; by
# default it loads a model from its own `model_path` argument — confirm whether
# this instance is meant to be shared.
SIM_MODEL = SentenceTransformer("models/all-MiniLM-L6-v2")

In [None]:
def process_str(s):
    """Remove every literal ``<s>`` marker from ``s`` and trim surrounding whitespace."""
    return re.sub('<s>', '', s).strip()

def process_ts_section(section):
    """Return the cleaned leading part of a raw term-sheet section label.

    The label is cut at the first double space and the first piece is passed
    through ``process_str``. If anything raises (for example ``section`` is
    not a string), the offending value and the error are printed and ``''``
    is returned instead.
    """
    try:
        # re.split always yields at least one piece, so unpacking is safe.
        head, *_ = re.split('  ', section)
        return process_str(head)
    except Exception as e:
        print(section, e)
        return ''

In [None]:
def get_similarity(
    ts_string_list,
    fa_string_list,
    map_type,
    sim_threshold=0.6,
    model_path='models/all-MiniLM-L6-v2',
    pretrained=None,
    top_N=5
):
    """Match each TS string to its most similar FA strings by embedding cosine similarity.

    Parameters
    ----------
    ts_string_list : list
        Query strings; each one becomes a key of the result when it has matches.
    fa_string_list : list
        Candidate strings to match against.
    map_type : str
        Label copied verbatim into every match record (``'map_type'``).
    sim_threshold : float
        Minimum cosine similarity (inclusive) for a candidate to be kept.
    model_path : str
        Model name/path loaded with ``SentenceTransformer`` when ``pretrained``
        is not given.
    pretrained : str or object, optional
        Either a model name/path to load instead of ``model_path``, or an
        already-loaded model exposing ``encode(list, convert_to_tensor=True)``.
        Passing a loaded model avoids re-loading the weights on every call.
    top_N : int
        Maximum number of distinct matched terms returned per query string.

    Returns
    -------
    dict
        ``{ts_string: [{'similar_term', 'score', 'map_type'}, ...]}`` with the
        matches sorted by descending score (scores rounded to 2 decimals).
        Query strings without any match at or above the threshold are omitted.
    """
    # Nothing to compare: skip model loading and encoding altogether.
    if len(ts_string_list) == 0 or len(fa_string_list) == 0:
        return {}

    import torch

    if pretrained is None or isinstance(pretrained, str):
        # Imported lazily so a caller that supplies a loaded model does not
        # need sentence_transformers to be importable here.
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(pretrained if pretrained is not None else model_path)
    else:
        model = pretrained

    embeddings1 = model.encode(ts_string_list, convert_to_tensor=True)
    embeddings2 = model.encode(fa_string_list, convert_to_tensor=True)

    # Pairwise cosine similarity: L2-normalise the rows, then take dot products.
    normed1 = torch.nn.functional.normalize(embeddings1, p=2, dim=1)
    normed2 = torch.nn.functional.normalize(embeddings2, p=2, dim=1)
    cosine_scores = normed1 @ normed2.T

    similar_pairs = {}
    for i, ts_string in enumerate(ts_string_list):
        row = cosine_scores[i]
        # Positions are taken from the comparison mask itself, so candidates
        # with identical scores keep their own index (a value lookup such as
        # list.index would collapse ties onto the first occurrence).
        keep_idx = torch.nonzero(row >= sim_threshold, as_tuple=True)[0].tolist()
        row_scores = row.tolist()
        ranked = sorted(
            ((j, row_scores[j]) for j in keep_idx),
            key=lambda x: x[1],
            reverse=True
        )

        # De-duplicate by matched string first, then truncate, so repeated
        # candidate strings do not use up top_N slots.
        seen = set()
        unique_ranked = []
        for idx, sims in ranked:
            string = fa_string_list[idx]
            if string in seen:
                continue
            seen.add(string)
            unique_ranked.append((string, sims))

        map_results = [
            {
                'similar_term': string,
                'score': round(sims, 2),
                'map_type': map_type
            }
            for string, sims in unique_ranked[:top_N]
        ]
        if map_results:
            similar_pairs[ts_string] = map_results
    return similar_pairs

# Read TS & FA

In [None]:
# Input documents for this run: the annotated term sheet (TS) docparse CSV and
# the facility agreement (FA) docparse CSV. Both paths are relative to the
# notebook's working directory.
ts_file = "data/docparse_csv_annotated/annotated/1_GL_SYN_TS_mkd_20221215_docparse.csv"
# ts_file = "data/join_label/13_BL_SYN_TS_mkd_20220713_docparse.csv"
fa_file = "data/docparse_csv/FA/1_GL_SYN_FA_mkd_20221215_docparse.csv"

In [None]:
# FA file
df_fa = pd.read_csv(fa_file)

# parties: rows strictly between the opening line of the agreement and the
# "AGREED as follows" line are tagged as the PARTIES section.
isPartiesStart = df_fa.text.str.contains('^THIS AGREEMENT is|is made on|^PARTIES|Between:*', na=False, case=False)
isPartiesEnd = df_fa.text.str.contains('AGREED* as follows|AGREED* that', na=False, case=False)
if isPartiesStart.any() and isPartiesEnd.any():
    # First hit of each marker; +1 / -1 leave the marker rows themselves out.
    partiesBeginID = df_fa.loc[isPartiesStart, 'index'].values[0] + 1
    partiesEndID = df_fa.loc[isPartiesEnd, 'index'].values[0] - 1
    # The bounds must be the row ids computed above, not the boolean masks.
    parties_clause_id = df_fa['index'].between(partiesBeginID, partiesEndID)
else:
    # Without both markers there is no parties block to tag.
    print('Check: parties start/end marker not found in FA')
    partiesBeginID = partiesEndID = None
    parties_clause_id = pd.Series(False, index=df_fa.index)
df_fa.loc[parties_clause_id, 'section'] = 'PARTIES'
df_fa.loc[parties_clause_id, 'section_id'] = '0'

# section_id may hold the string "0" (set above) or a numeric 0 read from the CSV.
df_parties = df_fa[(df_fa.section_id == "0") | (df_fa.section_id == 0)] # cols: definition + text

# definition: rows of the "Definitions" sub-section that carry a defined term
df_def = df_fa[
    (df_fa.section == "DEFINITIONS AND INTERPRETATION") & (df_fa.sub_section == "Definitions")
] # cols: definition + text
df_def = df_def[df_def.definition.notna()]

# exclude parties & definition clause
df_others = df_fa[
    (df_fa.section != "DEFINITIONS AND INTERPRETATION") & (df_fa.section != "PARTIES")
]
# schedule: remaining rows that belong to a schedule
df_schedule = df_others.loc[df_others.schedule.notnull()]
# main clause: remaining rows outside any schedule
df_clause = df_others.loc[df_others.schedule.isnull()]
#df_clause = df_clause[(df_clause.text_element != "section") & (df_clause.text_element != "sub_section")]


# TS file
df_ts = pd.read_csv(ts_file)
# Cleaned section label used as the matching key throughout the notebook.
df_ts["processed_section"] = df_ts["section"].apply(process_ts_section)

In [None]:
# Number of FA schedule rows per `part` value.
df_schedule.part.value_counts()

In [None]:
# Number of rows in the TS text column.
len(df_ts.text)

In [None]:
# Number of distinct values in the TS text column.
len(set(df_ts.text))

In [None]:
# Number of distinct cleaned TS section labels (the column created when the
# TS file is loaded is `processed_section`).
len(set(df_ts.processed_section))

In [None]:
# Distinct cleaned TS section labels.
set(df_ts.processed_section)

In [None]:
# Number of TS rows whose text_element is 'section'.
len(
    df_ts[df_ts.text_element=='section']
)

In [None]:
# Row counts of the FA and TS frames.
print(len(df_fa), len(df_ts))

# Term Matching

In [None]:
# TS term section vs. FA definition

top_N = 5

# Distinct, non-empty labels in first-seen order, so the candidate order (and
# therefore tie-breaking between equal scores) is the same on every run.
ts_section_list = [
    s for s in df_ts.processed_section.drop_duplicates()
    if isinstance(s, str) and s
]
def_string_list = [
    s for s in df_def.definition.drop_duplicates()
    if isinstance(s, str) and s
]


pairs_def = dict()
if def_string_list:
    pairs_def = get_similarity(
        ts_section_list,
        def_string_list,
        "sec_to_def",
        sim_threshold=0.9,
        top_N=top_N
    )
else:
    # Report against the FA file being processed.
    print(fa_file, 'Check: No definition')

In [None]:
# Number of TS section labels that received at least one definition match.
len(pairs_def)

In [None]:
# check the final results & compare with the raw sections
# NOTE(review): this reads a saved results file for document "2_GF_...", while
# ts_file above points at "1_GL_..." — confirm the intended pairing.

df_results = pd.read_csv('data/term_matching_csv/20230718/2_GF_SYN_TS_mkd_20221111_docparse_results.csv')
terms_in_results = set(df_results.TS_term)
# Print every TS section label that has no row in the saved results.
for item in ts_section_list:
    if item in terms_in_results:
        continue
    print(item)

In [None]:
# TS term section vs. FA parties
# Keep only the distinct, non-empty string definitions (NaN and other
# non-string values are dropped).
parties_string_list = [
    s for s in list(set(df_parties.definition))
    if isinstance(s, str) and s
]

if not parties_string_list:
    pairs_parties = dict()
    print('Check: No parties')
else:
    pairs_parties = get_similarity(
        ts_section_list,
        parties_string_list,
        "sec_to_parties",
        sim_threshold=0.5
    )

In [None]:
# Number of TS section labels that received at least one parties match.
len(pairs_parties)

In [None]:
# TS term v.s. FA sub section
# Candidate list: distinct truthy sub-section titles of the main clauses.
sub_section_rows = df_clause[df_clause.text_element == "sub_section"]
sub_sec_list = [title for title in set(sub_section_rows.sub_section) if title]
# Threshold 0 keeps every non-negative similarity; ranking does the filtering.
pairs_sec_to_sub_sec = get_similarity(
    ts_section_list, sub_sec_list, "sec_to_sub_sec", sim_threshold=0
)

In [None]:
# Number of TS section labels that received at least one sub-section match.
len(pairs_sec_to_sub_sec)

In [None]:
# Full mapping: TS section label -> ranked FA sub-section matches.
pairs_sec_to_sub_sec

In [None]:
# TS term + text vs. FA clause
# Candidate list: distinct truthy section titles of the main clauses.
clause_section_rows = df_clause[df_clause.text_element == "section"]
clause_section_list = [title for title in set(clause_section_rows.section) if title]
# TS section v.s. FA clause section -> select potential FA clause section
pairs_clause_section = get_similarity(
    ts_section_list, clause_section_list, "clause_section", sim_threshold=0.3
)


In [None]:
# Number of TS section labels with at least one candidate FA clause section.
len(pairs_clause_section)

In [None]:
# for k, v in pairs_clause_section.items():
#     print(k, len(v))

In [None]:
def _unique_strings(values):
    """Distinct non-empty str items of ``values`` in first-seen order (NaN / non-str dropped)."""
    return list(dict.fromkeys(v for v in values if isinstance(v, str) and v))


total_pairs_clause = []
# pairs_sec_to_sub_sec = dict()
# check under the section candidates
for k, v in pairs_clause_section.items():
    df_ts_sub = df_ts[df_ts.processed_section == k]
    # TS body texts of this section (section header rows excluded).
    ts_text_list = _unique_strings(df_ts_sub[df_ts_sub.text_element != 'section'].text)

    # Deliberately a local name: assigning to `ts_section_list` here would
    # replace the notebook-level list of all TS sections that later cells use.
    section_terms = _unique_strings(df_ts_sub.processed_section)

    # Restrict the FA side to the clause sections shortlisted for this TS section.
    candidates = [item['similar_term'] for item in v]
    df_clause_sub = df_clause[df_clause.section.isin(candidates)]
    print(k, len(df_ts_sub), len(ts_text_list), len(df_clause_sub))

    sub_section_list = _unique_strings(
        df_clause_sub[df_clause_sub.text_element == "sub_section"].sub_section
    )

    # Clause body texts only (no section / sub-section header rows).
    clause_string_list = _unique_strings(
        df_clause_sub[
            (df_clause_sub.text_element != "section") & (df_clause_sub.text_element != "sub_section")
        ].text
    )

    pairs_sub_section = dict()
    pairs_sec_to_sub_sec_partial = dict()
    if sub_section_list:
        if ts_text_list:
            pairs_sub_section = get_similarity(
                ts_text_list,
                sub_section_list,
                "text_to_sub_sec",
                sim_threshold=0
            )
        else:
            print('no text in ', k)
        if section_terms:
            pairs_sec_to_sub_sec_partial = get_similarity(
                section_terms,
                sub_section_list,
                "sec_to_sub_sec",
                sim_threshold=0
            )

    else:
        print('no sub sectin in ', k)
    # Section-level matches restricted to the shortlisted clauses replace the
    # earlier whole-document matches for this TS section.
    pairs_sec_to_sub_sec.update(pairs_sec_to_sub_sec_partial)
    pairs_clause = dict()
    if ts_text_list and clause_string_list:
        pairs_clause = get_similarity(
            ts_text_list,
            clause_string_list,
            "text_to_clause_text",
            sim_threshold=0
        )

    if pairs_sub_section:
        total_pairs_clause.append({
            k: pairs_sub_section
        })
    if pairs_clause:
        total_pairs_clause.append({k: pairs_clause})
    # break

In [None]:
# Number of per-section match dicts collected (up to two per TS section:
# sub-section matches and clause-text matches).
len(total_pairs_clause)

In [None]:
# import json

# with open('data/term_matching_csv/json/total_pairs_clause.json', 'w') as f:
#     json.dump(total_pairs_clause, f, indent=4)

In [None]:
### add TS term + text vs. FA schedule
# TODO: use the whole schedule or the details?
# 0729: add "part" in schedule; TS term vs. FA schedule part

def _unique_strings(values):
    """Distinct non-empty str items of ``values`` in first-seen order (NaN / non-str dropped)."""
    return list(dict.fromkeys(v for v in values if isinstance(v, str) and v))


# Built from df_ts directly so this cell does not depend on whatever the
# notebook-level `ts_section_list` currently holds.
all_ts_sections = _unique_strings(df_ts.processed_section)

schedule_section_list = _unique_strings(df_schedule.schedule)
schedule_part_list = _unique_strings(df_schedule.part)

pairs_schedule_section = dict()
if schedule_section_list and all_ts_sections:
    pairs_schedule_section = get_similarity(
        all_ts_sections,
        schedule_section_list,
        "schedule_section"
    )
else:
    print('No schedule section')

# Always defined, so the summary cell can call .get() on it even when the FA
# has no schedule parts.
pairs_schedule_part = dict()
if schedule_part_list and all_ts_sections:
    pairs_schedule_part = get_similarity(
        all_ts_sections,
        schedule_part_list,
        "schedule_part",
        sim_threshold=0
    )


total_pairs_sched = []
# check under the section candidates
for k, v in pairs_schedule_section.items():
    df_ts_sub = df_ts[df_ts.processed_section == k]
    # TS body texts of *this* section (section header rows excluded).
    ts_text_list = _unique_strings(df_ts_sub[df_ts_sub.text_element != 'section'].text)

    # Restrict the FA side to the schedules shortlisted for this TS section.
    candidates = [item['similar_term'] for item in v]
    df_sched_sub = df_schedule[df_schedule.schedule.isin(candidates)]
    sched_text_list = _unique_strings(df_sched_sub.text)
    pairs_sched = dict()
    print(len(ts_text_list), len(sched_text_list))
    if ts_text_list and sched_text_list:
        pairs_sched = get_similarity(
            ts_text_list,
            sched_text_list,
            "text_to_schedule_text",
            sim_threshold=0
        )
    if pairs_sched:
        total_pairs_sched.append({k: pairs_sched})
    print(k, len(df_ts_sub), len(ts_text_list), len(df_sched_sub))

In [None]:
# Number of TS sections that produced schedule-text matches.
len(total_pairs_sched)

In [None]:
# summarize all results
# Attach, per TS row, the section-level matches of each kind (None when the
# section label has no entry in the corresponding pairs dict).
section_level_pairs = {
    'similar_def': pairs_def,
    'similar_parties': pairs_parties,
    'similar_sub_section': pairs_sec_to_sub_sec,
    'similar_schedule': pairs_schedule_part,
}
for match_col, pairs in section_level_pairs.items():
    df_ts[match_col] = df_ts['processed_section'].apply(
        lambda label, pairs=pairs: pairs.get(label)
    )

# One frame per match kind, limited to the rows that actually have a match.
base_cols = ['section', 'processed_section', 'text']
df_ts_def = df_ts.loc[df_ts['similar_def'].notna(), base_cols + ['similar_def']]
df_ts_parties = df_ts.loc[df_ts['similar_parties'].notna(), base_cols + ['similar_parties']]
df_ts_sub_sec = df_ts.loc[df_ts['similar_sub_section'].notna(), base_cols + ['similar_sub_section']]
df_ts_sched = df_ts.loc[df_ts['similar_schedule'].notna(), base_cols + ['similar_schedule']]

In [None]:
### check
# For every match kind and every section, record {section: {text: matches}}.
total_pairs = []

section_match_frames = (
    (df_ts_def, 'similar_def'),
    (df_ts_parties, 'similar_parties'),
    (df_ts_sub_sec, 'similar_sub_section'),
    (df_ts_sched, 'similar_schedule'),
)
for frame, match_col in section_match_frames:
    for sec in list(set(frame.processed_section)):
        sub_df = frame[frame.processed_section == sec]
        text_to_matches = dict(zip(sub_df.text, sub_df[match_col]))
        total_pairs.append({sec: text_to_matches})

In [None]:
# Number of single-section dicts collected in total_pairs at this point.
len(total_pairs)

In [None]:
# Append the clause-level and schedule-level pairs to the section-level ones.
# This mutates total_pairs in place, so running the cell twice adds them twice.
for extra_pairs in (total_pairs_clause, total_pairs_sched):
    total_pairs.extend(extra_pairs)

In [None]:
# Number of single-section dicts now held in total_pairs.
len(total_pairs)

In [None]:
# with open('data/term_matching_csv/json/total_pairs.json', 'w') as f:
#     json.dump(total_pairs, f, indent=4)

In [None]:
def remark():
    '''Documentation-only placeholder describing the result structures; does nothing.

    Shape of a ``pairs_xx`` dict (one list of match records per term):

    pairs_xx = {
        term1: [{'similar_term': '', 'score': '', 'map_type': ''},...],
        term2: [{'similar_term': '', 'score': '', 'map_type': ''},...],
        ...
    }

    Shape of ``total_pairs_clause`` (a list of single-key dicts, each mapping
    a term to the match records of its texts):

    total_pairs_clause = [
        {
            term1: {
                text11: [{'similar_term': '', 'score': '', 'map_type': ''},...],
                text12: [{'similar_term': '', 'score': '', 'map_type': ''},...],
                ...
            },
        },
        {
            term2: {
                text21: [{'similar_term': '', 'score': '', 'map_type': ''},...],
                text22: [{'similar_term': '', 'score': '', 'map_type': ''},...],
                ...
            },
        }
        ...
    ]
    final format -> total_pairs_clause
    '''
    pass

In [None]:
from collections import defaultdict

# Merge all single-section dicts that share a section: for each section (in
# first-seen order) the match lists of equal texts are concatenated.
merged_by_section = {}
for pair in total_pairs:
    k = list(pair.keys())[0]
    if k not in merged_by_section:
        merged_by_section[k] = defaultdict(list)
    dd = merged_by_section[k]
    for i, j in pair[k].items():
        dd[i].extend(j)

keys = list(merged_by_section)
total_pairs_updated = [{k: merged_by_section[k]} for k in keys]

In [None]:
# with open('data/term_matching_csv/json/total_pairs_updated.json', 'w') as f:
#     json.dump(total_pairs_updated, f, indent=4)

In [None]:
# Flatten the nested {section: {text: [match, ...]}} structure into one
# record per (section, text, match).
results = [
    {
        'TS_section': sec,
        'TS_text': text,
        'match_term': item['similar_term'],
        'similarity': item['score'],
        'match_type': item['map_type']
    }
    for pair in total_pairs_updated
    for sec, value in pair.items()
    for text, match in value.items()
    for item in match
]

In [None]:

# Tabulate the flattened match records and drop exact duplicate rows.
df_results = pd.DataFrame(data=results).drop_duplicates()
print(len(df_results))

In [None]:
# Preview the first rows of the match table.
df_results.head()

In [None]:
# Order rows by TS_text, then similarity, both descending.
df_results = df_results.sort_values(by=['TS_text', 'similarity'], ascending=False)

In [None]:
# Number of result rows per match_type.
df_results.match_type.value_counts()

In [None]:
def _lookup_from(frame, key_col, value_col):
    """Map each ``key_col`` value to its ``value_col`` value over the distinct pairs of ``frame``.

    When a key occurs with several values, the last row wins.
    """
    lookup = dict()
    for _, row in frame[[key_col, value_col]].drop_duplicates().iterrows():
        lookup[row[key_col]] = row[value_col]
    return lookup


# map TS basic information: TS text -> [index, text_block_id, page_id]
ts_map = {}
ts_basic_cols = ['index','text_block_id','page_id', 'section','processed_section','text']
for _, row in df_ts[ts_basic_cols].drop_duplicates().iterrows():
    ts_map[row['text']] = [
        row['index'],
        row['text_block_id'],
        row['page_id'],
        # row['phrase_id']
        # row['section']
    ]

# One lookup per match type: matched FA string -> FA identifier.
# Both sub-section match types share the same dict object.
sub_section_ids = _lookup_from(
    df_fa[df_fa.text_element=='sub_section'], 'sub_section', 'identifier'
)
content2id = {
    'sec_to_def': _lookup_from(df_def, 'definition', 'identifier'),
    'text_to_clause_text': _lookup_from(df_clause, 'text', 'identifier'),
    'text_to_sub_sec': sub_section_ids,
    'text_to_schedule_text': _lookup_from(df_schedule, 'text', 'identifier'),
    'sec_to_parties': _lookup_from(df_parties, 'definition', 'identifier'),
    'sec_to_sub_sec': sub_section_ids
}

In [None]:
# FA identifier of each matched term. Match types without a lookup table in
# content2id (e.g. 'schedule_part') and unknown terms yield None instead of
# raising.
df_results['identifier'] = [
    content2id.get(match_type, {}).get(match_term)
    for match_type, match_term in zip(df_results['match_type'], df_results['match_term'])
]

# TS position columns: look each TS text up once; texts missing from ts_map
# get None in all three columns.
_ts_positions = [
    ts_map.get(ts_text) or (None, None, None)
    for ts_text in df_results['TS_text']
]
df_results['index'] = [position[0] for position in _ts_positions]
df_results['text_block_id'] = [position[1] for position in _ts_positions]
df_results['page_id'] = [position[2] for position in _ts_positions]
# df_results['phrase_id'] = df_results.apply(
#     lambda i: ts_map.get(i['TS_text'])[3],
#     axis=1
# )

# Best matches first within each (section, text) group.
df_results = df_results.sort_values(
    by=['TS_section', 'TS_text', 'similarity'],
    ascending=False
)



In [None]:
# Row count of the match table before the de-duplication below.
len(df_results)

In [None]:
# Drop rows that are exact duplicates across all columns.
df_results = df_results.drop_duplicates()

In [None]:
# Row count after de-duplication.
# NOTE(review): the trailing number looks like a count noted from an earlier run — confirm.
len(df_results) # 2924

In [None]:
# Inspect the matches recorded for the 'FATCA Clauses' TS section.
df_results[df_results.TS_section=='FATCA Clauses']

In [None]:
# Number of result rows per TS section.
df_results.TS_section.value_counts()

In [None]:
# df_results.to_csv('results_2_20230710.csv', index=False)

In [None]:
import os

In [None]:
# Collapse every checked result CSV into one row per (TS section, TS text),
# keeping the first rows of each group as parallel lists.
check_dir = 'data/term_matching_csv/20230712/check/'
new_dir = 'data/term_matching_csv/20230712/new/'
rows_per_group = 5

for f in sorted(os.listdir(check_dir)):
    if not f.endswith('.csv'):
        continue
    print(f)
    # Local name on purpose: the notebook-level `df_results` built above is
    # left untouched for the cells that follow.
    df_check = pd.read_csv(os.path.join(check_dir, f))
    final = []

    # sort=False keeps groups in order of first appearance, so the output row
    # order is the same on every run; groups with a missing key are skipped.
    for (term, text), df_ss in df_check.groupby(['TS_section', 'TS_text'], sort=False):
        try:
            final.append({
                'index': df_ss['index'].iloc[0],
                'text_block_id': df_ss['text_block_id'].iloc[0],
                'page_id': df_ss['page_id'].iloc[0],
                'phrase_id': df_ss['phrase_id'].iloc[0],
                'TS_term': term,
                'TS_text': text,
                'match_term_list': df_ss['match_term'].head(rows_per_group).tolist(),
                'identifier_list': df_ss['identifier'].head(rows_per_group).tolist(),
                'similarity_list': df_ss['similarity'].head(rows_per_group).tolist(),
                'match_type_list': df_ss['match_type'].head(rows_per_group).tolist()
            })
        except (KeyError, IndexError) as e:
            # A required column is missing from this file: report and go on.
            print(term, text, e)
    df_final = pd.DataFrame(data=final)
    # Only the trailing ".csv" extension is replaced.
    save_file = re.sub(r'\.csv$', '_results.csv', f)
    df_final.to_csv(os.path.join(new_dir, save_file), index=False)

In [None]:
# Collapse df_results into one row per (TS section, TS text), keeping the
# first rows of each group as parallel lists.
rows_per_group = 5
final = []

# sort=False keeps groups in order of first appearance, so the output row
# order is the same on every run; groups with a missing key are skipped.
for (term, text), df_ss in df_results.groupby(['TS_section', 'TS_text'], sort=False):
    try:
        final.append({
            'index': df_ss['index'].iloc[0],
            'text_block_id': df_ss['text_block_id'].iloc[0],
            'page_id': df_ss['page_id'].iloc[0],
            'TS_term': term,
            'TS_text': text,
            'match_term_list': df_ss['match_term'].head(rows_per_group).tolist(),
            'identifier_list': df_ss['identifier'].head(rows_per_group).tolist(),
            'similarity_list': df_ss['similarity'].head(rows_per_group).tolist(),
            'match_type_list': df_ss['match_type'].head(rows_per_group).tolist()
        })
    except (KeyError, IndexError) as e:
        # A required column is missing: report the group and go on.
        print(term, text, e)
df_final = pd.DataFrame(data=final)

In [None]:
# Write the per-(term, text) summary to a CSV in the working directory,
# without the DataFrame index.
df_final.to_csv('2_results_0712.csv', index=False)

In [None]:
## TODO: select top N

In [None]:
# check NaN
# Print, for every CSV directly under data/term_matching_csv/, the total
# number of missing cells (computed via isnull and again via isna).
import os
for f in os.listdir('data/term_matching_csv/'):
    if not f.endswith('.csv'):
        continue
    fpath = f'data/term_matching_csv/{f}'
    check = pd.read_csv(fpath)
    count_nan_in_df = check.isnull().sum().sum()
    count_nan2_in_df = check.isna().sum().sum()
    nan_label = 'Count of NaN: ' + str(count_nan_in_df)
    print (f, nan_label, str(count_nan2_in_df))

In [None]:
# Load a single docparse CSV for manual inspection.
check = pd.read_csv('data/9_GF_SYN_TS_mkd_docparse.csv')

In [None]:
# Row count of the loaded file.
len(check)

In [None]:
# Rows that have a non-null match_term.
check[~check.match_term.isna()]

In [None]:
### evaluation


In [None]:
# NOTE(review): `df` is first assigned further down, inside the evaluation
# loop; when the notebook is run top to bottom on a fresh kernel there is
# nothing to display here yet — confirm whether this cell belongs below that loop.
df

In [None]:
import os
import re

# Per-file evaluation: count the TP / FP / TN / FN labels in `judge_all` of
# every "...FA.csv" file and derive precision and recall from them.
results = []
path = "data/evaluation/20230712/"
for f in sorted(os.listdir(path)):
    # Escaped dot: match the literal "FA.csv" only.
    if not re.search(r'FA\.csv', f):
        continue
    df = pd.read_csv(os.path.join(path, f), encoding='utf8')
    label_counts = df.judge_all.value_counts()
    TP, FP, TN, FN = (
        int(label_counts.get(label, 0)) for label in ('TP', 'FP', 'TN', 'FN')
    )
    # '' marks a metric that is undefined because its denominator is zero.
    precision = TP / (TP + FP) if TP + FP > 0 else ''
    recall = TP / (TP + FN) if TP + FN > 0 else ''
    results.append({
        'fname': f,
        'TP': TP,
        'FP': FP,
        'TN': TN,
        'FN': FN,
        'precision': precision,
        'recall': recall
    })

In [None]:
# One row per evaluated file with its confusion counts, precision and recall.
results_df = pd.DataFrame(data=results)

In [None]:
# Display the per-file evaluation table.
results_df

In [None]:
# Recall pooled over all files: total TP / (total TP + total FN).
sum(results_df.TP) / (sum(results_df.TP) + sum(results_df.FN))

In [None]:
# evaluate based on fa_identifier

In [None]:
def match_identifier_type(fa_identifier):
    """Classify an FA identifier string by the kind of FA content it names.

    Checks are case-insensitive substring searches, applied in this order:

    * contains ``Cl_1.1-``  -> ``'definition'``
    * contains ``Cl``       -> ``'clause'``
    * contains ``Parties``  -> ``'parties'``
    * contains ``sched``    -> ``'schedule'``

    Returns ``''`` when ``fa_identifier`` is not a string (e.g. NaN) or when
    none of the patterns is found.
    """
    if not isinstance(fa_identifier, str):
        return ''
    # Raw string with escaped dot: only the literal "Cl_1.1-" prefix counts as
    # a definition.
    if re.search(r'Cl_1\.1-', fa_identifier, re.I):
        return 'definition'
    ordered_patterns = (
        ('Cl', 'clause'),
        ('Parties', 'parties'),
        ('sched', 'schedule'),
    )
    for pattern, idf_type in ordered_patterns:
        if re.search(pattern, fa_identifier, re.I):
            return idf_type
    return ''

In [None]:
### sample
# Stack the judged rows of every "...FA.csv" evaluation file into one frame,
# tagging each row with its file name and identifier type.

path = 'data/evaluation/20230712/'
sample_cols = ['filename', 'fa_identifier', 'identifier_type', 'judge_all']

sample_frames = []
for file in os.listdir(path):
    if re.search(r'FA\.csv', file):
        df = pd.read_csv(os.path.join(path, file))
        df['filename'] = file
        df['identifier_type'] = df['fa_identifier'].apply(match_identifier_type)
        sample_frames.append(df[sample_cols])

# Concatenate once at the end (DataFrame.append no longer exists in pandas 2);
# an empty frame with the expected columns is used when no file matched.
df_all = pd.concat(sample_frames) if sample_frames else pd.DataFrame(columns=sample_cols)

len(df_all)

In [None]:
# Number of judged rows per identifier type.
df_all.identifier_type.value_counts()

In [None]:
# Confusion counts, precision and recall per identifier type.
results = []

for idf in set(df_all.identifier_type):
    rows_of_type = df_all[df_all.identifier_type == idf]
    TP, FP, TN, FN = (
        len(rows_of_type[rows_of_type.judge_all == label])
        for label in ('TP', 'FP', 'TN', 'FN')
    )

    # '' marks a metric whose denominator is zero.
    if (TP + FP) > 0:
        precision = TP / (TP + FP)
    else:
        precision = ''
    if (TP + FN) > 0:
        recall = TP / (TP + FN)
    else:
        recall = ''

    results.append(dict(
        annotation_type=idf,
        TP=TP,
        FP=FP,
        TN=TN,
        FN=FN,
        precision=precision,
        recall=recall,
    ))

In [None]:
from pprint import pprint

In [None]:
# Raw dump of the per-type metric records.
print(results)

In [None]:
# Same records, pretty-printed.
pprint(results)

In [None]:
# 0712
# Per-type metric records shown as a table.

pd.DataFrame(data=results)