In [3]:
import pandas as pd
import numpy as np
import re
import os
from sentence_transformers import SentenceTransformer, util
from collections import defaultdict
import time
import json

MODELS = [
    "models/all-MiniLM-L6-v2",
    "/home/data/ldrs_analytics/models/all-MiniLM-L6-v2-train_nli-boc_17398_epoch_100_lr_1e-05",
    "/home/data/ldrs_analytics/models/all-MiniLM-L6-v2-train_sts-boc_17398_epoch_100_lr_1e-05"
]
MODEL_PATH = MODELS[0]
SIM_MODEL = SentenceTransformer(MODEL_PATH)

def timeit(func):
    from functools import wraps
    import time
    LOG_DIR = '/home/data/ldrs_analytics/data/log'

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        total_time = end_time - start_time
        row = {'task': func.__name__,
               'filename': args[0].fname,
               'runtime': total_time}
        log2csv(LOG_DIR + '/log_term_matching.csv', row)
        # print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return result
    return timeit_wrapper

@timeit
def log2csv(csv_name, row):
    '''
    log the record 'row' into path 'csv_name'
    '''
    import csv
    import os.path

    file_exists = os.path.isfile(csv_name)
    # Open the CSV file in "append" mode
    with open(csv_name, 'a', newline='') as f:
        # Create a dictionary writer with the dict keys as column fieldnames
        writer = csv.DictWriter(f, fieldnames=row.keys())
        if not file_exists:
            writer.writeheader() # file doesn't exist yet, write a header
            # Append single row to CSV
        writer.writerow(row)

def process_str(s):
    if not s or s == np.nan:
        return None
    s = re.sub('<s>', '', s)
    s = s.strip()
    return s

def process_ts_section(section):
    ''' clean parsed term in TS
    '''
    processed_section = ''
    try:
        processed_section = process_str(re.split('  ', section)[0])
    except Exception as e:
        print(section, e)
    return processed_section


def get_similarity(
    ts_string_list,
    fa_string_list,
    map_type,
    sim_threshold=0.6,
    model_path=MODEL_PATH,
    pretrained=SIM_MODEL,
    top_N=5
):
    from sentence_transformers import SentenceTransformer, util
    import torch
    
    ts_string_list = [str(i) for i in ts_string_list]
    fa_string_list = [str(i) for i in fa_string_list]
    
    if pretrained is not None:
        # model = SentenceTransformer(pretrained, device=torch.device("cuda", 2))
        model = pretrained
    else:
        model = SentenceTransformer(model_path, device=torch.device("cuda", 2))
        
    embeddings1 = model.encode(ts_string_list, convert_to_tensor=True)
    embeddings2 = model.encode(fa_string_list, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
    
    similar_pairs = {}
    for i in range(len(ts_string_list)):
        all_score = list(cosine_scores[i])
        above_threshold_idx = [all_score.index(k) for k in [j for j in all_score if j >= sim_threshold]]
        above_threshold_sims = [j.item() for j in all_score if j >= sim_threshold]
        idx_sims = list(zip(above_threshold_idx, above_threshold_sims))

        idx_sims = sorted(idx_sims, key=lambda x: x[1], reverse=True)[:top_N]
        ref_string = []
#         sim_score = []
        
        map_results = []
        for idx, sims in idx_sims:
            string = fa_string_list[idx]
            if string not in ref_string:
                ref_string.append(string)
                map_results.append({
                    'similar_term': string,
                    'score': round(sims, 2),
                    'map_type': map_type
                })
        if ref_string:
            similar_pairs.update({
                ts_string_list[i]: map_results
            })
    return similar_pairs 

In [5]:
f = "17_NBFI_SYN_TS_mkd_20221123_docparse.csv"
TS_folderpath = FA_folderpath = 'data/'

In [6]:
print('Processing: ', f)
start_time = time.perf_counter()
ts_file = os.path.join(TS_folderpath, f)
fa_f = re.sub('_TS_', '_FA_', f)
fa_file = os.path.join(FA_folderpath, fa_f)

# FA file
df_fa = pd.read_csv(fa_file).astype(str)
df_fa['index'] = df_fa['index'].astype(int)
# parties
isPartiesStart = df_fa.text.str.contains('^THIS AGREEMENT is|is made on|^PARTIES|Between:*', na=False, case=False)
isPartiesEnd = df_fa.text.str.contains('IT IS AGREED*:*|AGREED* as follows|AGREED* that', na=False, case=False)
partiesBeginID = df_fa[isPartiesStart]['index'].values[0] + 1
partiesEndID = df_fa[isPartiesEnd]['index'].values[0] - 1
parties_clause_id = df_fa['index'].between(isPartiesStart, isPartiesEnd)
df_fa.loc[parties_clause_id,'section'] = 'PARTIES'
df_fa.loc[parties_clause_id, 'section_id'] = '0'

df_fa = df_fa.replace({np.nan: None, 'nan': None, 'None': None})

# df_parties = df_fa[(df_fa.section_id == "0") | (df_fa.section_id == 0)] # cols: definition + text
df_parties = df_fa[df_fa.section=='PARTIES'] # line 149 to string, some section ids are like 0.0, not int.
# definition
df_def = df_fa[df_fa.sub_section.str.contains('Definition', na=False, case=False)] # cols: definition + text
df_def = df_def[~df_def.definition.isnull()]
# exclude parties & definition clause
df_others = df_fa[
    ~(df_fa.section.str.contains("INTERPRETATION", na=False, case=False)) & (df_fa.section_id  != "0")
]
# schedule
df_schedule = df_others.loc[df_others.schedule.notnull()]
# main clause
df_clause = df_others.loc[~df_others.schedule.notnull()]

# TS file
df_ts = pd.read_csv(ts_file)
df_ts["processed_section"] = df_ts["section"] #.apply(lambda i: process_ts_section(i))

Processing:  17_NBFI_SYN_TS_mkd_20221123_docparse.csv


In [8]:
set(df_ts.section)

{'Approvals',
 'Availability',
 'Borrower',
 'Cancellation',
 'Commitment Fee Waived. Interest Rate',
 'Coordinator',
 'Covenants',
 'Documentation',
 'Facility',
 'Facility Amount',
 'Governing Law',
 'Interest Calculation and Payment',
 'Interest Period',
 'Lenders',
 'Lenders. Agent',
 'Mandated Lead Arranger and Bookrunner ',
 'Maturity Date',
 'Purpose',
 'Repayment',
 'Signing Date',
 'Withholding Taxes',
 nan,
 'the Borrower shall immediately prepay all outstanding loans or that Lenders'}

In [23]:
df_ts

Unnamed: 0,index,text_block_id,page_id,phrase_id,section,text_element,list_id,text,keyphrase,text_granularity,clause_id,definition,schedule_id,annotation,docparse_datetime
0,0,0,1,0,,title,,Summary of Indicative Principal Terms and Cond...,,sentence,,,,,"10/07/2023, 14:15:52"
1,1,0,1,0,,paragraph,,This draft term sheet is indicative only and,['draft term sheet'],phrase,,,,,"10/07/2023, 14:15:52"
2,1,0,1,1,,paragraph,,does not constitute a legally binding commitme...,[],phrase,,,,,"10/07/2023, 14:15:52"
3,1,0,1,2,,paragraph,,The purpose of these indicative terms and,['indicative terms'],phrase,,,,,"10/07/2023, 14:15:52"
4,1,0,1,3,,paragraph,,conditions is to facilitate further discussion...,[],phrase,,,,,"10/07/2023, 14:15:52"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,225,55,14,0,Market disruption,paragraph,,,[],term,,,,,"10/07/2023, 14:15:52"
299,226,56,14,0,Break Costs,section,,Break Costs:,,sentence,,,,,"10/07/2023, 14:15:52"
300,227,56,14,0,Break Costs,paragraph,,If any Loan under the Facility is prepaid in w...,['Facility Agreement'],phrase,,,,,"10/07/2023, 14:15:52"
301,227,56,14,1,Break Costs,paragraph,,"each anniversary of that date, in respect of a...",['demand pay'],phrase,,,,,"10/07/2023, 14:15:52"


In [None]:
### todo: check # of sections in prev version & updated version

In [101]:
for pdffile in os.listdir('pdf/annotated_ts/'):
    try:
        ts_f = re.sub('_mkd_antd', '_mkd', re.sub('_antd_mkd', '_mkd_antd', pdffile))
        ts_f = re.sub('.pdf', '_docparse.csv', ts_f)
        match_f = re.sub('.csv', '_results.csv', ts_f)

        df_ts = pd.read_csv(os.path.join('data/annotated_ts/bfilled/', ts_f), encoding='utf8')
        df_ts = df_ts.replace({np.nan: None, 'nan': None, 'None': None})
        df_ts = df_ts[~df_ts.section.isna()]
        df_match = pd.read_csv(os.path.join(match_path, match_f), encoding='utf8')
        sec_ts = []
        for s in df_ts.section:
            if s.strip() not in sec_ts:
                sec_ts.append(s.strip())
        sec_match = [s.strip() for s in list(set(df_match.TS_term))]
        n1 = len(sec_ts)
        n2 = len(sec_match)

        with open(os.path.join('pdf/annotated_ts/', pdffile), 'rb') as f:
            pdf = pdftotext.PDF(f)
        count = 0
        regex = ':'
        length = max([len(line) for line in re.split('\n', page)])

        pdf_results = []
        for page in pdf:
            for line in re.split('\n', page):
                if re.search(regex, line[:round(length/2)]):
                    if len(line) - len(line.lstrip()) < 5:
                        # count += 1
                        pdf_results.append(
                            re.split(regex,line[:round(length/2)])[0].strip()
                        )
                    else:
                        pass
                        # print(line)
        pdf_results = list(set(pdf_results))
        pdf_results = [re.sub('• |\uf0b7 |\(s\)', '', item) for item in pdf_results]
        count = len(pdf_results)
        if count > n2:
            print(f, n1, n2, count)
            excluded2 = [e for e in pdf_results if e not in sec_ts]
            print(sec_ts)
            print(excluded2)
            print('\n\n\n')
    except Exception as e:
        print(e)

<_io.BufferedReader name='pdf/annotated_ts/78_GF_PRJ_TS_mkd_antd_20210722.pdf'> 27 27 29
['Summary of Indicative Terms and Conditions', '1. Borrower', '2. Lender', '3. Facilities  Tranche A', '4. Purpose', '5. Final Maturity Date', '6. Availability Period  Tranche A', '7. Drawdown', '8. Interest Rate', '9. Front-end Fee', '10 . All-in pricing', '11 . Repayment  Tranche A', '12 . Voluntary Prepayment', '13 . Cancellation', '14 . Commitment Fee', '15 . Default Interest', '16 Financial', '. Covenant', '17 . Other Undertakings', '18 . Status', '19 . Expenses', '20 . Increased Cost', '21 . Taxation', '21 . Documentation', '22 . Legal Counsel', '22 . Governing Law', '23 .']
['6. Availability', '(A) FATCA Deduction', '3. Facilities', '.  Covenant    that', '(B) FATCA information', '5. Final Maturity', '10 All-in pricing', '21 Taxation', '20 Increased Cost', '22 Legal Counsel', '21 Documentation', '"FATCA" means', '19 Expenses', '18 Status', '17 Other', '5) Insert as a new clause', '14 Commitm

[Errno 2] No such file or directory: 'data/annotated_ts/bfilled/40_AF_SYN_TS_mkd20190130_docparse.csv'
[Errno 2] No such file or directory: 'data/annotated_ts/bfilled/76_NBFI_SYN_TS_mkd_20211231_docparse.csv'
[Errno 2] No such file or directory: 'data/annotated_ts/bfilled/annotated_ts.json'
<_io.BufferedReader name='pdf/annotated_ts/12_NBFI_SYN_TS_mkd_antd_20221107.pdf'> 56 56 57
['Borrower', 'Guarantor', 'Obligors', 'Parent', 'Group', 'nan', 'Mandated Lead Arrangers and Bookrunners', 'Lenders', 'Majority Lenders', 'Facility Agent', 'Finance Parties', 'Facility Type and Facility Amount', 'Currency', 'Purpose', 'Final Maturity Date', '1 st Extension Option', '2 nd Extension Option', 'Availability Period', 'Drawdown', 'Repayment', 'Voluntary Prepayment', 'Mandatory Prepayment Events', 'Voluntary Cancellation', 'Interest Margin', 'Interest Period', 'Interest Rate', 'Default Interest', 'Base Rate', 'Arrangement Fee 1 st Extension Fee', '2 nd Extension Fee', 'Commitment Fee', 'Taxes and Ded

In [99]:
# match_path = 'term_matching_csv/20230807/'
# for f in os.listdir(match_path):
#     if f.endswith('csv'):
#         try:
#             ts_file = os.path.join('data/annotated_ts/bfilled/', re.sub('_results.csv', '.csv',f))
#             df_ts = pd.read_csv(ts_file, encoding='utf8')
#             df_ts = df_ts.replace({np.nan: None, 'nan': None, 'None': None})
#             df_ts = df_ts[~df_ts.section.isna()]
#             df_match = pd.read_csv(os.path.join(match_path,f), encoding='utf8')
#             sec_ts = []
#             for s in df_ts.section:
#                 if s not in sec_ts:
#                     sec_ts.append(s.strip())
#             sec_match = [s.strip() for s in list(set(df_match.TS_term))]
#             n1 = len(sec_ts)
#             n2 = len(sec_match)

#     #         excluded = [item for item in sec_ts if item not in sec_match]
#     #         print(excluded)

#             pdf_filename = re.sub('_docparse_results.csv', '.pdf', f)
#             pdf_filename = re.sub('_mkd', '_mkd_antd', pdf_filename)
#             with open(os.path.join('pdf/annotated_ts/', pdf_filename), 'rb') as pdffile:
#                 pdf = pdftotext.PDF(pdffile)
#             count = 0
#             regex = ':'
#             length = max([len(line) for line in re.split('\n', page)])
            
#             pdf_results = []
#             for page in pdf:
#                 for line in re.split('\n', page):
#                     if re.search(regex, line[:round(length/2)]):
#                         if len(line) - len(line.lstrip()) < 5:
#                             count += 1
#                             pdf_results.append(
#                                 re.split(regex,line[:round(length/2)])[0].strip()
#                             )
#                         else:
#                             pass
#                             # print(line)
            
#             if count > n2:
#                 print(f, n1, n2, count)
#                 excluded2 = [e for e in pdf_results if e not in sec_ts]
#                 print(sec_ts)
#                 print(excluded2)
#                 print('\n\n\n')
# #                 print(sec_ts)
# #                 print('-'*50)
# #                 print(pdf_results)
            
#         except Exception as e:
#             print(f, e)

In [67]:
f

'1_GL_SYN_TS_mkd_20221215_docparse_results.csv'

In [32]:
import pdftotext

In [54]:
with open("pdf/annotated_ts/1_GL_SYN_TS_mkd_antd_20221215.pdf", "rb") as f:
    pdf = pdftotext.PDF(f)

In [63]:
count = 0
regex = ':'
length = max([len(line) for line in re.split('\n', page)])
print(length)

for page in pdf:
    for line in re.split('\n', page):
        # length = len(line)
        if re.search(regex, line[:round(length/2)]):
            if len(line) - len(line.lstrip()) < 5:
                count += 1
                #print(line[:round(length/2)])
            else:
                print(line)

75
                           Tranche A: €[50] million term loan facility (the “Euro
                           Tranche B: US$[248] million term loan facility (the “US$
                      limited to:
                            less:         but more than
                  Euro Base Rate: EURIBOR, with floor at 0%. [Clause 1.1
                     basis:
                       (Information: miscellaneous)]; (v) notice of any change in
                       (Information: miscellaneous)]; (vi) notice of any Event of
                        Guarantee: PRC Law. [To be documented under the PRC
            follows:


In [64]:
count

60

In [4]:



# TS term section vs. FA definition
top_N = 5

ts_section_list = list(set(df_ts.processed_section))
ts_section_list = [s for s in ts_section_list if s]
def_string_list = list(set(df_def.definition))
def_string_list = [s for s in def_string_list if s]

pairs_def = dict()
if ts_section_list and def_string_list:
    pairs_def = get_similarity(
        ts_section_list,
        def_string_list,
        "sec_to_def",
        sim_threshold=0.9
    )
# else:
#     print(f, 'Check: No definition')

# TS term section vs. FA parties
parties_string_list = []
for s in list(set(df_parties.definition)):
    if s:
        if s not in parties_string_list:
            if isinstance(s, str):
                parties_string_list.append(s)
pairs_parties = dict()
if ts_section_list and parties_string_list:
    pairs_parties = get_similarity(
        ts_section_list,
        parties_string_list,
        "sec_to_parties",
        sim_threshold=0.5
    )
# else:
#     print(f, 'Check: No parties')

# TS term v.s. FA sub section
sub_sec_list = list(set(df_clause[df_clause.text_element == "sub_section"].sub_section))
sub_sec_list = [s for s in sub_sec_list if s]
pairs_sec_to_sub_sec = dict()
if ts_section_list and sub_sec_list:
    pairs_sec_to_sub_sec = get_similarity(
        ts_section_list,
        sub_sec_list,
        "sec_to_sub_sec",
        sim_threshold=0
    )
# else:
#     print(f, 'Check: No sub-section')
# TS term + text vs. FA clause
clause_section_list = list(set(df_clause[df_clause.text_element == "section"].section))
clause_section_list = [s for s in clause_section_list if s]
# TS section v.s. FA clause section -> select potential FA clause section

total_pairs_clause = []
pairs_clause_section = dict()
if ts_section_list and clause_section_list:
    pairs_clause_section = get_similarity(
        ts_section_list,
        clause_section_list,
        "clause_section",
        sim_threshold=0.3
    )
# else:
#     print(f, 'Check: No section')
    # check under the section candidates
    for k, v in pairs_clause_section.items():
        df_ts_sub = df_ts[df_ts.processed_section == k]    
        # TODO: improve this part, duplicates         
        ts_text_list = [] # process nan value
        for s in list(set(df_ts_sub[df_ts_sub.text_element!='section'].text)):
            if s:
                if s not in ts_text_list:
                    if isinstance(s, str):
                        ts_text_list.append(s)
        sub_ts_section_list = list(set(df_ts_sub.processed_section))

        candidates = [item['similar_term'] for item in v]
        df_clause_sub = df_clause[df_clause.section.isin(candidates)]
        # print(f'ts_section: {k}, ts_section_length: {len(df_ts_sub)}, ts_text_length: {len(ts_text_list)}, fa_section_length: {len(df_clause_sub)}')

        # TODO: improve this part
        sub_section_list = [] # process nan value
        for s in list(set(df_clause_sub[df_clause_sub.text_element == "sub_section"].sub_section)):
            if s:
                if s not in sub_section_list:
                    if isinstance(s, str):
                        sub_section_list.append(s)
        clause_string_list = list(set(
            df_clause_sub[(df_clause_sub.text_element != "section") & (df_clause_sub.text_element != "sub_section")].text
        ))

        pairs_sub_section = dict()
        # pairs_sec_to_sub_sec_partial = dict()
        if sub_section_list:
            if ts_text_list:
                pairs_sub_section = get_similarity(
                    ts_text_list,
                    sub_section_list,
                    "text_to_sub_sec",
                    sim_threshold=0
                )
            # else:
            #     print('no text in ', k)
            # if sub_ts_section_list:
            #     pairs_sec_to_sub_sec_partial = get_similarity(
            #         sub_ts_section_list,
            #         sub_section_list,
            #         "sec_to_sub_sec",
            #         sim_threshold=0
            #     )
            # else:
            #     print('no section in ', k)

        # else:
        #     print('no sub section in ', k)
        # if sub_sec_list:
        #     pairs_sec_to_sub_sec.update(pairs_sec_to_sub_sec_partial)
        pairs_clause = dict()
        if ts_text_list and clause_string_list:
            pairs_clause = get_similarity(
                ts_text_list,
                clause_string_list,
                "text_to_clause_text",
                sim_threshold=0
            )

        if pairs_sub_section:
            # case: same text under different terms
            total_pairs_clause.append({
                k: pairs_sub_section
            })
        if pairs_clause:
            total_pairs_clause.append({k: pairs_clause})

### add TS term + text vs. FA schedule
# TODO: use the whole schedule or the details?
# 0729: add "part" in schedule; TS term vs. FA schedule part
ts_section_list = list(set(df_ts.processed_section))
ts_section_list = [s for s in ts_section_list if s]
schedule_section_list = list(set(df_schedule.schedule))
schedule_part_list = []

for s in list(set(df_schedule.part)):
    if s:
        if s not in schedule_part_list:
            if isinstance(s, str):
                schedule_part_list.append(s)

pairs_schedule_section = dict()
if ts_section_list and schedule_section_list:
    pairs_schedule_section = get_similarity(
        ts_section_list,
        schedule_section_list,
        "schedule_section",
        sim_threshold=0.5
    )
# else:
#     print('No schedule section')

pairs_schedule_part = dict()
if ts_section_list and schedule_part_list:
    pairs_schedule_part = get_similarity(
        ts_section_list,
        schedule_part_list,
        "schedule_part",
        sim_threshold=0
    )
# else:
#     print('No schedule part')

total_pairs_sched = []
if ts_section_list and schedule_section_list:
    # check under the section candidates
    for k, v in pairs_schedule_section.items():
        df_ts_sub = df_ts[df_ts.processed_section == k]
        ts_text_list = list(set(df_ts_sub[df_ts_sub.text_element!='section'].text))

        candidates = [item['similar_term'] for item in v]
        df_sched_sub = df_schedule[df_schedule.schedule.isin(candidates)]
        sched_text_list = list(set(df_sched_sub.text))
        pairs_sched = dict()
        # print(f'ts_text_length: {len(ts_text_list)}, schedule_text_length: {len(sched_text_list)}')
        if ts_text_list and sched_text_list:
            pairs_sched = get_similarity(
                ts_text_list,
                sched_text_list,
                "text_to_schedule_text",
                sim_threshold=0
            )
        if pairs_sched:
            total_pairs_sched.append({k: pairs_sched})

# summarize all results

df_ts['similar_def'] = df_ts['processed_section'].apply(
    lambda i: pairs_def.get(i)
)
df_ts['similar_parties'] = df_ts['processed_section'].apply(
    lambda i: pairs_parties.get(i)
)
df_ts['similar_sub_section'] = df_ts['processed_section'].apply(
    lambda i: pairs_sec_to_sub_sec.get(i) if sub_sec_list else None
)
df_ts['similar_schedule'] = df_ts['processed_section'].apply(
    lambda i: pairs_schedule_part.get(i)
)
df_ts['similar_sched_section'] = df_ts['processed_section'].apply(
    lambda i: pairs_schedule_section.get(i)
)

df_ts_def = df_ts[~df_ts.similar_def.isna()][['section', 'processed_section', 'text','similar_def']]
df_ts_parties = df_ts[~df_ts.similar_parties.isna()][['section', 'processed_section',  'text','similar_parties']]
df_ts_sub_sec = df_ts[~df_ts.similar_sub_section.isna()][['section', 'processed_section',  'text', 'similar_sub_section']]
df_ts_sched = df_ts[~df_ts.similar_schedule.isna()][['section', 'processed_section',  'text', 'similar_schedule']]
df_ts_sched_sec = df_ts[~df_ts.similar_sched_section.isna()][['section', 'processed_section',  'text', 'similar_sched_section']]

### check
total_pairs = []

for sec in list(set(df_ts_def.processed_section)):
    sub_df = df_ts_def[df_ts_def.processed_section==sec]
    total_pairs.append({
        sec: dict(zip(sub_df.text, sub_df.similar_def))
    })

for sec in list(set(df_ts_parties.processed_section)):
    sub_df = df_ts_parties[df_ts_parties.processed_section==sec]
    total_pairs.append({
        sec: dict(zip(sub_df.text, sub_df.similar_parties))
    })

for sec in list(set(df_ts_sub_sec.processed_section)):
    sub_df = df_ts_sub_sec[df_ts_sub_sec.processed_section==sec]
    total_pairs.append({
        sec: dict(zip(sub_df.text, sub_df.similar_sub_section))
    })
for sec in list(set(df_ts_sched.processed_section)):
    sub_df = df_ts_sched[df_ts_sched.processed_section==sec]
    total_pairs.append({
        sec: dict(zip(sub_df.text, sub_df.similar_schedule))
    })
for sec in list(set(df_ts_sched_sec.processed_section)):
    sub_df = df_ts_sched_sec[df_ts_sched_sec.processed_section==sec]
    total_pairs.append({
        sec: dict(zip(sub_df.text, sub_df.similar_sched_section))
    })

total_pairs.extend(total_pairs_clause)
total_pairs.extend(total_pairs_sched)


keys = []
for pair in total_pairs:
    k = list(pair.keys())[0]
    if k not in keys:
        keys.append(k)


total_pairs_updated = []
for k in keys:
    sub_pairs = [p[k] for p in total_pairs if list(p.keys())[0] == k]
    dd = defaultdict(list)
    for p in sub_pairs:
        for i, j in p.items():
            dd[i].extend(j)

    total_pairs_updated.append({k: dd})

results = []

for pair in total_pairs_updated:
    for sec, value in pair.items():
        for text, match in value.items():
            for item in match:
                results.append({
                    'TS_section': sec,
                    'TS_text': text,
                    'match_term': item['similar_term'],
                    'similarity': item['score'],
                    'match_type': item['map_type']
                })

df_results = pd.DataFrame(data=results)
# print(json.dumps(results,indent=4))
# df_results = df_results.sort_values(by=['TS_text', 'similarity'], ascending=False)

ts_map = {}
for idx, row in df_ts[['index','text_block_id','page_id', 'phrase_id', 'section','processed_section','text']].drop_duplicates().iterrows():
    ts_map[row['text']] = [
        row['index'],
        row['text_block_id'],
        row['page_id'],
        row['phrase_id']
    ]

content2id = {
    'sec_to_def': dict(),
    'text_to_clause_text': dict(),
    'text_to_sub_sec': dict(),
    'text_to_schedule_text': dict(),
    'sec_to_parties': dict(),
    'sec_to_sub_sec': dict(),
    'schedule_part': dict(),
    'schedule_section': dict()
}

for idx, row in df_def[['definition', 'identifier']].drop_duplicates().iterrows():
    content2id['sec_to_def'].update({row['definition']: row['identifier']})

for idx, row in df_fa[df_fa.text_element=='sub_section'][['sub_section', 'identifier']].drop_duplicates().iterrows():
    content2id['text_to_sub_sec'].update({row['sub_section']: row['identifier']})
content2id['sec_to_sub_sec'] = content2id['text_to_sub_sec']
for idx, row in df_clause[['text', 'identifier']].drop_duplicates().iterrows():
    content2id['text_to_clause_text'].update({row['text']: row['identifier']})


for idx, row in df_schedule[['text', 'identifier']].drop_duplicates().iterrows():
    content2id['text_to_schedule_text'].update({row['text']: row['identifier']})
for idx, row in df_schedule[['part', 'identifier']].drop_duplicates().iterrows():
    content2id['schedule_part'].update({row['part']: row['identifier']})
for idx, row in df_schedule[df_schedule.text_element=='section'][['schedule', 'identifier']].drop_duplicates().iterrows():
    content2id['schedule_section'].update({row['schedule']: row['identifier']})

for idx, row in df_parties[['definition', 'identifier']].drop_duplicates().iterrows():
    content2id['sec_to_parties'].update({row['definition']: row['identifier']})

df_results['identifier'] = df_results.apply(
    lambda i: content2id[i['match_type']].get(i['match_term']),
    axis=1
)

df_results['index'] = df_results.apply(
    lambda i: ts_map.get(i['TS_text'])[0],
    axis=1
)
df_results['text_block_id'] = df_results.apply(
    lambda i: ts_map.get(i['TS_text'])[1],
    axis=1
)
df_results['page_id'] = df_results.apply(
    lambda i: ts_map.get(i['TS_text'])[2],
    axis=1
)
df_results['phrase_id'] = df_results.apply(
    lambda i: ts_map.get(i['TS_text'])[3],
    axis=1
)

df_results = df_results.sort_values(
    by=['TS_section', 'TS_text', 'similarity'],
    ascending=False
)
df_results = df_results[~df_results.match_term.isna()]
df_results = df_results.drop_duplicates()
check_csv_path = os.path.join(OUTPUT_folderpath, check_date, 'check', f)
df_results.to_csv(check_csv_path, index=False)

final = []

for term in list(set(df_results.TS_section)):
    df_s = df_results[df_results.TS_section==term]
    for text in list(set(df_s.TS_text)):
        df_ss = df_s[df_s.TS_text==text]
        try:
            final.append({
                'index': list(df_ss['index'])[0],
                'text_block_id': list(df_ss['text_block_id'])[0],
                'page_id': list(df_ss['page_id'])[0],
                'phrase_id': list(df_ss['phrase_id'])[0],
                'TS_term': term,
                'TS_text': text,
                'match_term_list': list(df_ss['match_term'])[:5],
                'identifier_list': list(df_ss['identifier'])[:5],
                'similarity_list': list(df_ss['similarity'])[:5],
                'match_type_list': list(df_ss['match_type'])[:5]
            })
        except Exception as e:
            print(f'TS_term: {term}, TS_text: {text}, Error: {e}')
df_final = pd.DataFrame(data=final)
save_f = re.sub('.csv', '_results.csv', f)
results_csv_path = os.path.join(OUTPUT_folderpath, check_date, save_f)
df_final.to_csv(results_csv_path, index=False)
# print('results are saved to: ', save_f)
end_time = time.perf_counter()
total_time = end_time - start_time
print(f'{f} matching time: {total_time} s')
done_list.append(f)

with open(os.path.join(OUTPUT_folderpath, f"done_term_matching_list_{check_date}.json"), "w") as outfile:
    outfile.write(json.dumps(done_list,indent=4))
except Exception as e:
print(f, e)                 

SyntaxError: invalid syntax (<ipython-input-4-3bd5fb28ace2>, line 431)