In [8]:
!pip3 install -q imblearn
!pip3 install -q boto3>=1.28.59
!pip3 install -q openpyxl

[0m

In [9]:
import pandas as pd 

In [10]:
# Load the cleaned tag dataset (a JSON array of records).
# NOTE(review): `df` is reassigned from ./data/all-data.json a few cells below
# without this frame being used in between (in the visible cells), so this
# load looks redundant — confirm before removing.
df = pd.read_json("./data/cleaned_tags_data_na_tag_lt3.json", orient='records')

In [11]:
# Spreadsheet whose `File` column is used below to choose which files to keep.
file_analysis_df = pd.read_excel("./data/File-Analysis.xlsx")

In [12]:
# Names of the files to keep: the first 100 entries of the `File` column.
good_files = set(file_analysis_df["File"].iloc[:100].to_list())

In [13]:
# Reload the full dataset, normalise one tag spelling, then drop every section
# whose text occurs more than once (keep=False discards all copies, not just
# the extras). Finally keep only rows whose filename is in `good_files`.
df = (
    pd.read_json("./data/all-data.json")
    .assign(tags=lambda frame: frame["tags"].str.replace("Events of Default", "Event of Default"))
    .drop_duplicates(subset="section_content", keep=False)
)
df = df[df["filename"].isin(good_files)]

In [25]:
# For each distinct section text, collect the set of tags attached to it and
# record how many distinct tags that is.
agg_df = (
    df[["tags", "section_content"]]
    .groupby("section_content")
    .agg(set)
    .reset_index()
)
agg_df["tag_size"] = agg_df["tags"].map(len)

In [27]:
# Shape of the subset of section texts that carry more than one tag.
agg_df.loc[agg_df["tag_size"] > 1].shape

(7, 3)

In [None]:
# Rows per tag, rarest first (the unnamed count column is labelled 0).
(
    df.groupby("tags")
    .size()
    .reset_index()
    .sort_values(by=0)
)

In [27]:
# Number of rows per tag in a `freq` column, sorted from rarest to most common.
label_counts = (
    df.groupby("tags")
    .size()
    .reset_index(name="freq")
    .sort_values(by="freq")
)
label_counts

Unnamed: 0,tags,freq
2,Compliance Certificate,11
12,Mandatory Prepayments / Redemption,14
7,Financial Statements,23
19,Restricted Investments,23
14,Optional Prepayment / Redemption,26
9,Incremental Facilities,40
20,Restricted Payments,42
15,Permitted Indebtedness,44
3,Consequences of Default,45
1,Asset Disposition,47


In [28]:
# Target number of samples per tag: the median tag frequency plus a fixed
# margin. Despite its name, `median_count` is therefore above the median.
TARGET_MARGIN = 30
median_count = label_counts["freq"].median() + TARGET_MARGIN

In [29]:
# Tags whose frequency is below the target count; these get topped up.
undersampled_labels = label_counts.loc[label_counts["freq"] < median_count]

In [38]:
# Preview the labels that will be augmented. The positional slice skips the
# first three rows (the three lowest-frequency tags, since the frame is sorted
# by `freq`); `undersample_gen` iterates over the same slice.
undersampled_labels.iloc[3:]

Unnamed: 0,tags,freq
19,Restricted Investments,23
14,Optional Prepayment / Redemption,26
9,Incremental Facilities,40
20,Restricted Payments,42
15,Permitted Indebtedness,44
3,Consequences of Default,45
1,Asset Disposition,47
21,Transactions with Affiliates,48
17,Prepayment,56
8,Governing Laws,61


## Generate Synthetic Content for Under-represented (Undersampled) Tags

In [31]:
import boto3 
import json 

In [32]:
# Bedrock runtime client used by `synthesize` to invoke the model. No region or
# credentials are passed here, so boto3 resolves them from its default
# configuration.
bedrock = boto3.client(service_name="bedrock-runtime")

In [33]:
# Prompt template sent to the model by `synthesize`. The `{section_content}`
# placeholder is filled with `str.format`; the model is asked to reason inside
# <thinking> tags and to return the synthetic text inside <content> tags, which
# is what the generation loop extracts.
prompt = """You are a financial expert tasked with generating synthetic text from a given section of a Credit Agreement document. Here are the guidelines for generating synthetic content. 
- Maintain content length similar to the original content provided
- Modify the content slightly so that it can be used for training
- Do not deviate from the language of the original content
- You can modify the numbers, names of persons, companies, locations, or other nouns to generate the synthetic content. Maintain the consistency of the replaced names
- You can also replace the content with synonyms without overdoing it. 
Think deeply about the content and how it can be synthesized in the <thinking> tags.
Then finally, generate synthesized content in <content> tags. 
Here's the section content as follows: 
<section_content> 
{section_content}
</section_content> 
"""

In [39]:
from tqdm import tqdm
import time
import re 

In [68]:
def synthesize(section_content, model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
               max_tokens=4096, pause_seconds=10):
    """Ask the Bedrock model for a synthetic variant of one section.

    Args:
        section_content: The section text to rewrite. A sequence of lines is
            also accepted and is joined with newlines; without this, a list
            would be rendered into the prompt as its Python repr
            (``['line 1', 'line 2', ...]``).
        model_id: Bedrock model identifier to invoke.
        max_tokens: Upper bound on generated tokens sent in the request.
        pause_seconds: Seconds to sleep after each call (presumably to stay
            under the service's rate limits — confirm).

    Returns:
        str: The text of the first content block of the model's reply. Given
        the prompt, this is expected to contain the ``<thinking>`` and
        ``<content>`` sections, which the caller must parse.
    """
    if not isinstance(section_content, str):
        section_content = "\n".join(section_content)

    request_body = json.dumps({
        "max_tokens": max_tokens,
        "messages": [
            {"role": "user", "content": prompt.format(section_content=section_content)}
        ],
        "anthropic_version": "bedrock-2023-05-31",
    })

    raw_response = bedrock.invoke_model(body=request_body, modelId=model_id)
    response_body = json.loads(raw_response["body"].read())
    generated_text = response_body["content"][0]["text"]

    # Throttle consecutive calls.
    time.sleep(pause_seconds)
    return generated_text

In [69]:
# Accumulates one dict per synthesized row; appended to by the generation loop
# and converted to a DataFrame afterwards.
synthesized_content = [] 

In [70]:
def undersample_gen():
    """Yield existing rows to use as seeds for synthetic samples, tag by tag.

    For every label in ``undersampled_labels[3:]`` the shortfall
    ``int(median_count - freq)`` is computed, and that many rows carrying the
    tag are yielded from ``df``. Rows are taken in order and cycled through
    again when the shortfall exceeds the number of rows available.

    Yields:
        dict: One record (an element of ``DataFrame.to_dict(orient='records')``)
        per synthetic sample to generate.
    """
    labels = undersampled_labels[3:]
    # itertuples() has no length, so give tqdm the total to get a progress bar.
    for t in tqdm(labels.itertuples(), total=len(labels)):
        fill_count = int(median_count - t.freq)
        print(t.tags, fill_count)

        # Materialise the tag's rows once instead of on every pass.
        rows = df[df.tags == t.tags].to_dict(orient='records')
        if not rows:
            # Nothing to copy from; an unconditional retry loop would never end.
            continue

        # range() is empty for a non-positive shortfall, so this always terminates.
        for i in range(fill_count):
            yield rows[i % len(rows)]
    

In [71]:
# Create the generator once. It is consumed lazily by the generation loop, so
# re-running that loop continues from where the generator left off rather than
# starting over; re-run this cell to restart from the first tag.
samples = undersample_gen()

In [72]:
# Dry-run stub — uncomment to test the loop below without calling Bedrock:
# synthesize = lambda x: f"<content>{x}</content>"

In [None]:
# Generate one synthetic section per sampled row. The first two lines of each
# section are kept verbatim as a header; only the remaining text is sent to the
# model, and the reply's <content> block is stitched back under that header.
for row in samples:
    lines = row["section_content"].split("\n")
    header_lines = lines[:2]
    # Send the body as text; passing the list itself would put its Python repr
    # into the prompt.
    body_text = "\n".join(lines[2:])

    reply = synthesize(body_text)
    # DOTALL lets "." span newlines; greedy match up to the last closing tag.
    match = re.search(r"<content>(.+)</content>", reply, flags=re.DOTALL)
    if match is None:
        # Skip replies without a <content> block instead of failing with an
        # IndexError part-way through a long run.
        print(f"No <content> block returned for {row['filename']!r}; skipping")
        continue

    # Join with an explicit newline so the header never runs into the text,
    # whatever whitespace the model put around the generated content.
    content = "\n".join(header_lines + [match.group(1).strip()])
    synthesized_row = {
        'title': row['title'],
        'category': row['category'],
        'tags': row['tags'],
        # NOTE(review): output key is 'checklist' but the source column is
        # 'checklists' — confirm which name downstream code expects.
        'checklist': row['checklists'],
        'filename': "[synth]" + row['filename'],
        'section_content': content,
        'word_count': len(content.split())
    }
    synthesized_content.append(synthesized_row)

In [65]:
# One row per synthesized record; columns come from the dict keys.
synth_df = pd.DataFrame(synthesized_content)

In [67]:
# Synthesized rows per tag, fewest first (the unnamed count column is labelled 0).
(
    synth_df.groupby("tags")
    .size()
    .reset_index()
    .sort_values(by=0)
)

Unnamed: 0,tags,0
7,Loan Repayment,6
0,Additional Liens,8
10,Premium and Fees,8
4,Financial Covenant,12
3,Event of Default,14
15,Waterfall of Proceeds,22
5,Governing Laws,25
11,Prepayment,30
14,Transactions with Affiliates,38
1,Asset Disposition,39
