In [None]:
!pip3 install openpyxl

In [1]:
import json 
import pandas as pd 
from pathlib import Path 
from glob import glob 

In [3]:
BASE_DIR = "./data/input/" 

In [4]:
dfs = [] 
for filepath in glob(BASE_DIR + "/*"): 
    tempdf = pd.read_json(filepath) 
    filename = Path(filepath).name
    tempdf['filename'] = filename
    dfs.append(tempdf) 
    


In [5]:
df = pd.concat(dfs, ignore_index=True)
df = df.reset_index(drop=True)

In [6]:
print(f"Number of rows: {df.shape[0]}")
print(f"Number of unique files: {pd.unique(df.filename).shape[0]}")
print(f"Number of unique tags: {pd.unique(df.tags).shape[0]}")

Number of rows: 13664
Number of unique files: 135
Number of unique tags: 25


In [7]:
df[['tags']].groupby('tags').size().reset_index()

Unnamed: 0,tags,0
0,Additional Liens,108
1,Asset Disposition,65
2,Compliance Certificate,25
3,Consequences of Default,137
4,Event of Default,31
5,Events of Default,103
6,Facilities / Instrument,171
7,Financial Covenant,98
8,Financial Statements,24
9,Governing Laws,112


In [8]:
print(f"Number of files with empty section content: {pd.unique(df[df['section_content'].isna()].filename).shape[0]}")
print(f"Number of files with empty section and tag present content: {pd.unique(df[(df['section_content'].isna()) & (df['tags'].notna()) ].filename).shape[0]}")
print(f"Number of records where section is empty: {df[(df['section_content'].isna())].shape[0]}")

Number of files with empty section content: 46
Number of files with empty section and tag present content: 5
Number of records where section is empty: 117


In [9]:
df.head()

Unnamed: 0,title,category,tags,checklists,section_content,filename
0,Computation of Time Periods,FYI,,[],Section 1.02. Computation of Time Periods. In ...,"Abbvie - 6,000,000,000 TERM LOAN CREDIT AGREEM..."
1,Accounting Terms,FYI,,[],Section 1.03. Accounting Terms. Except as othe...,"Abbvie - 6,000,000,000 TERM LOAN CREDIT AGREEM..."
2,Terms Generally,FYI,,[],Section 1.04. Terms Generally. The definitions...,"Abbvie - 6,000,000,000 TERM LOAN CREDIT AGREEM..."
3,Divisions,FYI,,[],Section 1.05. Divisions. For all purposes unde...,"Abbvie - 6,000,000,000 TERM LOAN CREDIT AGREEM..."
4,The Advances,FYI,Facilities / Instrument,[],Section 2.01. The Advances.\nIf a Pre-Closing ...,"Abbvie - 6,000,000,000 TERM LOAN CREDIT AGREEM..."


In [10]:
df['section_content'] = df['section_content'].fillna("")
df['tags'] = df['tags'].fillna("NA")
df['title'] = df['title'].fillna("") 
df['category'] = df['category'].fillna("")

In [11]:
df[df['section_content'].isna()].fillna("NA").groupby('tags').size().reset_index().rename(columns={0:'count'}).sort_values(by='count', ascending=False)

Unnamed: 0,tags,count


In [12]:
# df[df['section_content'].isna()].to_excel("./data/output/empty_section_content.xlsx", index=False)

In [13]:
df['word_count'] = df['section_content'].fillna("").apply(lambda x : len(x.split()))

# Filter section content with small text

In [14]:
df = df[(df.word_count > 20)]

In [15]:
df['section_content'] = df['title'].apply(lambda x: f"Title is {x}") + "\n" + df['category'].apply(lambda x: f"Category is {x}") + "\n" +  df['section_content']

In [16]:
df.word_count.describe()

count    13084.000000
mean       475.536915
std        688.926760
min         21.000000
25%        114.000000
50%        246.000000
75%        547.250000
max       9381.000000
Name: word_count, dtype: float64

In [17]:
print(df.shape)

(13084, 7)


# Stratified Split

In [18]:
from sklearn.model_selection import StratifiedKFold

In [19]:
skf = StratifiedKFold(n_splits=3)

In [20]:
for train_index, test_index in skf.split(df['section_content'], df['tags']):
    print(train_index.shape)
    print(test_index.shape)

(8722,)
(4362,)
(8723,)
(4361,)
(8723,)
(4361,)


In [22]:
stratified_sample = df.groupby('tags', group_keys=False).apply(lambda x: x.sample(frac=0.3))

In [23]:
stratified_sample.shape

(3924, 7)

In [24]:
stratified_sample.tags.unique().shape, df.tags.unique().shape

((24,), (24,))

In [25]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [26]:
unique_filenames = stratified_sample.filename.unique()

In [27]:

# Split filenames into train and test sets
train_files, test_files = train_test_split(unique_filenames, test_size=0.2, random_state=42)

# Create train and test datasets based on the split filenames
train_df = stratified_sample[stratified_sample['filename'].isin(train_files)].reset_index() 
test_df = stratified_sample[stratified_sample['filename'].isin(test_files)].reset_index()
del train_df['index'] 
del test_df['index'] 
train_df.shape, test_df.shape

((3108, 7), (816, 7))

In [28]:
train_df.tags.unique().shape, test_df.tags.unique().shape

((24,), (24,))

In [29]:
train_df.word_count.describe()

count    3108.000000
mean      467.413449
std       689.396518
min        21.000000
25%       111.000000
50%       246.000000
75%       528.250000
max      7032.000000
Name: word_count, dtype: float64

In [30]:
selected_columns = ["section_content", "tags"]

In [31]:
train_tags = set(train_df.tags.unique())
test_tags = set(test_df.tags.unique())

In [32]:
print(test_tags - train_tags )
assert not test_tags - train_tags 

set()


In [33]:
import os 

In [34]:
OUTPUT_DIR = "./data/dataset/v2-mini"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [35]:
train_df[selected_columns].to_json(os.path.join(OUTPUT_DIR, "train.json")) 
test_df[selected_columns].to_json(os.path.join(OUTPUT_DIR, "test.json"))