In [80]:
import os
import pandas as pd
import pypdf
import shutil

from pdf2image import convert_from_path

# Categorizing Docs

Because we don't have any existing way to organize the docs, we're going to naively group them by file name, which is the only information we really have. We'll start with the largest groups and do analysis from there.

Assumptions:
- we're only interested in docs with 'cab.' in the name (matched case-insensitively)
- the fund name is the first word after the 'cab.' portion of the name

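We're narrowing our exploration to the largest groups (the code below keeps the top 6 by document count, of which 5 still have files after the page filter) and, within those, to PDFs with 1 page only; we don't want to spend time working out which page is our page of interest in multi-page docs, although we might want to figure that out another day.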

In [None]:
# Folder holding the sample CAB PDFs.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that folder exists.
sample_dir = r"C:\Users\bernard.wong\OneDrive - STEPSTONE GROUP LP\Documents\GitHub - Projects\cab_dev\sample_cabs"

# Keep only the directory entries whose name ends with ".pdf" (case-sensitive match),
# in whatever order os.listdir returns them.
sample_files = [entry for entry in os.listdir(sample_dir) if entry.endswith(".pdf")]

In [70]:
# Tunable thresholds for this exploration.
MIN_DOCS_PER_FUND = 5        # keep funds that have at least this many docs
NUM_GROUPS_TO_EXPLORE = 6    # how many of the largest fund groups to look at

summary_df = pd.DataFrame({'file_name': sample_files})
print('total number of files: ' + str(len(summary_df)))

# only want files with 'cab.' in the name (case-insensitive, plain substring match)
has_cab = summary_df['file_name'].apply(lambda name: 'cab.' in name.lower())
# .copy() gives an independent frame, so adding the 'fund' column below never
# writes into a slice of the unfiltered frame
summary_df = summary_df[has_cab].copy()
# assuming the first whitespace-delimited word after the first 'cab.' is the primary fund word
summary_df['fund'] = summary_df['file_name'].apply(lambda name: name.lower().split('cab.')[1].split()[0])

# one row per fund holding the list of its file names, largest groups first
grouped_fund_df = summary_df.groupby('fund')['file_name'].apply(list).reset_index()
grouped_fund_df['num_docs'] = grouped_fund_df['file_name'].apply(len)
grouped_fund_df = grouped_fund_df.sort_values('num_docs', ascending = False)
print('number of potentially different funds: ' + str(len(grouped_fund_df)))
# only want funds with at least MIN_DOCS_PER_FUND docs
grouped_fund_df = grouped_fund_df[grouped_fund_df['num_docs'] >= MIN_DOCS_PER_FUND]
print('number of interesting funds: ' + str(len(grouped_fund_df)))
print('total docs: ' + str(grouped_fund_df['num_docs'].sum()))

interest_group_df = grouped_fund_df.head(NUM_GROUPS_TO_EXPLORE)

interest_group_df

total number of files: 2056
number of potentially different funds: 806
number of interesting funds: 93
total docs: 872


Unnamed: 0,fund,file_name,num_docs
693,ss,"[2021.09.30.CAB.SS A.SA.pdf, 2021.09.30.CAB.SS...",72
143,carlyle,[2021.09.30.CAB.Carlyle Asia Growth I.CPO III ...,30
673,silverstone,[2021.09.30.CAB.SilverStone Balfour.SSOF IV Of...,27
543,ocm,[2021.09.30.CAB.OCM Euro II USD.Sobrato FH.pdf...,24
94,bain,"[2021.09.30.CAB.Bain Asia II.Midland.pdf, 2021...",21
33,ag,"[2021.09.30.CAB.AG Asia Realty II.NCRS.pdf, 20...",20


In [72]:
# re-expanding groups to get page numbers of each file 
exploded_interest_group_df = interest_group_df.explode('file_name').reset_index(drop=True).drop(['num_docs'], axis = 1)
exploded_interest_group_df['num_pages'] = exploded_interest_group_df['file_name'].apply(lambda x: len(pypdf.PdfReader(os.path.join(sample_dir, x)).pages))
exploded_interest_group_df = exploded_interest_group_df[exploded_interest_group_df['num_pages'] == 1]
exploded_interest_group_df = pd.DataFrame(exploded_interest_group_df.groupby('fund')['file_name'].apply(lambda x: list(x))).reset_index()
exploded_interest_group_df['num_docs'] = exploded_interest_group_df['file_name'].apply(lambda x: len(x))
exploded_interest_group_df = exploded_interest_group_df.sort_values('num_docs', ascending = False)

exploded_interest_group_df

Unnamed: 0,fund,file_name,num_docs
4,ss,"[2021.09.30.CAB.SS A.SA.pdf, 2021.09.30.CAB.SS...",71
3,silverstone,[2021.09.30.CAB.SilverStone Balfour.SSOF IV Of...,27
0,ag,"[2021.09.30.CAB.AG Asia Realty II.NCRS.pdf, 20...",20
1,bain,"[2021.09.30.CAB.Bain Asia II.Midland.pdf, 2021...",19
2,carlyle,[2021.09.30.CAB.Carlyle Property Investors.BMO...,1


# re-organizing and viewing doc differences

In [76]:
# Start from a clean output folder: drop whatever a previous run left, then recreate it.
output_folder = 'generated_groupings'
if os.path.isdir(output_folder):
    shutil.rmtree(output_folder)
os.makedirs(output_folder)

# One sub-folder per fund, filled with copies of that fund's files taken from sample_dir.
for fund_name, fund_files in zip(exploded_interest_group_df['fund'], exploded_interest_group_df['file_name']):
    fund_folder = os.path.join(output_folder, fund_name)
    os.makedirs(fund_folder)
    for file in fund_files:
        source_path = os.path.join(sample_dir, file)
        target_path = os.path.join(fund_folder, file)
        shutil.copyfile(source_path, target_path)

In [82]:
# Walk every fund folder under output_folder, print its files, and rasterize each PDF.
for fund_folder in os.listdir(output_folder):
    print(fund_folder)
    fund_path = os.path.join(output_folder, fund_folder)
    for file in os.listdir(fund_path):
        print(' - ' + file)
        # NOTE(review): file_img is overwritten on every iteration and is not used in this cell.
        # NOTE(review): the recorded output also shows each file's path, which this code does not print.
        # 500 is the second positional argument -- presumably the DPI; confirm against pdf2image.
        file_img = convert_from_path(os.path.join(fund_path, file), 500)

ag
 - 2021.09.30.CAB.AG Asia Realty II.NCRS.pdf
generated_groupings\ag\2021.09.30.CAB.AG Asia Realty II.NCRS.pdf
 - 2021.09.30.CAB.AG Asia Realty III.Asia Realty Access Feeder.pdf
generated_groupings\ag\2021.09.30.CAB.AG Asia Realty III.Asia Realty Access Feeder.pdf
 - 2021.09.30.CAB.AG Asia Realty IV.LACERA.pdf
generated_groupings\ag\2021.09.30.CAB.AG Asia Realty IV.LACERA.pdf
 - 2021.09.30.CAB.AG Commercial RED Opps III.W VA.pdf
generated_groupings\ag\2021.09.30.CAB.AG Commercial RED Opps III.W VA.pdf
 - 2021.09.30.CAB.AG Commercial RED Opps.W VA.pdf
generated_groupings\ag\2021.09.30.CAB.AG Commercial RED Opps.W VA.pdf
 - 2021.09.30.CAB.AG Core Plus Realty II.NY Teamsters.pdf
generated_groupings\ag\2021.09.30.CAB.AG Core Plus Realty II.NY Teamsters.pdf
 - 2021.09.30.CAB.AG Core Plus Realty III.COH Beckman.pdf
generated_groupings\ag\2021.09.30.CAB.AG Core Plus Realty III.COH Beckman.pdf
 - 2021.09.30.CAB.AG Core Plus Realty IV.COH Beckman.pdf
generated_groupings\ag\2021.09.30.CAB.AG C