In [49]:
import os
import pandas as pd
import pypdf
import shutil
from PIL import Image
import numpy as np
from pdf2image import convert_from_path
from matplotlib import pyplot as plt
import cv2

%matplotlib inline

# Categorizing Docs

Because we don't have any other way to organize the docs, we're going to naively group them by name. The doc names are the only information we have to organize things, so we'll start with the largest groups and do analysis from there.

Assumptions:
- we're only interested in docs with 'cab.' in the name (case-insensitive)
- the fund name is the first word after the 'cab.' portion of the name

We're narrowing our exploration to the largest groups (the code below takes the top 6 by document count) and then keeping only PDFs with exactly 1 page, which leaves 5 groups. We don't want to spend time working out which page of a multi-page doc is the page of interest, although we might want to figure that out another day.

In [83]:
# Folder holding the sample PDFs (relative to the working directory).
sample_dir = r"sample_cabs"

# Names (not full paths) of every directory entry ending in ".pdf".
sample_files = [entry for entry in os.listdir(sample_dir) if entry.endswith(".pdf")]

In [86]:
# Tunables for this cell.
MIN_DOCS_PER_FUND = 5   # a fund is "interesting" with at least this many docs
NUM_TOP_GROUPS = 6      # how many of the largest groups to carry forward

# One row per sample PDF.
summary_df = pd.DataFrame({'file_name': sample_files})
print('total number of files: ' + str(len(summary_df)))

# only want files with 'cab.' in the (lower-cased) name.
# The explicit .copy() makes the filtered frame independent of the original, so
# adding the 'fund' column below does not write into a slice of another frame
# (which is what raises SettingWithCopyWarning).
has_cab = summary_df['file_name'].apply(lambda name: 'cab.' in name.lower())
summary_df = summary_df[has_cab].copy()

# assuming the first whitespace-delimited word after the first 'cab.' is the
# primary fund word
summary_df['fund'] = summary_df['file_name'].apply(
    lambda name: name.lower().split('cab.')[1].split()[0]
)

# Collapse to one row per fund: the list of its files plus a document count,
# largest groups first.
grouped_fund_df = summary_df.groupby('fund')['file_name'].apply(list).reset_index()
grouped_fund_df['num_docs'] = grouped_fund_df['file_name'].apply(len)
grouped_fund_df = grouped_fund_df.sort_values('num_docs', ascending=False)
print('number of potentially different funds: ' + str(len(grouped_fund_df)))

# only want funds with at least MIN_DOCS_PER_FUND docs
grouped_fund_df = grouped_fund_df[grouped_fund_df['num_docs'] >= MIN_DOCS_PER_FUND]
print('number of interesting funds: ' + str(len(grouped_fund_df)))
print('total docs: ' + str(grouped_fund_df['num_docs'].sum()))

# The frame is already sorted by num_docs (descending), so head() yields the
# largest groups.
interest_group_df = grouped_fund_df.head(NUM_TOP_GROUPS)

interest_group_df

total number of files: 2056
number of potentially different funds: 806
number of interesting funds: 93
total docs: 872


Unnamed: 0,fund,file_name,num_docs
693,ss,"[2021.09.30.CAB.SS AP B.Farley I.pdf, 2021.09....",72
143,carlyle,[2021.09.30.CAB.Carlyle Realty VII.CPO II Para...,30
673,silverstone,"[2021.09.30.CAB.SilverStone II-D.Montauk.pdf, ...",27
543,ocm,"[2021.09.30.CAB.OCM XI Feeder.TransGlobe.pdf, ...",24
94,bain,"[2021.09.30.CAB.Bain XII.Midland.pdf, 2021.09....",21
33,ag,[2021.09.30.CAB.AG Energy Credit Opps.CCCERA.p...,20


In [87]:
# Go back to one row per file so each PDF's page count can be looked up.
files_by_fund = (
    interest_group_df
    .explode('file_name')
    .reset_index(drop=True)
    .drop(columns=['num_docs'])
)
files_by_fund['num_pages'] = files_by_fund['file_name'].map(
    lambda pdf_name: len(pypdf.PdfReader(os.path.join(sample_dir, pdf_name)).pages)
)

# Keep single-page documents only.
single_page_files = files_by_fund[files_by_fund['num_pages'] == 1]

# Regroup per fund: the list of its single-page files and how many there are,
# largest groups first.
exploded_interest_group_df = (
    single_page_files.groupby('fund')['file_name'].apply(list).reset_index()
)
exploded_interest_group_df['num_docs'] = exploded_interest_group_df['file_name'].apply(len)
exploded_interest_group_df = exploded_interest_group_df.sort_values('num_docs', ascending=False)

exploded_interest_group_df

Unnamed: 0,fund,file_name,num_docs
4,ss,"[2021.09.30.CAB.SS AP B.Farley I.pdf, 2021.09....",71
3,silverstone,"[2021.09.30.CAB.SilverStone II-D.Montauk.pdf, ...",27
0,ag,[2021.09.30.CAB.AG Energy Credit Opps.CCCERA.p...,20
1,bain,"[2021.09.30.CAB.Bain XII.Midland.pdf, 2021.09....",19
2,carlyle,[2021.09.30.CAB.Carlyle Property Investors.BMO...,1


# re-organizing and viewing doc differences

In [88]:
# Rebuild the output tree from scratch: any folder left by an earlier run is
# deleted before a fresh, empty one is created.
output_folder = 'generated_groupings'
if os.path.isdir(output_folder):
    shutil.rmtree(output_folder)
os.makedirs(output_folder)

# One sub-folder per fund, holding copies of that fund's PDFs.
for fund_name, fund_files in zip(exploded_interest_group_df['fund'],
                                 exploded_interest_group_df['file_name']):
    fund_folder = os.path.join(output_folder, fund_name)
    os.makedirs(fund_folder)
    for pdf_name in fund_files:
        source_path = os.path.join(sample_dir, pdf_name)
        shutil.copyfile(source_path, os.path.join(fund_folder, pdf_name))

In [89]:
# Number of documents rendered per fund (the previous manual counter stopped
# after the sixth file).
MAX_IMAGES_PER_FUND = 6

# fund folder name -> list of BGR image arrays, one per rendered PDF
image_dict = {}

for fund_folder in os.listdir(output_folder):
    fund_path = os.path.join(output_folder, fund_folder)
    # Skip stray entries that are not fund folders; listing them would fail.
    if not os.path.isdir(fund_path):
        continue
    print(fund_folder)

    # Only PDFs can be rendered. Filtering also keeps this cell re-runnable
    # once merge_img.jpg has been written into the fund folders further down.
    pdf_files = [name for name in os.listdir(fund_path) if name.lower().endswith('.pdf')]

    image_list = []
    for file in pdf_files[:MAX_IMAGES_PER_FUND]:
        print(' - ' + file)
        # Render the PDF and keep its first page.
        file_img = convert_from_path(os.path.join(fund_path, file), fmt='jpeg')[0]
        # Convert the rendered page to an array in OpenCV's BGR channel order.
        file_img = cv2.cvtColor(np.array(file_img), cv2.COLOR_RGB2BGR)
        image_list.append(file_img)
    image_dict[fund_folder] = image_list

carlyle
 - 2021.09.30.CAB.Carlyle Property Investors.BMO Canada.pdf
silverstone
 - 2021.09.30.CAB.SilverStone II-D.Montauk.pdf
 - 2021.09.30.CAB.SilverStone II-K Class 2.SS UWF.pdf
 - 2021.09.30.CAB.SilverStone III Off.Viceroy.pdf
 - 2021.09.30.CAB.SilverStone Pebbles IV.SSOF IV.pdf
 - 2021.09.30.CAB.SilverStone II-E.SS UWF.pdf
 - 2021.09.30.CAB.SilverStone Balfour.SSOF IV Off.pdf
ag
 - 2021.09.30.CAB.AG Energy Credit Opps.CCCERA.pdf
 - 2021.09.30.CAB.AG Net Lease Realty IV.TransGlobe Life Insurance.pdf
 - 2021.09.30.CAB.AG Eur Realty III.CIRS.pdf
 - 2021.09.30.CAB.AG Commercial RED Opps.W VA.pdf
 - 2021.09.30.CAB.AG Realty VIII A.Texas Education Agency.pdf
 - 2021.09.30.CAB.AG Realty X.COH Beckman.pdf
bain
 - 2021.09.30.CAB.Bain XII.Midland.pdf
 - 2021.09.30.CAB.Bain IX CoInv.GESB.pdf
 - 2021.09.30.CAB.Bain X.Ferro.pdf
 - 2021.09.30.CAB.Bain VIII E.Midland.pdf
 - 2021.09.30.CAB.Bain Europe III.GESB.pdf
 - 2021.09.30.CAB.Bain VII.Viceroy.pdf
ss
 - 2021.09.30.CAB.SS AP B.Farley I.pdf
 -

In [94]:
# fund -> one overlay image built from that fund's rendered pages
combined_image_dict = {}
for fund, images in image_dict.items():
    if not images:
        # Nothing was rendered for this fund, so there is no overlay to build.
        continue
    img1 = images[0]
    for image in images[1:]:
        # Compare .shape, not .size: on a NumPy array .size is the total element
        # count, so two pages with different dimensions but the same number of
        # elements would pass the check and then fail (or mis-broadcast) on add.
        if image.shape == img1.shape:
            # cv2.add saturates at the dtype maximum, whereas `+` on integer
            # arrays wraps around (presumably uint8 here, i.e. modulo 256).
            img1 = cv2.add(img1, image)  # Image.blend(img1, image, 0.5)
    combined_image_dict[fund] = img1

In [95]:
# Save each fund's overlay image inside that fund's folder.
for fund, merged_image in combined_image_dict.items():
    file_dest = os.path.join(output_folder, fund)
    cv2.imwrite(os.path.join(file_dest, 'merge_img.jpg'), merged_image)