In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [16]:
import pandas as pd
import os

from PyPDF2 import PdfReader

#Text preprocessing£

## Text Extraction

In [6]:
def extract_text_from_pdfs(directory):
    pdf_texts = {}

    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            filepath = os.path.join(directory, filename)

            with open(filepath, 'rb') as file:
                pdf_reader = PdfReader(file)

                page_texts = []
                for page in pdf_reader.pages:
                    page_texts.append(page.extract_text())

                pdf_texts[filename[:-4]] = page_texts

    max_pages = max(len(pages) for pages in pdf_texts.values())

    df = pd.DataFrame(columns=pdf_texts.keys(), index=[f'Page {i+1}' for i in range(max_pages)])

    for filename, pages in pdf_texts.items():
        for i, text in enumerate(pages):
            df.at[f'Page {i+1}', filename] = text

    return df

In [7]:
pdf_reader = extract_text_from_pdfs("assets/benefits")

pdf_reader

Unnamed: 0,gym-policy,childcare-policy,health-insurance-policy,life-insurance-policy,vacation-policy,tuition-reimbursement-policy,work-from-home-policy,401k-retirement-policy
Page 1,TechLance Gym Membership Policy\nIntroduction\...,TechLance Childcare Support Policy\nIntroducti...,TechLance Health Insurance Policy\nIntroductio...,TechLance Life Insurance Policy\nIntroduction\...,TechLance Vacation Days Policy\nIntroduction\n...,TechLance Tuition Reimbursement Policy\nIntrod...,TechLance Work from Home Policy\nIntroduction\...,TechLance Retirement Plan (401k) Policy\nIntro...
Page 2,community centers. These partnerships provide ...,"incorporates STEM concepts, creative arts, and...",The HMO plan provides comprehensive coverage t...,medical requirements. The coverage amount adju...,"example, if you begin employment on July 1st, ...","To be eligible for tuition reimbursement, empl...",performed independently with digital tools and...,TechLance’s matching formula is designed to re...
Page 3,"In addition to traditional gym memberships, al...","programs, and summer camps. It does not typica...",The prescription drug coverage varies by plan ...,Tobacco users pay signiﬁcantly higher rates du...,"June, September, and December), product launch...","can receive up to $6,000 annually. Professiona...",For employees who don’t have formal remote wor...,the contribution amounts. You can opt out of a...
Page 4,"Once enrolled, you’ll receive a TechLance corp...",can care for children with minor illnesses who...,Some services require prior authorization from...,Beneﬁciary Designation s and Claims\nOne of th...,employees may choose to use vacation time for ...,additional justiﬁcation about how the educatio...,TechLance provides a comprehensive equipment p...,Investment performance and expense ratios are ...
Page 5,"insurance premiums by $25, while regular usage...","During parental leave, health insurance and ot...",platform can also prescribe medications when a...,convenience. The exam typically includes basic...,schedule re-entry meetings to help employees g...,These commitments begin upon successful comple...,Your home Wi-Fi network must use WPA3 encrypti...,The 401(k) plan allows loans for participants ...
Page 6,We also maintain corporate discount relationsh...,Eligibility and Enrollment\nAll full-time empl...,your coverage and consider whether your curren...,"Alternatively, you can continue your group cov...",employees with at least 30 days advance notice...,Reduced work schedules or job sharing arrangem...,Trial Periods and Ongoing Evaluation\nAll new ...,"information about plan changes, fee disclosure..."
Page 7,How much money can I save with corporate gym m...,Our Family Services coordinator in HR holds mo...,When can I change my health insurance plan? Yo...,How much life insurance do I actually need? A ...,and business needs.\nHow far in advance can I ...,business. We encourage employees to take advan...,Can I work from home occasionally without a fo...,We also provide guidance on coordinating your ...
Page 8,,provide resources for ﬁnding specialized care ...,,This policy is eﬀective immediately and supers...,,Can I appeal if my educational program is deni...,,Frequently Asked Question s\nHow much should I...
Page 9,,,,,,,,This policy is eﬀective immediately and supers...


## Text Preprocessing

In [25]:
def normalize_text(s):
    if s is None:
        return ''
    s = unicodedata.normalize('NFC',str(s)).strip()
    return re.sub(r"\s+", " ", s)


def chunk_text(text, chunk_size):
    text = normalize_text(text)
    if not text:
        return []

    words = text.split(" ")
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]


def build_chunks(df_long,chunk_size, doc_col, page_col, text_col):
    rows = []

    for doc_id, group in df_long.groupby(doc_col):
        groupe_sorted = group.sort_values(page_col)
        full_text = " ".join([normalize_text(t) for t in groupe_sorted[text_col] if pd.notna(t) ])
        chunks = chunk_text(full_text,chunk_size)
        for i, ch in enumerate(chunks):
            rows.append({
                'doc_id' : doc_id,
                'chunk_id' : i,
                'n_words': len(ch.split(" ")),
                'text' : ch
            })
    return pd.DataFrame(rows, columns=['doc_id', 'chunk_id', 'n_words', 'text'])


In [26]:
df_long = pdf_reader.reset_index().melt(id_vars=['index'], var_name='document', value_name='text')
df_long = df_long.rename(columns={'index': 'page'})
df_long['page'] = df_long['page'].str.replace('Page','').astype(int)

df_long.head()

Unnamed: 0,page,document,text
0,1,gym-policy,TechLance Gym Membership Policy\nIntroduction\...
1,2,gym-policy,community centers. These partnerships provide ...
2,3,gym-policy,"In addition to traditional gym memberships, al..."
3,4,gym-policy,"Once enrolled, you’ll receive a TechLance corp..."
4,5,gym-policy,"insurance premiums by $25, while regular usage..."


In [29]:
df_chunks = build_chunks(df_long,200,'document','page','text')

df_chunks.head(15)

Unnamed: 0,doc_id,chunk_id,n_words,text
0,401k-retirement-policy,0,200,TechLance Retirement Plan (401k) Policy Introd...
1,401k-retirement-policy,1,200,can contribute between 1% and 100% of their sa...
2,401k-retirement-policy,2,200,"always 100% vested in your own contributions, ..."
3,401k-retirement-policy,3,200,of automatic escalation or adjust the increase...
4,401k-retirement-policy,4,200,"mid-cap and small-cap funds, international dev..."
5,401k-retirement-policy,5,200,"no immediate tax beneﬁt, but qualiﬁed withdraw..."
6,401k-retirement-policy,6,200,it can signiﬁcantly impact your long-term reti...
7,401k-retirement-policy,7,200,"taxes and early withdrawal penalties, and you’..."
8,401k-retirement-policy,8,200,"balance exceeds $5,000, roll it over to a new ..."
9,401k-retirement-policy,9,200,"as IRAs, health savings accounts, and taxable ..."


In [31]:
# Check number of chunk per document 
print(df_chunks.groupby('doc_id').size())

doc_id
401k-retirement-policy          13
childcare-policy                11
gym-policy                      11
health-insurance-policy         11
life-insurance-policy           12
tuition-reimbursement-policy    11
vacation-policy                 10
work-from-home-policy           11
dtype: int64


In [30]:
# Save Dataframe into csv file 
df_chunks.to_csv('assets/benefits/policy_chunks.csv', index=False)