In [135]:
# Process one pdf:

In [136]:
import pdfplumber
import csv
import re
import os

In [137]:
def find_sentences(text, keyword):
    """Return every sentence of ``text`` that contains ``keyword`` as a whole word.

    A "sentence" here is a run of characters without a period, ending either
    at the next period (included) or at the end of the text. Matching is
    case-insensitive.

    Args:
        text: The text to search.
        keyword: The word or phrase to look for; it is escaped, so regex
            metacharacters in it are taken literally.

    Returns:
        The matching sentences joined with newlines, or '' if there are none.
    """
    # Use word boundaries around the keyword to match it as a whole word.
    # The terminator is a period OR end-of-text, so a trailing sentence or
    # bullet point that lacks a final period is still reported.
    pattern = r'[^.]*?\b' + re.escape(keyword) + r'\b[^.]*(?:\.|\Z)'
    sentences = re.findall(pattern, text, re.IGNORECASE)
    return "\n".join(sentences)

In [138]:
def extract_crn(text):
    """Return the first standalone 5-digit number found in ``text``.

    "Standalone" means the five digits are delimited by word boundaries on
    both sides. Returns '' when no such number exists.
    """
    found = re.search(r'\b\d{5}\b', text)
    if found is None:
        return ''
    return found.group(0)

In [139]:
def extract_course(text):
    """Return the first course code found in ``text``, or '' if there is none.

    A course code is three letters, then any mix of whitespace, dashes and
    parentheses (possibly none), then four digits. Parentheses are removed
    from the result and surrounding whitespace is trimmed.
    """
    found = re.search(r'\b[A-Za-z]{3}[\s\-()]*\d{4}\b', text)
    if not found:
        return ''

    code = found.group(0)
    # Drop both kinds of parenthesis but keep spaces and dashes as written
    for paren in ('(', ')'):
        code = code.replace(paren, '')
    return code.strip()

In [140]:
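### Combining all of the above into one function
## Input  -> path to one PDF, the list of keywords, and the path to the CSV file
## Output -> appends one row per keyword to the CSV file and returns its path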

In [141]:
def process_pdf(pdf_path, keywords, csv_path):
    """Scan one PDF for keywords and append one CSV row per keyword.

    Args:
        pdf_path: Path to the PDF file. The course code and CRN are parsed
            from its file name.
        keywords: Iterable of keyword strings to look for.
        csv_path: CSV file to append to. It is opened in append mode and no
            header row is written here.

    Returns:
        ``csv_path``, unchanged.
    """
    # Open the PDF file and gather the text of every page
    with pdfplumber.open(pdf_path) as pdf:
        # `or ''` guards against a page yielding no text (None in some
        # pdfplumber versions); the newline separator stops the last word of
        # one page from fusing with the first word of the next
        text = '\n'.join(page.extract_text() or '' for page in pdf.pages)

    # Parse the identifiers from the file name only, so that digits or
    # letter/digit runs in parent directory names cannot be mistaken for them
    file_name = os.path.basename(pdf_path)
    crn = extract_crn(file_name)
    course = extract_course(file_name)

    # Lower-case the document once rather than once per keyword
    text_lower = text.lower()

    with open(csv_path, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['course', 'crn', 'keyword', 'isKeywordPresent', 'sentence'])

        # Iterate through each keyword and check if it's in the text
        for keyword in keywords:
            # Plain substring test: it is also True when the keyword is only
            # part of a longer word, in which case the whole-word sentence
            # search below can legitimately come back empty
            is_present = keyword.lower() in text_lower
            sentences = find_sentences(text, keyword) if is_present else ''
            writer.writerow({
                'course': course,
                'crn': crn,
                'keyword': keyword,
                'isKeywordPresent': is_present,
                'sentence': sentences
            })

    return csv_path
    

In [142]:
## Process just one pdf

pdf_path = '/Users/manoh/Downloads/ALY6070.21136.202325.pdf'
csv_path = 'keywords_check.csv'
# Each keyword yields one CSV row per PDF, so the list is kept free of
# duplicates (a repeated entry would produce a repeated row).
keywords = [
        'Climate', 'Ecologic', 'Ecological', 'Environment', 'Environmental', 'Environmentalism',
        'Fossil Fuel', 'Global Warming', 'Natural', 'Nature', 'Resilience', 'Resilient',
        'Sustainability', 'Sustainable', 'Energy', 'Renewable', 'Solar', 'Wind',
        'Alternative Transportation', 'Biodiversity', 'Conservation',
        'Consumption', 'Contamination', 'Deforestation', 'Eco-conscious', 'Ecoliteracy',
        'Ecosystem', 'Green building', 'Greenhouse', 'Land management', 'Marine', 'Native species',
        'Pollution', 'Preservation', 'Recycling', 'Waste', 'Water', 'Wildlife', 'Land Use',
        'Social Justice', 'Disparities', 'Equality', 'Equitable', 'Food security', 'Food system',
        'Food waste', 'Human rights', 'Hunger', 'Inequalities', 'Inequity', 'Poverty', 'Racial',
        'Racism', 'Reproductive rights', 'Social change', 'Justice'
    ]


process_pdf(pdf_path,keywords,csv_path)

'keywords_check.csv'

In [146]:
import os
import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError

def process_all_pdfs(folder_path, keywords, csv_path):
    """Run ``process_pdf`` on every PDF found under ``folder_path``.

    The folder is walked recursively and a file counts as a PDF when its
    lower-cased name ends with '.pdf'. If ``csv_path`` does not exist or is
    empty, it is created with just the header row before any PDF is handled.
    A PDF that raises ``PDFSyntaxError`` is reported and skipped; any other
    exception propagates.

    Args:
        folder_path: Root directory to search.
        keywords: Keywords passed through to ``process_pdf``.
        csv_path: CSV file passed through to ``process_pdf``.
    """
    columns = ['course', 'crn', 'keyword', 'isKeywordPresent', 'sentence']

    # Only a missing or zero-byte CSV gets a header; existing data is kept
    csv_has_content = os.path.exists(csv_path) and os.stat(csv_path).st_size != 0
    if not csv_has_content:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as header_file:
            csv.DictWriter(header_file, fieldnames=columns).writeheader()

    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            if not name.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(current_dir, name)
            try:
                print(f"Processing {pdf_path}...")
                process_pdf(pdf_path, keywords, csv_path)
            except PDFSyntaxError as e:
                print(f"Error processing {pdf_path}: {e}. Skipping this file.")

In [148]:
folder_path = '/Users/manoh/Downloads/OneDrive_3_2-14-2024'
# Each keyword yields one CSV row per PDF, so the list is kept free of
# duplicates (a repeated entry would produce a repeated row).
keywords = [
        'Climate', 'Ecologic', 'Ecological', 'Environment', 'Environmental', 'Environmentalism',
        'Fossil Fuel', 'Global Warming', 'Natural', 'Nature', 'Resilience', 'Resilient',
        'Sustainability', 'Sustainable', 'Energy', 'Renewable', 'Solar', 'Wind',
        'Alternative Transportation', 'Biodiversity', 'Conservation',
        'Consumption', 'Contamination', 'Deforestation', 'Eco-conscious', 'Ecoliteracy',
        'Ecosystem', 'Green building', 'Greenhouse', 'Land management', 'Marine', 'Native species',
        'Pollution', 'Preservation', 'Recycling', 'Waste', 'Water', 'Wildlife', 'Land Use',
        'Social Justice', 'Disparities', 'Equality', 'Equitable', 'Food security', 'Food system',
        'Food waste', 'Human rights', 'Hunger', 'Inequalities', 'Inequity', 'Poverty', 'Racial',
        'Racism', 'Reproductive rights', 'Social change', 'Justice'
    ]
csv_path = 'keywords_check.csv'

# Start from a fresh CSV that contains only the header row; mode='w'
# discards whatever an earlier run left in the file
with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['course', 'crn', 'keyword', 'isKeywordPresent', 'sentence'])
    writer.writeheader()

process_all_pdfs(folder_path, keywords, csv_path)

Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/EDU7207.20565.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/LDR7980.20197.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/DGM6168.21815.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/CED6910.20405.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/GST6504.21046.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/GST6550.20579.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/CMN6000.21066.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/CED6010.20451.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/EDU6102.20598.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/PJM6205.20839.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/LDR6100.20064.202425.pdf...
Processing /Users/manoh/Downloads/OneDrive_3_2-14-2024/RGA6463.21702.202425.pdf...
Proc