In [26]:
pip install pdfplumber pyspark

Note: you may need to restart the kernel to use updated packages.


In [27]:
import os

import pdfplumber
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Obtain the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("PDFToSpark")
    .getOrCreate()
)

# Column layout of the Spark DataFrame; every column is declared nullable
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("Card Name", StringType()),
            ("Bank", StringType()),
            ("Annual Fee", StringType()),  # Stays a string so "No annual fee" fits
            ("Purchase Interest Rate (%)", FloatType()),
            ("Rewards", StringType()),
        )
    ]
)

def _parse_card_lines(lines):
    """
    Parses the text lines of one PDF page into credit card records.

    Any line containing "%" is treated as the interest-rate line of a card.
    Relative to that line (index i) the parser reads:
      - lines[i - 3] as the card name and lines[i - 2] as the bank,
      - lines[i + 1] as the annual fee line (only if it mentions "fee",
        otherwise the fee is recorded as "No annual fee"),
      - lines[i + 2] onwards, while they contain ":", as reward entries.
    NOTE(review): these offsets are a heuristic (sample parsing logic); confirm
    them against the actual PDF layout.

    Returns:
        A list of (card_name, bank, annual_fee, interest_rate, rewards) tuples,
        where rewards is a comma-separated string or None.
    """
    records = []
    for i, line in enumerate(lines):
        if "%" not in line:  # Heuristic: look for interest rate
            continue

        if i < 3:
            # Negative indexes would silently wrap around and pick the card
            # name/bank from the END of the page, producing a bogus record.
            print("Skipping line due to parsing error: not enough preceding lines for card name and bank")
            continue

        try:
            card_name = lines[i - 3].strip()
            bank = lines[i - 2].strip()
            # Raises IndexError when the "%" line is the last line on the page.
            fee_line = lines[i + 1]
            annual_fee = fee_line.split(" ")[0].strip() if "fee" in fee_line.lower() else "No annual fee"
            # Raises ValueError when the text before "%" is not a plain number.
            interest_rate = float(line.split("%")[0].strip())
        except (IndexError, ValueError) as e:
            print(f"Skipping line due to parsing error: {e}")
            continue

        # Collect rewards from the consecutive "label: value" lines that follow.
        rewards_lines = []
        for following in lines[i + 2:]:
            if ":" not in following:
                break
            # maxsplit=1 keeps values that themselves contain ":" intact.
            rewards_lines.append(following.split(":", 1)[1].strip())

        rewards = ", ".join(rewards_lines) if rewards_lines else None
        records.append((card_name, bank, annual_fee, interest_rate, rewards))
    return records


def extract_pdf_data(file_path):
    """
    Extracts data from the credit card PDF and structures it for Spark DataFrame.

    Args:
        file_path: Path of the PDF file, passed to pdfplumber.open().

    Returns:
        A list of (card_name, bank, annual_fee, interest_rate, rewards) tuples,
        one per successfully parsed card. Lines that cannot be parsed are
        reported with a printed message and skipped.
    """
    data = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # A page without extractable text may yield None or ""; treat both
            # as an empty page instead of crashing on None.split().
            text = page.extract_text() or ""
            if not text:
                continue
            # Pages are parsed independently, so a record never borrows lines
            # from a neighbouring page.
            data.extend(_parse_card_lines(text.split("\n")))
    return data

# Path to the PDF file. Set the CREDIT_CARD_PDF_PATH environment variable to
# point at another copy of the PDF; the original local path stays the default.
pdf_path = os.environ.get(
    "CREDIT_CARD_PDF_PATH",
    "/Users/aaryas127/Documents/GitHub/credit_card_reward_maximizer/SearchCreditCard-eng.pdf",
)

# Extract data from the PDF
parsed_data = extract_pdf_data(pdf_path)

# Create a Spark DataFrame from the parsed data
df = spark.createDataFrame(parsed_data, schema=schema)

# Show the DataFrame
df.show(truncate=False)


+------------------------------------------------------------+-----------------------------------------------+-------------+--------------------------+-------+
|Card Name                                                   |Bank                                           |Annual Fee   |Purchase Interest Rate (%)|Rewards|
+------------------------------------------------------------+-----------------------------------------------+-------------+--------------------------+-------+
|Desjardins Flexi Visa                                       |Desjardins                                     |No annual fee|10.9                      |NULL   |
|MBNA True Line® Mastercard®                                 |MBNA is a division of The Toronto-Dominion Bank|No annual fee|12.99                     |NULL   |
|Coast Capital Collabria Classic Mastercard                  |Coast Capital Savings                          |No annual fee|13.99                     |NULL   |
|Tangerine Money-Back Credit Card       