In [10]:
pip install pdfplumber pyspark openai

Collecting openai
  Downloading openai-1.55.1-py3-none-any.whl.metadata (24 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.6.2.post1-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.7.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.2-py3-none-any.whl.metadata (170 kB)
Collecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting tqdm>4 (from openai)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting idna>=2.8 (from anyio<5,>=3.5.0->openai)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting certifi (from httpx<1,>=0.23.0->openai)
  Using cached certifi-

In [11]:
import pdfplumber
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Start (or reuse) the Spark session that backs this notebook
spark = (
    SparkSession.builder
    .appName("PDFToSpark")
    .getOrCreate()
)

# Column layout of the parsed credit-card rows; every column is nullable
schema = (
    StructType()
    .add("Card Name", StringType(), True)
    .add("Bank", StringType(), True)
    # The fee stays a string so that the literal "No annual fee" fits
    .add("Annual Fee", StringType(), True)
    .add("Purchase Interest Rate (%)", FloatType(), True)
    .add("Rewards", StringType(), True)
)

def _parse_card_lines(lines):
    """
    Turn the text lines of one PDF page into credit-card tuples.

    A line containing "%" whose text before the first "%" parses as a number is
    treated as the purchase interest rate of a card. Relative to that line
    (index i) the layout heuristic is:
      * lines[i - 3] -> card name
      * lines[i - 2] -> bank
      * lines[i + 1] -> annual fee (first space-separated token, when the line
                        mentions "fee"; otherwise "No annual fee")
      * lines[i + 2:] -> consecutive "label: value" lines, collected as rewards

    Returns a list of (card_name, bank, annual_fee, interest_rate, rewards)
    tuples; rewards is None when no reward lines follow.
    """
    records = []
    for i, line in enumerate(lines):
        if "%" not in line:
            continue

        # The name/bank live 3 and 2 lines above the rate. With fewer than three
        # preceding lines a negative index would silently wrap around to the end
        # of the page and fabricate a record, so skip explicitly instead.
        if i < 3:
            print(
                "Skipping line due to parsing error: "
                f"not enough preceding lines for card name/bank (line {i})"
            )
            continue

        try:
            interest_rate = float(line.split("%")[0].strip())
        except ValueError as e:
            # The "%" belongs to ordinary text (e.g. a rewards blurb), not a rate.
            print(f"Skipping line due to parsing error: {e}")
            continue

        card_name = lines[i - 3].strip()
        bank = lines[i - 2].strip()

        # A rate on the last line of a page has no fee line after it; treat
        # that the same as a following line that does not mention a fee.
        next_line = lines[i + 1] if i + 1 < len(lines) else ""
        if "fee" in next_line.lower():
            annual_fee = next_line.split(" ")[0].strip()
        else:
            annual_fee = "No annual fee"

        # Collect the run of "label: value" lines that follows the fee line.
        # Split on the first colon only so values containing ":" stay intact.
        rewards_lines = []
        j = i + 2
        while j < len(lines) and ":" in lines[j]:
            rewards_lines.append(lines[j].split(":", 1)[1].strip())
            j += 1
        rewards = ", ".join(rewards_lines) if rewards_lines else None

        records.append((card_name, bank, annual_fee, interest_rate, rewards))
    return records


def extract_pdf_data(file_path):
    """
    Extracts data from the credit card PDF and structures it for Spark DataFrame.

    Args:
        file_path: Path of the PDF to open with pdfplumber.

    Returns:
        A list of (card_name, bank, annual_fee, interest_rate, rewards) tuples,
        one per interest-rate line recognised on any page, in page order.
    """
    data = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Pages without extractable text may yield None or ""; treat both
            # as an empty page instead of crashing on .split().
            text = page.extract_text() or ""
            data.extend(_parse_card_lines(text.split("\n")))
    return data

import os

# Path to the PDF file. Set the CREDIT_CARD_PDF_PATH environment variable to
# point at your own copy; the fallback is the original author's local path.
pdf_path = os.environ.get(
    "CREDIT_CARD_PDF_PATH",
    "/Users/aaryas127/Documents/GitHub/credit_card_reward_maximizer/SearchCreditCard-eng.pdf",
)
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(
        f"Credit card PDF not found at {pdf_path!r}; "
        "set CREDIT_CARD_PDF_PATH to the file's location."
    )

# Extract data from the PDF
parsed_data = extract_pdf_data(pdf_path)

# Create a Spark DataFrame from the parsed data (explicit schema, so an empty
# result still yields a well-typed, empty DataFrame)
df = spark.createDataFrame(parsed_data, schema=schema)

# Show the DataFrame without truncating long card/bank names
df.show(truncate=False)


+------------------------------------------------------------+-----------------------------------------------+-------------+--------------------------+-------+
|Card Name                                                   |Bank                                           |Annual Fee   |Purchase Interest Rate (%)|Rewards|
+------------------------------------------------------------+-----------------------------------------------+-------------+--------------------------+-------+
|Desjardins Flexi Visa                                       |Desjardins                                     |No annual fee|10.9                      |NULL   |
|MBNA True Line® Mastercard®                                 |MBNA is a division of The Toronto-Dominion Bank|No annual fee|12.99                     |NULL   |
|Coast Capital Collabria Classic Mastercard                  |Coast Capital Savings                          |No annual fee|13.99                     |NULL   |
|Tangerine Money-Back Credit Card       

In [12]:
# df.first() returns None when the DataFrame is empty (e.g. the PDF heuristic
# matched nothing), so guard before indexing into the row.
first_row = df.first()
first_row["Card Name"] if first_row is not None else None

'Desjardins Flexi Visa'

[REDACTED: NVIDIA API key removed — this credential was exposed and must be rotated; never store secrets in notebook cells or outputs]

In [17]:
import os

from openai import OpenAI

# Never hard-code credentials in a notebook: read the NVIDIA API key from the
# environment and fail with a clear message when it is missing.
api_key = os.environ.get("NVIDIA_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the NVIDIA_API_KEY environment variable before running this cell."
    )

# NVIDIA's hosted endpoint speaks the OpenAI-compatible API
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=api_key,
)

# Ask the model about one card and stream the answer as it is generated
completion = client.chat.completions.create(
    model="meta/llama-3.1-405b-instruct",
    messages=[{"role": "user", "content": "Can you give me information on Desjardins Flexi Visa"}],
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024,
    stream=True,
)

for chunk in completion:
    # Some stream chunks may carry no choices; skip them rather than raising
    # IndexError on choices[0].
    if chunk.choices and chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")

The Desjardins Flexi Visa is a credit card offered by Desjardins, a Canadian financial cooperative. Here are some key features and benefits of the card:

**Key Features:**

1. **Variable credit limit**: The credit limit is adjusted based on your payment history and credit score.
2. **Flexible payment options**: You can choose to pay a fixed amount, a percentage of the balance, or the minimum payment.
3. **Low interest rate**: The card offers a competitive interest rate, which is currently 10.9% (may vary depending on your credit score and market conditions).
4. **No annual fee**: There is no annual fee for the primary cardholder.
5. **Additional card benefits**: The card offers purchase protection, extended warranty, and travel insurance.

**Benefits:**

1. **Rewards program**: Earn 1% cashback on all purchases, with no rotating categories or spending limits.
2. **Travel insurance**: Get coverage for trip cancellations, interruptions, and delays, as well as medical and hospital expense

24/11/26 17:02:34 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 271300 ms exceeds timeout 120000 ms
24/11/26 17:02:34 WARN SparkContext: Killing executors is not supported by current scheduler.
24/11/26 17:02:36 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$