In [1]:
%pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


1. Extract and Inspect the Raw Text

In [2]:
import pdfplumber

pdf_path = "sodapdf-converted.pdf"
all_text = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            all_text.append(text)
raw_text = "\n".join(all_text)
print(raw_text[:2000])  # Inspect the first 2000 chars


1. HDFC Diners Club Black Metal Edition
HDFC Bank’s Diners Club Black Metal Edition carries a INR10,000 joining fee and INR10,000 annual fee
(waivable), requiring an annual income of INR21 lakh. It is a points-based card offering 5 points per
INR150 spent (3.33% value) with 10X SmartBuy multipliers and 2X weekend dining. Points redeem at
INR1 per point for travel bookings or merchandise. Key features include unlimited domestic/international
lounge access for primary and add-on members, 6 complimentary golf games quarterly, low forex markup
of 2%, and complimentary Club Marriott, Amazon, and Swiggy One memberships. Interest rate: ~3.49%
p.m.
2. Axis Bank Reserve Credit Card
The Axis Bank Reserve demands INR50,000 for both joining and annual fees, targeting individuals with
≥INR1.2 crore per annum income. It awards 30 EDGE points per INR200 spent (3% value), doubling to
60 points on international spends. Points redeem at INR1 = 1 EDGE point for statement credit or travel.
Benefits includ

2. Split Into Card Entries

In [3]:
import re

# Split on lines like "1. Card Name"
entries = re.split(r"\n\d+\.\s+", raw_text)
entries = [e.strip() for e in entries if e.strip()]
print(f"Found {len(entries)} card entries.")


Found 50 card entries.


3. Parsing and Cleaning Each Entry

We'll extract:

Card Name

Issuer (from card name or text)

Joining Fee, Annual Fee (handle "waivable", missing, etc.)

Income Requirement (handle lakh/crore and "≥" sign)

Reward Description (e.g., "3 Avios per 100 (3% value)", "2X on spends across categories")

Reward Rate (extract numeric % value if present)

Interest Rate

Key Features (everything else)

In [4]:
import pdfplumber
import re
import pandas as pd
import numpy as np

pdf_path = "sodapdf-converted.pdf"

# Extract all text from PDF
with pdfplumber.open(pdf_path) as pdf:
    all_text = []
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            all_text.append(text)
raw_text = "\n".join(all_text)

# Split into card entries (by number-dot-space pattern)
entries = re.split(r"\n\s*\d+\.\s+", raw_text)
entries = [e.strip() for e in entries if e.strip()]

def clean_numeric(val):
    if pd.isnull(val) or val == '' or val is None:
        return np.nan
    val_str = str(val).lower().replace(',', '').replace('inr', '').replace('₹', '').replace('%', '').replace('≥', '').replace('>', '').replace('+', '').strip()
    # Handle lakh/crore
    if any(x in val_str for x in ['lakh', 'lac', 'lacs', 'l']):
        match = re.search(r'([\d\.]+)', val_str)
        if match:
            return float(match.group(1)) * 100000
    elif any(x in val_str for x in ['crore', 'cr']):
        match = re.search(r'([\d\.]+)', val_str)
        if match:
            return float(match.group(1)) * 10000000
    else:
        try:
            return float(val_str)
        except:
            return np.nan

def extract_reward_rate(desc):
    # Extract first % value in reward description
    match = re.search(r'(\d+(\.\d+)?)\s*%', desc)
    return float(match.group(1)) if match else np.nan

def extract_issuer(card_name):
    for bank in ['HDFC', 'Axis', 'ICICI', 'SBI', 'Standard Chartered', 'YES', 'IDFC', 'AU', 'RBL', 'Kotak', 'Federal', 'HSBC', 'Canara', 'PNB', 'Bank of Baroda', 'Bank of India', 'IndusInd', 'Union Bank', 'Citi', 'Yatra', 'Amazon', 'Flipkart', 'Myntra', 'Swiggy', 'MakeMyTrip', 'EazyDiner', 'Tata', 'India Oil', 'BPCL', 'Ixigo']:
        if bank in card_name:
            return bank + (' Bank' if 'Bank' not in bank and bank not in ['Citi', 'Yatra', 'Amazon', 'Flipkart', 'Myntra', 'Swiggy', 'MakeMyTrip', 'EazyDiner', 'Tata', 'India Oil', 'BPCL', 'Ixigo'] else '')
    return ''

parsed_data = []
for entry in entries:
    lines = entry.split('\n')
    card_name = lines[0].strip() if lines else ''
    issuer = extract_issuer(card_name)
    
    # Joining Fee (various patterns)
    joining_fee = None
    annual_fee = None
    
    # Pattern 1: "joining fee and annual fee"
    match = re.search(r'joining fee[s]?\s*(?:of|:)?\s*inr?([\d,\.]+).*annual fee[s]?\s*(?:of|:)?\s*inr?([\d,\.]+)', entry, re.IGNORECASE)
    if match:
        joining_fee, annual_fee = match.group(1), match.group(2)
    else:
        # Pattern 2: "demands INR50,000 for both joining and annual fees"
        match = re.search(r'demands\s*inr?([\d,\.]+)\s*for both joining and annual fee[s]?', entry, re.IGNORECASE)
        if match:
            joining_fee = annual_fee = match.group(1)
        else:
            # Pattern 3: "levies INR12,500 + GST annual fee"
            match = re.search(r'levies\s*inr?([\d,\.]+)', entry, re.IGNORECASE)
            if match:
                annual_fee = match.group(1)
            # Pattern 4: "charges INR12,499 for joining and renewal"
            match = re.search(r'charges\s*inr?([\d,\.]+)\s*for joining and renewal', entry, re.IGNORECASE)
            if match:
                joining_fee = annual_fee = match.group(1)
            # Pattern 5: "joining/annual fees"
            match = re.search(r'joining/annual fee[s]?\s*(?:of|:)?\s*inr?([\d,\.]+)', entry, re.IGNORECASE)
            if match:
                joining_fee = annual_fee = match.group(1)
            # Pattern 6: "annual fee of INRxxxx"
            match = re.search(r'annual fee[s]?\s*(?:of|:)?\s*inr?([\d,\.]+)', entry, re.IGNORECASE)
            if match and not annual_fee:
                annual_fee = match.group(1)
            # Pattern 7: "joining fee of INRxxxx"
            match = re.search(r'joining fee[s]?\s*(?:of|:)?\s*inr?([\d,\.]+)', entry, re.IGNORECASE)
            if match and not joining_fee:
                joining_fee = match.group(1)
            # Pattern 8: "INRxxxx joining/annual fees"
            match = re.search(r'inr?([\d,\.]+)\s*joining/annual fee[s]?', entry, re.IGNORECASE)
            if match and not joining_fee and not annual_fee:
                joining_fee = annual_fee = match.group(1)
            # Pattern 9: "INRxxxx joining, INRxxxx annual"
            match = re.search(r'inr?([\d,\.]+)\s*joining[^\d]+inr?([\d,\.]+)\s*annual', entry, re.IGNORECASE)
            if match:
                joining_fee, annual_fee = match.group(1), match.group(2)
    
    # Income requirement
    income_req = None
    match = re.search(r'(?:income requirement|income of|income)[^\d]*(≥|>|~)?\s*inr?([\d,\.]+)\s*(lakh|crore|cr|l)?', entry, re.IGNORECASE)
    if match:
        num, unit = match.group(2), match.group(3)
        income_req = f"{num} {unit}" if unit else num
    else:
        match = re.search(r'inr?([\d,\.]+)\s*(lakh|crore|cr|l)?\s*income', entry, re.IGNORECASE)
        if match:
            num, unit = match.group(1), match.group(2)
            income_req = f"{num} {unit}" if unit else num
    
    # Reward Description: first sentence with "reward", "cashback", "mile", "point", "savings", "back", "per"
    reward_desc = ''
    for l in lines:
        if any(x in l.lower() for x in ['reward', 'cashback', 'mile', 'point', 'savings', 'back', 'per']):
            reward_desc = l.strip()
            break
    # Fallback: any line with a % or X
    if not reward_desc:
        reward_desc = next((l for l in lines if '%' in l or 'X' in l or 'per' in l), '')
    
    reward_rate = extract_reward_rate(reward_desc)
    
    # Interest rate
    interest_rate = None
    match = re.search(r'interest rate[:\s~]*([\d\.]+)%', entry, re.IGNORECASE)
    if match:
        interest_rate = match.group(1)
    
    # Key features: concatenate all lines except those with fees, income, or interest rate
    feature_lines = []
    for l in lines[1:]:
        if not any(x in l.lower() for x in ['joining fee', 'annual fee', 'income', 'interest rate']):
            feature_lines.append(l)
    key_features = ' | '.join(feature_lines)
    
    parsed_data.append({
        'Card Name': card_name,
        'Issuer': issuer,
        'Joining Fee': clean_numeric(joining_fee),
        'Annual Fee': clean_numeric(annual_fee),
        'Eligibility (Income)': clean_numeric(income_req),
        'Reward Description': reward_desc,
        'Reward Rate (%)': reward_rate,
        'Interest Rate (p.m.)': clean_numeric(interest_rate),
        'Key Features': key_features
    })

df = pd.DataFrame(parsed_data)
print(df.head(10))
df.to_csv("model/credit_card_data_cleaned.csv", index=False)
print("Saved cleaned data to model/credit_card_data_cleaned.csv")


                                    Card Name                   Issuer  \
0     1. HDFC Diners Club Black Metal Edition                HDFC Bank   
1               Axis Bank Reserve Credit Card                Axis Bank   
2                     HDFC Infinia Metal Card                HDFC Bank   
3    ICICI Emeralde Private Metal Credit Card               ICICI Bank   
4     Standard Chartered Ultimate Credit Card  Standard Chartered Bank   
5                      Axis Atlas Credit Card                Axis Bank   
6                  HSBC TravelOne Credit Card                HSBC Bank   
7  MakeMyTrip ICICI Bank Platinum Credit Card               ICICI Bank   
8                              Yatra SBI Card                 SBI Bank   
9   Standard Chartered EaseMyTrip Credit Card  Standard Chartered Bank   

   Joining Fee  Annual Fee  Eligibility (Income)  \
0      10000.0     10000.0             2100000.0   
1      50000.0     50000.0                   NaN   
2          NaN     12500.0   

4. Save Cleaned Data

In [5]:

# Now save the cleaned DataFrame
df.to_csv('model/credit_card_data_cleaned.csv', index=False)

print("Cleaned data saved.")


Cleaned data saved.


In [11]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
import os

# Load cleaned data
df = pd.read_csv('model/credit_card_data_cleaned.csv')

# 1. Clean and normalize numeric features
def clean_numeric(val):
    if pd.isnull(val) or val == '':
        return np.nan
    val_str = str(val).lower().replace('₹', '').replace('inr', '').replace(',', '').replace('%', '').replace('≥', '').strip()
    # Handle lakh/crore
    if any(x in val_str for x in ['lakh', 'lac', 'lacs', 'l']):
        match = re.search(r'([\d\.]+)', val_str)
        if match:
            return float(match.group(1)) * 100000
    elif any(x in val_str for x in ['crore', 'cr']):
        match = re.search(r'([\d\.]+)', val_str)
        if match:
            return float(match.group(1)) * 10000000
    else:
        try:
            return float(val_str)
        except:
            return np.nan

# Apply cleaning to numeric columns
numeric_cols = ['Joining Fee', 'Annual Fee', 'Eligibility (Income)', 'Reward Rate (%)', 'Interest Rate (p.m.)']
for col in numeric_cols:
    df[col] = df[col].apply(clean_numeric)
    
# Fill missing values with column mean
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Normalize numeric features
scaler = MinMaxScaler()
normalized_numeric = scaler.fit_transform(df[numeric_cols])

# 2. Generate text embeddings
text_fields = (
    df['Card Name'].astype(str) + ' ' +
    df['Reward Description'].astype(str) + ' ' +
    df['Key Features'].astype(str)
).tolist()

embedder = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = embedder.encode(text_fields, show_progress_bar=True)

# 3. Combine text embeddings and normalized numeric features
hybrid_embeddings = np.hstack([text_embeddings, normalized_numeric])

# 4. Save everything
os.makedirs('model', exist_ok=True)
np.save('model/credit_card_hybrid_embeddings.npy', hybrid_embeddings)
joblib.dump(embedder, 'model/credit_card_embedder.joblib')
joblib.dump(scaler, 'model/credit_card_scaler.joblib')
df.to_csv('model/credit_card_data_final.csv', index=False)

print("Hybrid embeddings and models saved successfully!")


Batches: 100%|██████████| 2/2 [00:00<00:00,  5.02it/s]


Hybrid embeddings and models saved successfully!


In [12]:
print(f"Hybrid embedding shape: {hybrid_embeddings.shape}")

Hybrid embedding shape: (50, 389)
