In [18]:
import spacy
from spacy.language import Language
import pandas as pd
import re

In [None]:
# transaction types
TRANSACTION_TYPES = {
    'NEFT', 'RTGS', 'POS', 'ACH', 'IMPS', 'UPI',
    'NACH', 'FT', 'DD', 'ECS', 'AEPS', 'SWIFT'
}

try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Downloading spaCy model 'en_core_web_lg'...")
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

# pre-compile regex patterns
ACCOUNT_NO_PATTERN = re.compile(r'([A-Z]{4,}\d{6,}\w*)')
ALPHA_NUMERIC_WORD_PATTERN = re.compile(r'\b(?=\w*[A-Za-z])(?=\w*\d)\w+\b')
TRANSACTION_TYPE_PATTERN = re.compile(r'\b(?:' + '|'.join(TRANSACTION_TYPES) + r')\b', re.IGNORECASE)
NON_ALPHA_SPACE_PATTERN = re.compile(r'[^A-Za-z\s]')
MONTHS_REMOVE = re.compile(r'(?i)\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|sept?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b')

STOP_WORDS = {
    'CR', 'DR', 'BY', 'TO', 'FROM', 'TRANSFER', 'PAYMENT',
    'CREDIT', 'DEBIT', 'THROUGH', 'VIA', 'TRANSACTION', 'CHQ', 
    'ADV', 'CHEQUE', 'DEPOSIT', 'OUTWARD', 'INWARD'
}

def should_process(text):
    """Check if text contains any of the transaction types"""
    if pd.isna(text) or not isinstance(text, str):
        return False

    return any(trans_type in text.upper() for trans_type in TRANSACTION_TYPES)

def extract_after_account_no(text):
    """
    Detect account number and return everything after it.
    """
    match = ACCOUNT_NO_PATTERN.search(text)
    if match:
        return text[match.end():].strip()
    return text

def clean_text(text):
    """Clean bank statement description text using regex"""
    if not should_process(text):
        return ""

    # 1. Extract everything after the account number
    processed_text = extract_after_account_no(text)

    # 2. Remove any group that contains both letters and digits (alphanumeric words)
    processed_text = ALPHA_NUMERIC_WORD_PATTERN.sub('', processed_text)

    # 3. Remove transaction type keywords (case-insensitive, as whole words)
    processed_text = TRANSACTION_TYPE_PATTERN.sub('', processed_text)

    # 4. Remove abbreviated months and dates
    processed_text = MONTHS_REMOVE.sub('', processed_text)

    # 5. Remove special characters except alphabets and spaces, convert to uppercase, and collapse spaces
    processed_text = NON_ALPHA_SPACE_PATTERN.sub(' ', processed_text)
    processed_text = processed_text.upper()
    processed_text = ' '.join(processed_text.split())

    # 6. Remove common transaction-related words
    words = [w for w in processed_text.split() if w not in STOP_WORDS]

    return ' '.join(words)

def extract_entities(text):
    """Extract entities from text using spaCy's built-in NER"""
    if not text:  # Skip empty strings
        return []

    doc = nlp(text)
    orgs = [(ent.text, ent.label_) for ent in doc.ents]
    return orgs


Processed data saved to data/extracted/cmp_yesbank.csv
Total rows: 751
Rows processed (containing transaction types): 655


In [None]:
def process_csv(input_file, output_file):
    """Process bank statements from CSV and save results"""
    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except Exception as e:
        print(f"Error reading CSV file '{input_file}': {e}")
        return

    df_processed = df.copy()
    desc_column = df.columns[0]

    rows_to_process_mask = df_processed[desc_column].apply(should_process)
    df_processed.loc[rows_to_process_mask, 'Cleaned_Description'] = df_processed.loc[rows_to_process_mask, desc_column].apply(clean_text)

    df_processed['Extracted_Entities'] = None

    # Extract entities only for the relevant rows
    if rows_to_process_mask.any():
        df_processed.loc[rows_to_process_mask, 'Extracted_Entities'] = df_processed.loc[rows_to_process_mask, 'Cleaned_Description'].apply(extract_entities)

    try:
        df_processed.to_csv(output_file, index=False)
        print(f"Processed data saved to {output_file}")
        print(f"Total rows: {len(df)}")
        print(f"Rows processed (containing transaction types): {rows_to_process_mask.sum()}")
    except Exception as e:
        print(f"Error saving processed data to '{output_file}': {e}")
        
if __name__ == "__main__":
    input_file = "data/desc/yesbank_desc.csv"  # input file
    output_file = "data/extracted/cmp_yesbank.csv"

    try:
        process_csv(input_file, output_file)
    except ValueError as ve:
        print(f"Configuration error: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [None]:
# clustering and fuzzy matching

import spacy
from spacy.matcher import PhraseMatcher
from rapidfuzz import fuzz
from pprint import pprint  # Importing pprint for cleaner output

# Load the spaCy model
nlp = spacy.load("en_core_web_lg")

# Initialize PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

def extract_entities(text):
    """Extract named entities from text using spaCy's NER."""
    doc = nlp(text)
    return {ent.text.lower(): ent.text for ent in doc.ents}

def create_phrase_matcher(entities):
    """Create a PhraseMatcher for the given entities."""
    patterns = [nlp.make_doc(entity) for entity in entities]
    matcher.add("EntityMatcher", patterns)

def find_fuzzy_matches(text, entities, threshold=80):
    """Find fuzzy matches of entities in the text."""
    matches = {}
    for entity, original in entities.items():
        # Find the first exact match
        matcher.add("ExactMatch", [nlp.make_doc(original)])
        doc = nlp(text)
        matches_found = matcher(doc)
        if matches_found:
            first_match = matches_found[0]
            start, end = first_match[1], first_match[2]
            matches[entity] = {'original': original, 'start': start, 'end': end}
            # Find fuzzy matches
            for i in range(end, len(doc)):
                token = doc[i].text.lower()
                if fuzz.ratio(entity, token) >= threshold:
                    matches[entity].setdefault('fuzzy_matches', []).append({'token': token, 'start': i, 'end': i+1})
    return matches

def process_text(text):
    """Process text to find entities and their fuzzy matches."""
    entities = extract_entities(text)
    create_phrase_matcher(entities)
    matches = find_fuzzy_matches(text, entities)
    return matches

# Example usage
text = "I transferred money to John Doe via UPI. Later, Jon Doe received it."
matches = process_text(text)

# Pretty-print the matches
print("Entity Matches Found:")
pprint(matches, width=80)  # Adjust width as needed for better readability


Entity Matches Found:
{'john doe': {'end': 6, 'original': 'John Doe', 'start': 4},
 'jon doe': {'end': 6, 'original': 'Jon Doe', 'start': 4},
 'upi': {'end': 6,
         'fuzzy_matches': [{'end': 8, 'start': 7, 'token': 'upi'}],
         'original': 'UPI',
         'start': 4}}
