In [17]:
import pandas as pd
import numpy as np
import os
import nepali_datetime
from datetime import datetime, timedelta

In [2]:
from rapidfuzz import fuzz, process, utils

def preprocess_text(text: str) -> str:
    try:
        return utils.default_process(str(text).replace(' ', ''))
    except Exception as e:
        print(f"Error preprocessing text: {e}")
        return ""
    

In [3]:
preprocess_text("Kshitiz  Poudel!")

'kshitizpoudel'

In [4]:
def calculate_text_similarity(text1: str, text2: str) -> float:
    try:
        text1 = preprocess_text(text1)
        text2 = preprocess_text(text2)

        if not text1 or not text2:
            return 0.0
        
        return fuzz.ratio(text1, text2) / 100.0
    except Exception as e:
        print(f"Error calculating text similarity: {e}")
        return 0.0

In [5]:
calculate_text_similarity("Ram B Shrestha!", "Ram Bahadur Chrestha")

0.7333333333333334

In [6]:
calculate_text_similarity("Shrestha Ram B!", "Ram Bahadur Chrestha")

0.4666666666666666

In [7]:
def preprocess_dataframe_column(series: pd.Series) -> pd.Series:
    """Vectorized preprocessing of a DataFrame column."""
    if series.empty:
        return series
    return series.fillna("").astype(str).apply(preprocess_text)


In [8]:
def calculate_batch_text_similarity( series: pd.Series, query_value: str) -> pd.Series:
    """Calculate similarity between a series of values and a query value in a vectorized manner.""" 
    try:
        processed_query = (query_value)
        if not processed_query:
            return pd.Series(0.0, index=series.index)
            
        processed_series = preprocess_dataframe_column(series)            
        similarities = process.cdist(
            processed_series, 
            [processed_query], 
            scorer=fuzz.ratio, 
            dtype=np.uint8,  
            workers=max(1, os.cpu_count() // 2)
        )
        return pd.Series(similarities.flatten() / 100.0, index=series.index)
        
    except Exception as e:
        print(f"Error in batch text similarity calculation: {e}")
        return pd.Series(0.0, index=series.index)

In [9]:
calculate_batch_text_similarity(pd.Series(["Ram B Shrestha!", "Shrestha Ram B!"]), "Ram Bahadur Chrestha")

0    0.56
1    0.44
dtype: float64

In [None]:
def standardize_date_format(date_obj, format="%Y-%m-%d"):
    """
    Convert any date object to a standard string format for easy comparison.
    
    Args:
        date_obj: Either a datetime.date object, string date, or BS date string
        format: The output format string (default: YYYY-MM-DD)
        
    Returns:
        A formatted date string or None if conversion fails
    """
    if not date_obj:
        return None
    
    date_formats = [
        "%Y-%m-%d %H:%M:%S",  # Format: 1949-06-16 00:00:00
        "%Y-%m-%d",           # Format: 1949-06-16
        "%d/%m/%Y",           # Format: 16/06/1949
        "%m/%d/%Y",           # Format: 06/16/1949
        "%Y/%m/%d",           # Format: 1949/06/16
        "%Y%m%d",             # Format: 19490616
        "%d-%m-%Y",           # Format: 16-06-1949
        "%m-%d-%Y",           # Format: 06-16-1949
        "%b %d, %Y",          # Format: Jun 16, 1949
        "%d %b %Y",           # Format: 16 Jun 1949
        "%B %d, %Y",          # Format: June 16, 1949
        "%d %B %Y",           # Format: 16 June 1949
        "%Y.%m.%d",           # Format: 1949.06.16
        "%d.%m.%Y",           # Format: 16.06.1949
        "%m.%d.%Y",           # Format: 06.16.1949
        "%Y/%m/%d %H:%M:%S",  # Format: 1949/06/16 00:00:00
        "%d-%b-%Y",           # Format: 16-Jun-1949
        "%d-%B-%Y",           # Format: 16-June-1949
        "%Y-%b-%d",           # Format: 1949-Jun-16
        "%Y-%B-%d"            # Format: 1949-June-16
    ]
    
    for date_format in date_formats:
        try:
            date_obj = datetime.strptime(str(date_obj), date_format)
            return date_obj.strftime("%Y%m%d")
        except (ValueError, TypeError):
            continue
    
    return None


In [33]:
result = standardize_date_format("Jul 16, 1949")
print(result)

19490716
