In [1]:
import os
import pandas as pd
import glob
import numpy as np
import json

In [2]:
def t1(filepath):
    """Load and return the parsed contents of a JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous version left it open until garbage collection.
    with open(filepath, 'r') as f:
        return json.load(f)
    
# One record per JSON file under ../data: the file's base name (text before
# the first "."), its path, and its parsed contents.
# NOTE(review): glob.glob does not guarantee any ordering, yet later cells
# index data[0] and data[1] positionally - confirm which file is which.
data = [
    {
        "basename": os.path.basename(json_path).split(".")[0],
        "filepath": json_path,
        "val": t1(json_path),
    }
    for json_path in glob.glob("../data/*.json")
]

In [3]:
# Tabulate the parsed contents of the first loaded JSON file.
inputData = pd.DataFrame(data=data[0]['val'])

In [4]:
inputData

Unnamed: 0,Weight out of 10,Novo Business Checking,Chase Business Complete Banking
Feature,Weight out of 10,Novo Business Checking,Chase Business Complete Banking
Pros,,"No monthly fees, unlimited transactions, free ...","$15 monthly maintenance fee waived with a $1,5..."
Cons,,"Limited physical branch access, lower check de...",Monthly maintenance fee if minimum balance not...
Good For,,Small Business Owners Who Don't Care About Phy...,Businesses needing access to a broad range of ...
Monthly Fee,5,0,$15
Invoicing,5,Yes,Yes
Budgeting,5,Yes,Yes
Payments,5,Yes,Yes
Hidden Fees,5,No,Yes
Application Speed,5,4,3


In [5]:
# Placeholder ranking: the integers 0..n-1, where n is the length of the
# second loaded payload.
ranking = [*range(len(data[1]['val']))]

In [6]:
# Drop the first column, then flip the table so each remaining original
# column becomes a row.
df = inputData.iloc[:, 1:].transpose()

In [7]:
# Encode yes/no answers on the same 0-5 scale as the scored features.
df = df.replace(to_replace={'Yes': 5, 'No': 0})

In [8]:
df

Unnamed: 0,Feature,Pros,Cons,Good For,Monthly Fee,Invoicing,Budgeting,Payments,Hidden Fees,Application Speed,...,Transaction Limits,Zelle,Ease of Setup,Bank Reliability,Dedicated Business Manager,Wire Transfers,Accessibility Score,Signup Benefits,Overdraft Protection?,ATM Presence Score
Novo Business Checking,Novo Business Checking,"No monthly fees, unlimited transactions, free ...","Limited physical branch access, lower check de...",Small Business Owners Who Don't Care About Phy...,0,5,5,5,0,4,...,3.0,0,4,3,0,5,3,0,,0
Chase Business Complete Banking,Chase Business Complete Banking,"$15 monthly maintenance fee waived with a $1,5...",Monthly maintenance fee if minimum balance not...,Businesses needing access to a broad range of ...,$15,5,5,5,5,3,...,,5,3,4,0,5,4,3,,5


### Step 1: Match Criteria 

https://towardsdatascience.com/introduction-to-ranking-algorithms-4e4639d65b8

Inputs:
- $X$ = A set of preferences
- $X^{'}$ = A feature vector representing an account.
- $y$ = An ordered list of ranked pairs.

Given an n-dimensional feature vector storing the information about a query and a document, the objective of ranking is to find a function f that produces a real number indicating the relevance of the query to the document. Additionally, if object i is ranked higher than object j (i ▷ j), then f(i) should be greater than f(j).

Note. i ▷ j means that document i is ranked higher than document j.

In [None]:
# Features derived from the document (second loaded JSON payload).
supplyData = pd.DataFrame(data=data[1]['val'])

In [None]:
supplyData

# Approach 1 : LLM/Generative Approach 

Pros: Can consider a large amount of features.

Cons: Requires large models and constant API calls, is slower, and can hallucinate.


# Approach 2 : Distance Approach

Pros: Fast updates. Offline. 

Cons: Limited. 

In [208]:
from sklearn.metrics import pairwise_distances
from scipy.spatial import distance

class HeuristicScorer():
    """Rank stored reference rows by their distance to query rows.

    ``fit`` memorises the reference matrix, ``transform`` is a pass-through,
    and ``predict`` returns, for every query row, the reference-row indices
    ordered from closest to farthest together with the raw distance matrix.
    """

    _x = None  # reference rows stored by fit()
    _y = None  # optional targets stored by fit(); not read by predict()
    _distance_metric = None  # metric handed to scipy's cdist
    _weights = None  # never read or written by this class

    def __init__(self, distance_metric=''):
        """Store the distance metric used by ``predict``.

        Args:
            distance_metric: Metric accepted by ``scipy.spatial.distance.cdist``
                (for example ``'euclidean'`` or ``'cityblock'``). An empty
                string selects Euclidean distance; ``None`` makes ``predict``
                raise ``ValueError``.
        """
        self._distance_metric = distance_metric

    def fit(self, X, y=None):
        """Remember ``X`` as the reference rows (and ``y`` as-is); return ``self``."""
        self._x = X
        self._y = y
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    def predict(self, X):
        """Compute distances from each query row to every reference row.

        Args:
            X: 2-D array-like of query rows with the same number of columns
                as the matrix passed to ``fit``.

        Returns:
            A two-element list ``[order, dist]`` where ``dist[i, j]`` is the
            distance between query ``i`` and reference row ``j`` and
            ``order[i]`` lists reference-row indices sorted by ascending
            distance.

        Raises:
            ValueError: If ``fit`` has not been called or the metric is ``None``.
        """
        if self._x is None:
            raise ValueError("not trained")
        if self._distance_metric is None:
            raise ValueError("please put valid distance metric")
        # Previously the configured metric was never passed to cdist, so every
        # setting silently behaved as Euclidean. An empty string (the
        # constructor default) still falls back to Euclidean.
        metric = self._distance_metric or 'euclidean'
        dist = distance.cdist(X, self._x, metric=metric)
        return [np.argsort(dist), dist]

class ConstraintMapper():
    """Pass-through placeholder: fitting does nothing and predictions echo the input."""

    def __init__(self):
        """Create the mapper; there is no state to initialise."""

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return this instance."""
        return self

    def predict(self, X):
        """Return ``X`` unchanged."""
        return X
        
class CustomNormalizer():
    """Linearly rescale values so that ``min`` maps to 0 and ``max`` maps to 1."""

    _min = None
    _max = None

    def __init__(self, min=0, max=5):
        """Store the bounds of the input scale (0 and 5 by default)."""
        self._min, self._max = min, max

    def fit(self, X, y=None):
        """Nothing is learned from the data; return this instance."""
        return self

    def transform(self, X):
        """Return ``(X - min) / (max - min)`` using the stored bounds."""
        offset = X - self._min
        span = self._max - self._min
        return offset / span

In [209]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
# Two-stage pipeline: scale the features, then rank by distance.
# NOTE: the first step is registered as 'custom_normalizer' but it is
# scikit-learn's MinMaxScaler, not the CustomNormalizer class.
pipeline = Pipeline(
    [
        ('custom_normalizer', MinMaxScaler()),
        ('heuristic_scorer', HeuristicScorer(distance_metric='euclidean')),
    ]
)

In [210]:
# Convert every column that parses cleanly as numbers; columns containing
# free text (or values such as "$15") are left untouched so that
# select_dtypes below drops them.
for column in df.columns:
    try:
        # pd.to_numeric(errors='ignore') is deprecated; catch the failure instead.
        converted_column = pd.to_numeric(df[column])
    except (ValueError, TypeError):
        continue
    df[column] = converted_column
# Keep one row per bank and one column per feature. The following cells fit
# the pipeline on df, look a bank up with df.T[<bank name>], and select rows
# with df.iloc[...]; a trailing .T here flipped the frame, so those cells only
# worked when this cell was executed twice.
df = df.select_dtypes(include='number').fillna(0)
# Feature vector of the first row (first bank) of the cleaned frame.
X = df.iloc[0].values

In [235]:
# Fit both pipeline steps on df and preview the matrix the pipeline returns.
# The later predict cell relies on this fit having run.
pd.DataFrame(pipeline.fit_transform(X=df, y=None))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0


In [232]:
# Query the fitted pipeline with a single row of df; the result is the
# [sorted indices, distance matrix] pair returned by HeuristicScorer.predict.
ranks = pipeline.predict([df.loc['Chase Business Complete Banking']])



In [233]:
# Reorder df's rows by the sorted indices for the single query (closest first).
df.iloc[ranks[0][0], :]

Unnamed: 0,Invoicing,Budgeting,Payments,Hidden Fees,Application Speed,ATM Support,Free Transfers,FDIC Insured,Checks,Wires,...,Transaction Limits,Zelle,Ease of Setup,Bank Reliability,Dedicated Business Manager,Wire Transfers,Accessibility Score,Signup Benefits,Overdraft Protection?,ATM Presence Score
Chase Business Complete Banking,5.0,5.0,5.0,5.0,3.0,5.0,5.0,5.0,0.0,5.0,...,0.0,5.0,3.0,4.0,0.0,5.0,4.0,3.0,0.0,5.0
Novo Business Checking,5.0,5.0,5.0,0.0,4.0,5.0,5.0,5.0,5.0,5.0,...,3.0,0.0,4.0,3.0,0.0,5.0,3.0,0.0,0.0,0.0


# Approach 3 : Collaborative Filtering

https://cs.nyu.edu/~mohri/pub/pref.pdf
https://research.google/pubs/preference-based-learning-to-rank/

Pros: Small model which can run quickly offline, and update rank live.

Cons: Requires data and is harder to implement.

Key Questions To Consider:

1. Preference Function.
2. Categories
3. Data