In [28]:
import os
import pandas as pd
import glob
import numpy as np
import json

In [19]:
def t1(filepath):
    """Read a JSON file and return its parsed content.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    # The context manager closes the handle even when parsing raises;
    # a bare open() here would leave one open handle per file loaded.
    with open(filepath, 'r') as f:
        return json.load(f)
    
# One record per JSON file under ../data: the part of the file name before the
# first dot, the path itself, and the parsed content.
# NOTE(review): the order of glob.glob results depends on the file system, yet
# later cells index this list by position (data[0], data[1]) — confirm the
# order is the intended one.
data = [
    {
        "basename": os.path.basename(json_path).split(".")[0],
        "filepath": json_path,
        "val": t1(json_path),
    }
    for json_path in glob.glob("../data/*.json")
]

In [20]:
# Table built from the parsed content of the first loaded file.
inputData = pd.DataFrame(data=data[0]["val"])

In [21]:
# Positions 0..n-1, one per entry in the second loaded file's content.
ranking = [*range(len(data[1]["val"]))]

### Step 1: Match Criteria 

https://towardsdatascience.com/introduction-to-ranking-algorithms-4e4639d65b8

Inputs:
- $X$ = A set of preferences
- $X^{'}$ = A feature vector representing an account.
- $y$ = An ordered list of ranked pairs.

Given an n-dimensional feature vector storing the information about a query and a document, the objective of ranking is to find a function f that produces a real number indicating the relevance of the query to the document. Additionally, if object i is ranked higher than object j (i ▷ j), then f(i) should be greater than f(j).

Note. i ▷ j means that document i is ranked higher than document j.

In [22]:
# Features derived from the document (content of the second loaded file).
supplyData = pd.DataFrame(data=data[1]["val"])

In [49]:
# NOTE(review): `Scorer` is not defined or imported anywhere in the visible
# notebook, so on a fresh kernel this cell would raise NameError unless the
# class is provided elsewhere — confirm where it comes from.
s = Scorer(supplyData, inputData)
s.preprocess()

# Approach 1 : LLM/Generative Approach 

Pros: Can consider a large amount of features.

Cons: Requires large models and constant API calls, is slower, and can hallucinate.


# Approach 2 : Distance Approach

Pros: Fast updates. Offline. 

Cons: Limited. 

In [313]:
from sklearn.metrics import pairwise_distances
from scipy.spatial import distance

class HeuristicScorer():
    """Rank stored reference rows by their distance to query rows.

    ``fit`` keeps the reference rows as given; ``predict`` measures the distance
    from every query row to every reference row with
    ``scipy.spatial.distance.cdist`` and sorts the result.
    """

    # Class-level defaults.
    _x = None                # reference rows, set by fit
    _y = None                # optional targets, set by fit; predict does not read them
    _distance_metric = None  # metric handed to cdist, set by __init__
    _weights = None          # never assigned anywhere in this class

    def __init__(self, distance_metric='hamming'):
        """Remember the distance metric that ``predict`` passes to ``cdist``."""
        self._distance_metric = distance_metric

    def fit(self, X, y=None):
        """Store ``X`` and ``y`` unchanged and return ``self``."""
        self._x, self._y = X, y
        return self

    def transform(self, X):
        """Identity transform: hand ``X`` back as given."""
        return X

    def predict(self, X):
        """Compute distances from each row of ``X`` to each stored row.

        Returns:
            A two-element list ``[order, distances]``. ``distances`` is the
            matrix returned by ``cdist(X, stored_rows)``; ``order`` is its
            argsort along the last axis, so each row lists stored-row
            positions from nearest to farthest.

        Raises:
            ValueError: If ``fit`` has not stored any rows yet, or if the
                distance metric is ``None``.
        """
        if self._x is None:
            raise ValueError("not trained")
        if self._distance_metric is None:
            raise ValueError("please put valid distance metric")
        distances = distance.cdist(X, self._x, metric=self._distance_metric)
        ordering = distances.argsort()
        return [ordering, distances]

class ConstraintMapper():
    """Placeholder estimator: fitting does nothing and predictions echo the input."""

    def __init__(self):
        """Take and store no configuration."""

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return ``self``."""
        return self

    def predict(self, X):
        """Return ``X`` unchanged."""
        return X
        
class CustomNormalizer():
    """Rescale values from a fixed ``[min, max]`` range onto ``[0, 1]``.

    The range is supplied at construction time and is not learned from data:
    ``fit`` is a no-op. Inputs outside the range map outside ``[0, 1]``.
    """

    # Class-level defaults, overwritten per instance in __init__.
    _min = None
    _max = None

    def __init__(self, min=0, max=5):
        """Store the bounds of the source range.

        Args:
            min: Value that should map to 0.
            max: Value that should map to 1.

        Raises:
            ValueError: If ``min`` equals ``max`` (element-wise for array
                bounds), because ``transform`` would divide by zero.
        """
        # Parameter names shadow the builtins but are kept so existing
        # keyword callers keep working.
        # Reject a zero-width range up front instead of letting transform
        # silently produce inf/NaN (or raise ZeroDivisionError for scalars).
        if np.any(np.equal(max, min)):
            raise ValueError("min and max must differ; a zero-width range cannot be normalized")
        self._min = min
        self._max = max

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the bounds are fixed at construction."""
        return self

    def transform(self, X):
        """Return ``(X - min) / (max - min)`` using the stored bounds."""
        return (X - self._min) / (self._max - self._min)

In [311]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
# Two stages: rescale the features, then rank stored rows by Euclidean distance.
pipeline = Pipeline([
    ("custom_normalizer", CustomNormalizer()),
    ("heuristic_scorer", HeuristicScorer(distance_metric="euclidean")),
])

In [308]:
# Demo on synthetic data: ten random integer features (0-4) per vendor and one
# random query row. A seeded local generator makes the ranking reproducible
# under Restart & Run All instead of changing on every execution.
demo_rng = np.random.default_rng(42)
X = pd.concat(
    [
        supplyData.Vendor,
        pd.DataFrame(demo_rng.integers(low=0, high=5, size=(len(supplyData.Vendor), 10))),
    ],
    axis=1,
)
# Fit on the feature columns only; column 0 holds the vendor name.
pipeline.fit(X.iloc[:, 1:], None)
ranks = pipeline.predict(demo_rng.integers(low=0, high=5, size=(1, 10)))
# ranks[0] is the argsort matrix; its first row orders vendors nearest-first.
display(X.iloc[list(ranks[0][0])].Vendor)

4          EverBank Small Business Checking
2           Chase Business Complete Banking
0                    Novo Business Checking
3    Wells Fargo Initiate Business Checking
1                BlueVine Business Checking
Name: Vendor, dtype: object

# Approach 3 : Collaborative Filtering

https://cs.nyu.edu/~mohri/pub/pref.pdf
https://research.google/pubs/preference-based-learning-to-rank/

Pros: A small model that can run quickly offline and update rankings live.

Cons: Requires data and is harder to implement.

Key Questions To Consider:

1. Preference Function.
2. Categories
3. Data