In [None]:
import numpy as np
import sklearn
from sklearn.datasets import load_svmlight_file
from collections import defaultdict

In [None]:
# Load the training file: X is the feature matrix, y the per-row target values and
# query_ids the query id of every row (returned because query_id=True).
X, y, query_ids = load_svmlight_file('l2r/train.txt', query_id=True)

In [None]:
# Train/validation split (for experiments only).
# The file is cut at a fixed row index rather than shuffled: the first N_TRAIN_ROWS
# rows are used for training and the remaining rows are held out.
# NOTE(review): N_TRAIN_ROWS is assumed to fall on a query boundary so that no query
# is shared between the two parts -- confirm against the data.
N_TRAIN_ROWS = 317023

Xs, ys, query_idss = load_svmlight_file('l2r/train.txt', query_id=True)
X, X_test = Xs[:N_TRAIN_ROWS], Xs[N_TRAIN_ROWS:]
y, y_test = ys[:N_TRAIN_ROWS], ys[N_TRAIN_ROWS:]
query_ids, query_ids_test = query_idss[:N_TRAIN_ROWS], query_idss[N_TRAIN_ROWS:]

# Map every held-out query id to the row indices (relative to X_test) of its documents.
test_query_doc = defaultdict(list)
for doc_id, query_id in enumerate(query_ids_test):
    test_query_doc[query_id].append(doc_id)

In [None]:
import tqdm.notebook

### Confusion matrices used to weight document pairs; exactly one assignment below
### should be left uncommented.
### weight() indexes the matrix as confusion_matrix[label, grade]: the row is selected by a
### document's label and the columns are summed over as grades.
### NOTE(review): presumably each row is P(true grade | assigned label) -- confirm the
### orientation against the source of these numbers. The rows are used as given and are
### not renormalized (e.g. the first Russian row sums to 0.99, the last one to 1.01).

### Russian
confusion_matrix = np.array([[0.75, 0.22, 0.02, 0, 0],
                    [0.34, 0.54, 0.11, 0.01, 0],
                    [0.07, 0.13, 0.73, 0.06, 0.01],
                    [0.04, 0.04, 0.52, 0.32, 0.08],
                    [0.03, 0.02, 0.05, 0.08, 0.83]
                   ])

### Ukrainian
# confusion_matrix = np.array([[0.88, 0.09, 0.02, 0, 0],
#                     [0.26, 0.65, 0.07, 0.01, 0],
#                     [0.05, 0.08, 0.78, 0.07, 0.01],
#                     [0.03, 0.02, 0.24, 0.60, 0.1],
#                     [0.03, 0.02, 0.03, 0.05, 0.86]
#                    ])

### Yahoo challenge
# confusion_matrix = np.array([[0.869, 0.103, 0.02, 0.001, 0.007],
#                             [0.016, 0.878, 0.1, 0.005, 0.002],
#                             [0.003, 0.098, 0.85, 0.046, 0.004],
#                             [0.0, 0.01, 0.094, 0.896, 0.0],
#                             [0.0, 0.0, 0.019, 0.016, 0.965]
#                            ])

def weight(left, right):
    """Return the sum of confusion_matrix[left, u] * confusion_matrix[right, v] over u > v.

    `left` and `right` are converted with int() and used as row indices into the
    module-level `confusion_matrix`; `u` and `v` run over its column positions.
    The result is accumulated in the same (u, v) order as a plain double loop and
    is the integer 0 when there is no pair with u > v.
    """
    row_left = int(left)
    row_right = int(right)
    n_rows, n_cols = confusion_matrix.shape

    total = 0
    # u = 0 has no smaller v, so start at 1; v is additionally capped by the column count.
    for u in range(1, n_rows):
        for v in range(min(u, n_cols)):
            total += confusion_matrix[row_left, u] * confusion_matrix[row_right, v]
    return total
    
# Group the training row indices by query id.
train_query_doc = defaultdict(list)
for doc_id, query_id in enumerate(query_ids):
    train_query_doc[query_id].append(doc_id)

# weight() depends only on the two labels, so evaluate it once per label pair
# instead of once per document pair, then look the values up.
n_labels = confusion_matrix.shape[0]
label_pair_weight = np.array(
    [[weight(a, b) for b in range(n_labels)] for a in range(n_labels)],
    dtype=float,
)

# w_ij[query_id][i][j] == weight(label of i-th doc, label of j-th doc) for that query.
w_ij = dict()
for query_id in tqdm.notebook.tqdm(train_query_doc):
    docs = train_query_doc[query_id]
    # astype(int) truncates exactly like the int() conversion inside weight().
    labels = y[docs].astype(int)
    w_ij[query_id] = label_pair_weight[np.ix_(labels, labels)]

In [None]:
### Pairwise loss for "document i should rank above document j":
### Loss = -log(e^x_i / (e^x_i + e^x_j))
### Loss = -(x_i - x_j) + log(1 + e^(x_i - x_j))
### dLoss/dx_i = -(1 - e^(x_i - x_j) / (1 + e^(x_i - x_j)))

In [None]:
from collections import defaultdict
import numpy as np
import tqdm.notebook
from sklearn.tree import DecisionTreeRegressor


class YetiRank:
    """Boosted regression trees trained on a weighted pairwise logistic loss.

    For every ordered pair (i, j) of documents of the same query the loss is
    ``w[i, j] * log(1 + exp(-(s_i - s_j)))``, where ``s`` are the current scores and
    ``w`` is a per-query weight matrix. Each boosting round fits a regression tree to
    the negative gradient and then replaces every leaf value with a Newton step
    ``-sum(grad) / sum(hess)`` over the samples in that leaf.

    Parameters
    ----------
    n_trees : int
        Number of boosting rounds.
    max_depth : int
        Maximum depth of every regression tree.
    learning_rate : float
        Shrinkage applied to every tree's contribution.
    """

    def __init__(self, n_trees, max_depth, learning_rate):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.trees = []
        # Kept for interface compatibility; nothing in this class fills them.
        self.train_loss = []
        self.val_loss = []

    def compute_grads(self, y_true, y_pred, query_doc=None, pair_weights=None):
        """Return per-document gradient and Hessian of the weighted pairwise loss.

        Parameters
        ----------
        y_true : array-like
            Only its length is used (number of documents).
        y_pred : array-like of float
            Current score of every document.
        query_doc : mapping query_id -> list of row indices, optional
            Defaults to the module-level ``train_query_doc``.
        pair_weights : mapping query_id -> (n, n) array, optional
            ``pair_weights[q][i, j]`` weights the pair "i above j" of query ``q``.
            Defaults to the module-level ``w_ij``.

        Returns
        -------
        (grad, hess) : tuple of 1-D float arrays

        Notes
        -----
        A document takes part in a pair either as the first member (its score should go
        up) or as the second member (its score should go down); both contributions are
        accumulated. The Hessian uses the same pair weights as the gradient, skips
        self-pairs and is floored at 1e-16 so it can be used as a divisor.
        """
        if query_doc is None:
            query_doc = train_query_doc
        if pair_weights is None:
            pair_weights = w_ij

        y_pred = np.asarray(y_pred)
        n_docs = np.shape(y_true)[0]
        grad = np.zeros(n_docs)
        hess = np.zeros(n_docs)

        for query_id, doc_list in query_doc.items():
            docs = np.asarray(doc_list)
            scores = y_pred[docs]
            # diff[i, j] = s_i - s_j, clipped so exp() cannot overflow.
            diff = np.clip(scores.reshape(-1, 1) - scores, -30, 30)
            # 1 - sigmoid(diff): the modelled probability that i is ranked below j.
            lose_prob = 1.0 / (1.0 + np.exp(diff))
            weights = np.asarray(pair_weights[query_id])

            pair_grad = weights * lose_prob
            pair_hess = pair_grad * (1.0 - lose_prob)
            # A document paired with itself carries no ranking information.
            np.fill_diagonal(pair_hess, 0.0)

            # Row sums: document is the first member; column sums: the second member.
            grad[docs] = pair_grad.sum(axis=0) - pair_grad.sum(axis=1)
            hess[docs] = np.maximum(pair_hess.sum(axis=0) + pair_hess.sum(axis=1), 1e-16)
        return grad, hess

    def reweight_tree(self, X, tree, grad, hess):
        """Overwrite every reached leaf of ``tree`` with the Newton step -sum(grad)/sum(hess).

        The public ``tree.apply`` is used to find the leaf of every sample, so X goes
        through the estimator's own input validation. Leaves whose numerator or
        denominator is exactly zero are set to 0. Returns the same (mutated) tree.
        """
        leaf_of_sample = np.asarray(tree.apply(X))
        for leaf_index in np.unique(leaf_of_sample):
            in_leaf = leaf_of_sample == leaf_index
            nom = -grad[in_leaf].sum()
            denom = hess[in_leaf].sum()
            if nom == 0 or denom == 0:
                tree.tree_.value[leaf_index] = 0.
            else:
                tree.tree_.value[leaf_index] = nom / denom
        return tree

    def fit(self, X, y, query_doc=None, pair_weights=None):
        """Train ``n_trees`` trees on X; ``y`` is only used for its length.

        ``query_doc`` and ``pair_weights`` are forwarded to ``compute_grads``.
        Any previously fitted trees are discarded, because the running predictions
        restart from zero. Returns ``self``.
        """
        self.trees = []
        # Always accumulate in float64: integer labels must not dictate the dtype.
        predictions = np.zeros(np.shape(y)[0], dtype=np.float64)
        for _ in tqdm.notebook.tqdm(range(self.n_trees)):
            grad, hess = self.compute_grads(y, predictions, query_doc, pair_weights)
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, -grad)
            tree = self.reweight_tree(X, tree, grad, hess)
            self.trees.append(tree)
            predictions += self.learning_rate * tree.predict(X)
        return self

    def predict(self, X):
        """Return the sum of the shrunken tree outputs; zeros when no tree is fitted."""
        if not self.trees:
            return np.zeros(X.shape[0])
        preds = np.sum([self.learning_rate * tree.predict(X) for tree in self.trees], axis=0)
        return preds

In [None]:
# model = YetiRank(n_trees=64, max_depth=10, learning_rate=0.1).fit(X.astype(np.float32), y.astype(np.float32))
### The hand-written boosting is SLOW no matter how much it is optimized, and it has no multithreading
### So I decided to use XGBoost (XGB) instead

In [None]:
def objective(y_true, y_pred, query_doc=None, pair_weights=None):
    """Custom objective: gradient and Hessian of the weighted pairwise logistic loss.

    For every ordered pair (i, j) of documents of one query the loss is
    ``w[i, j] * log(1 + exp(-(s_i - s_j)))`` with ``s = y_pred``.

    Parameters
    ----------
    y_true : array-like
        Only its length is used (number of documents).
    y_pred : array-like of float
        Current score of every document.
    query_doc : mapping query_id -> list of row indices, optional
        Defaults to the module-level ``train_query_doc``.
    pair_weights : mapping query_id -> (n, n) array, optional
        ``pair_weights[q][i, j]`` weights the pair "i above j" of query ``q``.
        Defaults to the module-level ``w_ij``.

    Returns
    -------
    (grad, hess) : tuple of 1-D float arrays, one entry per document.

    Notes
    -----
    A document receives gradient both as the first member of a pair (score should go
    up) and as the second member (score should go down); both parts are summed here.
    The Hessian carries the same pair weights, excludes self-pairs and is floored at
    1e-16 so that it stays strictly positive.
    """
    if query_doc is None:
        query_doc = train_query_doc
    if pair_weights is None:
        pair_weights = w_ij

    y_pred = np.asarray(y_pred)
    n_docs = np.shape(y_true)[0]
    grad = np.zeros(n_docs)
    hess = np.zeros(n_docs)

    for query_id, doc_list in query_doc.items():
        docs = np.asarray(doc_list)
        scores = y_pred[docs]
        # diff[i, j] = s_i - s_j, clipped so exp() cannot overflow.
        diff = np.clip(scores.reshape(-1, 1) - scores, -30, 30)
        # 1 - sigmoid(diff): the modelled probability that i is ranked below j.
        lose_prob = 1.0 / (1.0 + np.exp(diff))
        weights = np.asarray(pair_weights[query_id])

        pair_grad = weights * lose_prob
        pair_hess = pair_grad * (1.0 - lose_prob)
        # A document paired with itself carries no ranking information.
        np.fill_diagonal(pair_hess, 0.0)

        # Row sums: document is the first member; column sums: the second member.
        grad[docs] = pair_grad.sum(axis=0) - pair_grad.sum(axis=1)
        hess[docs] = np.maximum(pair_hess.sum(axis=0) + pair_hess.sum(axis=1), 1e-16)
    return grad, hess

In [None]:
from xgboost import XGBRegressor
# Settings for the XGBoost model; `objective` is the custom pairwise objective
# function defined in this notebook.
params = dict(
    objective=objective,
    n_estimators=2048,
    n_jobs=16,
    max_depth=10,
    learning_rate=0.05,
)
model = XGBRegressor(**params)
model.fit(X, y)

In [None]:
# Predict a score for every row of X_test with the fitted model.
y_pred = model.predict(X_test)

In [None]:
# Display the predicted scores as the cell output.
y_pred

In [None]:
### Validation scoring: NDCG@5 per held-out query, collected in `score`
# Import the submodules explicitly: a bare `import tqdm` / `import sklearn` does not
# by itself load `tqdm.notebook` / `sklearn.metrics`.
import sklearn.metrics
import tqdm.notebook

score = []
for query_id in tqdm.notebook.tqdm(test_query_doc):
    docs = np.array(test_query_doc[query_id])
    y_pred_i = y_pred[docs]
    y_gt = y_test[docs].astype('int')
    if len(y_gt) != 1:
        score.append(sklearn.metrics.ndcg_score(y_gt.reshape(1, -1), y_pred_i.reshape(1, -1), k=5))
    else:
        # A single-document query has only one possible ranking; count it as perfect.
        # NOTE(review): recent scikit-learn versions appear to reject single-document
        # input to ndcg_score, which is presumably why this branch exists -- confirm.
        score.append(1.0)

In [None]:
# Average of the per-query values collected in `score`.
np.mean(score)

In [None]:
# Score the test file and write the ranking to submission.csv.
# Note: this rebinds X_test, y_test, query_ids_test, y_pred and test_query_doc.
X_test, y_test, query_ids_test = load_svmlight_file('l2r/test.txt', query_id=True)
y_pred = model.predict(X_test)

# Row indices of the test file grouped by query id.
test_query_doc = defaultdict(list)
for doc_id, query_id in enumerate(query_ids_test):
    test_query_doc[query_id].append(doc_id)

with open("submission.csv", 'w') as write_file:
    write_file.write("QueryId,DocumentId\n")
    for query_id, docs in test_query_doc.items():
        # Ascending argsort reversed, i.e. highest predicted score first.
        ranking = np.argsort(y_pred[docs])[::-1]
        sorted_docs = np.array(docs)[ranking]
        for doc_id in sorted_docs:
            # Document ids in the file are the 1-based row numbers.
            write_file.write(f"{query_id},{doc_id+1}\n")