In [1]:
import numpy as np
import pandas as pd
# Ranking module also shared as a gist: https://gist.github.com/ceroper/58675b2ac2c73b66f24f63c32e837af2
from ranking import *
from ranksvm_utils import *
from sklearn import svm, linear_model
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
from sklearn.datasets import load_svmlight_file

In [2]:
# Downloaded Learn to Rank dataset here: https://huggingface.co/datasets/YahooResearch/Yahoo-Learning-to-Rank-Challenge
# Found "convert" function source here: https://colab.research.google.com/github/finardi/tutos/blob/master/learning_to_rank.ipynb/#scrollTo=vmpCW_oepNCZ
## Note that we convert the original raw data into a pure libsvm format.
## For more details, pls refer to: https://github.com/guolinke/boosting_tree_benchmarks/tree/master/data

def convert(input_filename, out_data_filename, out_query_filename, out_query_filename2):
    """Split a ``<label> qid:<id> <features...>`` file into LIBSVM features plus group sizes.

    Every non-blank input line is written to ``out_data_filename`` with its
    ``qid:<id>`` token removed. Each run of consecutive lines sharing a qid is
    counted, and that count is written (one per line) to both
    ``out_query_filename`` and ``out_query_filename2``, which therefore end up
    with identical content.

    Args:
        input_filename: Path of the space-separated source file.
        out_data_filename: Path of the feature file to write.
        out_query_filename: Path of the first group-size file to write.
        out_query_filename2: Path of the second group-size file to write.

    Raises:
        IndexError, ValueError: If a non-blank line has no parseable
            ``qid:<int>`` second token.
    """
    # Context managers close all four files even if parsing fails part-way.
    with open(input_filename, "r") as source, \
            open(out_data_filename, "w") as output_feature, \
            open(out_query_filename, "w") as output_query, \
            open(out_query_filename2, "w") as output_query2:
        cur_doc_cnt = 0
        last_qid = None

        for line in source:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no document.
                continue
            tokens = line.split(' ')
            tokens[-1] = tokens[-1].strip()
            label = tokens[0]
            qid = int(tokens[1].split(':')[1])
            if qid != last_qid:
                # A new query starts: emit the size of the one just finished.
                if cur_doc_cnt > 0:
                    group_line = str(cur_doc_cnt) + '\n'
                    output_query.write(group_line)
                    output_query2.write(group_line)
                cur_doc_cnt = 0
                last_qid = qid
            cur_doc_cnt += 1
            # Drop the qid token (tokens[1]); keep the label and the features.
            output_feature.write(label + ' ' + ' '.join(tokens[2:]) + '\n')

        # Flush the final group; skipped for empty input so no bogus "0" is written.
        if cur_doc_cnt > 0:
            group_line = str(cur_doc_cnt) + '\n'
            output_query.write(group_line)
            output_query2.write(group_line)

# Convert both Yahoo set1 splits. Each job is
# (raw input, feature output, group-size output, second copy of the group sizes).
conversion_jobs = [
    ("Learning to Rank Challenge/ltrc_yahoo/set1.train.txt", "yahoo.train", "yahoo.train.query", "yahoo.train.group"),
    ("Learning to Rank Challenge/ltrc_yahoo/set1.test.txt", "yahoo.test", "yahoo.test.query", "yahoo.test.group"),
]
for job in conversion_jobs:
    convert(*job)

In [3]:
# Load the converted training file: X is a sparse feature matrix (densified
# later via .toarray()) and y holds the per-document labels.
X, y = load_svmlight_file('yahoo.train')


In [4]:
# Read the per-query group sizes written by convert(). The file holds one count
# per line, so load_svmlight_file parses each count as the "label" of a row with
# no features; only that label array (second element of the result) is kept.
g = load_svmlight_file('yahoo.train.group')
groups = g[1]

In [5]:
# Expand the group sizes into one group id per document: the k-th size (1-based)
# contributes `size` copies of the id k.
group_ids = [
    group_id
    for group_id, size in enumerate(map(int, groups), start=1)
    for _ in range(size)
]

In [6]:
# Keep only the first max_groups queries; later cells drop every row whose
# group id exceeds this value.
max_groups = 400

In [7]:
# Number of documents that belong to the retained groups.
n = sum(1 for gid in group_ids if gid <= max_groups)

In [8]:
# Keep the first n rows (group_ids is non-decreasing, so these are exactly the
# rows of the retained groups) and densify the features.
# The guard keeps this cell re-runnable: after the first run X is already a
# dense ndarray, which has no .toarray() method.
X = X[:n]
if hasattr(X, "toarray"):
    X = X.toarray()
y = y[:n]

In [9]:
# Trim group_ids to the retained groups so it stays aligned row-for-row with the truncated X / y.
group_ids = [x for x in group_ids if x <= max_groups]

In [10]:

# Append the group id as a trailing column to both the features and the labels,
# so every row carries its query membership.
group_column = np.asarray(group_ids).reshape(-1, 1)
X_2 = np.hstack((X, group_column))
y_2 = np.hstack((y.reshape(-1, 1), group_column))

In [11]:
# Sanity check: one row per retained document, two columns (label, group id).
y_2.shape

(6176, 2)

In [12]:
# Group-aware splitter: all documents of a query end up in the same fold.
gkf = GroupKFold(n_splits=5)

# Hyper-parameter values swept by the cross-validation cells.
param_grid = dict(
    C=[0.1, 10],
    max_iter=[1000, 5000],
    tol=[1e-4],
    penalty=['l2'],
)

Example: Cross-Validating the Pairwise Model

In [13]:
# Grouped cross-validation of the pairwise RankSVM over the hyper-parameter grid.
results = []
scores = []  # not used below; kept so the cell still defines the same names

# Flatten the grid up front instead of nesting four loops. The order matches the
# original nesting: penalty (outermost) -> C -> max_iter -> tol (innermost).
param_combinations = [
    (penalty, c, max_iter, tol)
    for penalty in param_grid['penalty']
    for c in param_grid['C']
    for max_iter in param_grid['max_iter']
    for tol in param_grid['tol']
]

for penalty, c, max_iter, tol in param_combinations:
    fold_scores = []  # unrounded NDCG, one entry per fold
    for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(X, y_2, group_ids)):

        # Index-array selection already returns fresh arrays, so no .copy() is needed.
        # X_2 / y_2 carry the group id in their last column.
        X_train_fold = X_2[train_idx]
        y_train_fold = y_2[train_idx]
        X_val_fold = X_2[val_idx]
        y_val_fold = y_2[val_idx]

        # Validation labels and group ids side by side for per-group scoring
        y_val_df_eval = pd.DataFrame({
            'label': y_val_fold[:, 0],
            'group': y_val_fold[:, 1]
        })

        # RankSVM, calculate_ranking_ndcg and get_group_sizes come from the
        # star-imported helper modules.
        # NOTE(review): presumably RankSVM reads the group id from the trailing
        # column of X / y -- confirm against ranksvm_utils.
        fold_model = RankSVM(C=c, tol=tol, max_iter=max_iter, penalty=penalty, dual=False, random_state=42)
        fold_model.fit(X_train_fold, y_train_fold)
        preds = fold_model.predict(X_val_fold)
        y_val_df_eval['y_pred'] = preds
        fold_score = calculate_ranking_ndcg(y_val_df_eval.label,
                                            y_val_df_eval.y_pred,
                                            get_group_sizes(y_val_df_eval))

        fold_scores.append(float(fold_score))
        print(f"Fold {fold_idx + 1} NDCG: {fold_score:.4f}")

    # Round only for reporting, so mean_score is the mean of the exact fold
    # scores rather than of already-rounded values; plain floats keep the
    # printed record readable.
    res = {
        'C': c,
        'tol': tol,
        'max_iter': max_iter,
        'penalty': penalty,
        'scores': [round(s, 4) for s in fold_scores],
        'mean_score': round(float(np.mean(fold_scores)), 4),
    }
    results.append(res)
    print ('folds complete', res)

Fold 1 NDCG: 0.8094
Fold 2 NDCG: 0.4878
Fold 3 NDCG: 0.9727
Fold 4 NDCG: 0.8350
Fold 5 NDCG: 0.8016
folds complete {'C': 0.1, 'tol': 0.0001, 'max_iter': 1000, 'penalty': 'l2', 'scores': [np.float64(0.8094), np.float64(0.4878), np.float64(0.9727), np.float64(0.835), np.float64(0.8016)], 'mean_score': np.float64(0.7813)}
Fold 1 NDCG: 0.8094
Fold 2 NDCG: 0.4878
Fold 3 NDCG: 0.9727
Fold 4 NDCG: 0.8350
Fold 5 NDCG: 0.8016
folds complete {'C': 0.1, 'tol': 0.0001, 'max_iter': 5000, 'penalty': 'l2', 'scores': [np.float64(0.8094), np.float64(0.4878), np.float64(0.9727), np.float64(0.835), np.float64(0.8016)], 'mean_score': np.float64(0.7813)}
Fold 1 NDCG: 0.8094
Fold 2 NDCG: 0.4878
Fold 3 NDCG: 0.9727
Fold 4 NDCG: 0.8350
Fold 5 NDCG: 0.8016
folds complete {'C': 10, 'tol': 0.0001, 'max_iter': 1000, 'penalty': 'l2', 'scores': [np.float64(0.8094), np.float64(0.4878), np.float64(0.9727), np.float64(0.835), np.float64(0.8016)], 'mean_score': np.float64(0.7813)}
Fold 1 NDCG: 0.8094
Fold 2 NDCG: 0.487

Compare to Pointwise Model

In [14]:
# Grouped cross-validation of the pointwise LinearSVC baseline over the same grid.
# NOTE: this rebinds `results`, replacing whatever an earlier cell stored in it.
results = []
scores = []  # not used below; kept so the cell still defines the same names

# Flatten the grid up front instead of nesting four loops. The order matches the
# original nesting: penalty (outermost) -> C -> max_iter -> tol (innermost).
param_combinations = [
    (penalty, c, max_iter, tol)
    for penalty in param_grid['penalty']
    for c in param_grid['C']
    for max_iter in param_grid['max_iter']
    for tol in param_grid['tol']
]

for penalty, c, max_iter, tol in param_combinations:
    fold_scores = []  # unrounded NDCG, one entry per fold
    for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(X, y_2, group_ids)):

        # The pointwise model trains on the plain features / labels (no group column).
        # Index-array selection already returns fresh arrays, so no .copy() is needed.
        X_train_fold = X[train_idx]
        y_train_fold = y[train_idx]
        X_val_fold = X[val_idx]
        y_val_fold = y[val_idx]

        # Validation labels and group ids (taken from y_2) for per-group scoring
        y_val_df_eval_pointwise = pd.DataFrame({
            'label': y_2[val_idx, 0],
            'group': y_2[val_idx, 1]
        })

        # Fit the classifier and score its predictions per group.
        # calculate_ranking_ndcg and get_group_sizes come from the star-imported
        # helper modules.
        fold_model = svm.LinearSVC(C=c, tol=tol, max_iter=max_iter, penalty=penalty, dual=False, random_state=42)
        fold_model.fit(X_train_fold, y_train_fold)
        # predict() yields discrete class labels, so documents within a query can tie.
        preds = fold_model.predict(X_val_fold)
        y_val_df_eval_pointwise['y_pred'] = preds
        fold_score = calculate_ranking_ndcg(y_val_df_eval_pointwise.label,
                                            y_val_df_eval_pointwise.y_pred,
                                            get_group_sizes(y_val_df_eval_pointwise))

        fold_scores.append(float(fold_score))
        print(f"Fold {fold_idx + 1} NDCG: {fold_score:.4f}")

    # Round only for reporting, so mean_score is the mean of the exact fold
    # scores rather than of already-rounded values; plain floats keep the
    # printed record readable.
    res = {
        'C': c,
        'tol': tol,
        'max_iter': max_iter,
        'penalty': penalty,
        'scores': [round(s, 4) for s in fold_scores],
        'mean_score': round(float(np.mean(fold_scores)), 4),
    }
    results.append(res)
    print ('folds complete', res)

Fold 1 NDCG: 0.8228
Fold 2 NDCG: 0.6763
Fold 3 NDCG: 0.9142
Fold 4 NDCG: 0.8237
Fold 5 NDCG: 0.8832
folds complete {'C': 0.1, 'tol': 0.0001, 'max_iter': 1000, 'penalty': 'l2', 'scores': [np.float64(0.8228), np.float64(0.6763), np.float64(0.9142), np.float64(0.8237), np.float64(0.8832)], 'mean_score': np.float64(0.824)}
Fold 1 NDCG: 0.8228
Fold 2 NDCG: 0.6763
Fold 3 NDCG: 0.9142
Fold 4 NDCG: 0.8237
Fold 5 NDCG: 0.8832
folds complete {'C': 0.1, 'tol': 0.0001, 'max_iter': 5000, 'penalty': 'l2', 'scores': [np.float64(0.8228), np.float64(0.6763), np.float64(0.9142), np.float64(0.8237), np.float64(0.8832)], 'mean_score': np.float64(0.824)}
Fold 1 NDCG: 0.8404
Fold 2 NDCG: 0.6539
Fold 3 NDCG: 0.9142
Fold 4 NDCG: 0.8044
Fold 5 NDCG: 0.9357
folds complete {'C': 10, 'tol': 0.0001, 'max_iter': 1000, 'penalty': 'l2', 'scores': [np.float64(0.8404), np.float64(0.6539), np.float64(0.9142), np.float64(0.8044), np.float64(0.9357)], 'mean_score': np.float64(0.8297)}
Fold 1 NDCG: 0.8404
Fold 2 NDCG: 0.65

In [None]:
# Rows of validation group 363 from the pairwise frame left over by the last CV fold.
example = y_val_df_eval[y_val_df_eval['group'] == 363]

In [16]:
# Rows of validation group 363 from the pointwise frame left over by the last CV fold.
example_2 = y_val_df_eval_pointwise[y_val_df_eval_pointwise['group'] == 363]

In [None]:
# NDCG of the pairwise predictions for the single example group.
# NOTE(review): the pairwise y_pred values look like per-group rank indices
# (0..size-1 in the value_counts output), while ndcg_score treats larger scores
# as more relevant -- confirm the orientation of RankSVM.predict.
ndcg_score([example.label.astype(int).values], [example.y_pred.astype(int).values])

0.7730432758941261

In [17]:
# NDCG of the pointwise predictions (predicted class labels used as scores) for the same example group.
ndcg_score([example_2.label.astype(int).values], [example_2.y_pred.astype(int).values])

0.750132293507585

In [None]:
# not every group has a "4" - that's interesting

In [21]:
# Summary statistics of the number of documents per group in the last validation fold.
y_val_df_eval.group.value_counts().describe()

count    80.00000
mean     15.43750
std       4.51115
min       4.00000
25%      12.75000
50%      15.00000
75%      19.00000
max      27.00000
Name: count, dtype: float64

In [19]:
# Pointwise example group ordered by predicted label, to compare against the true labels.
example_2.sort_values(by = 'y_pred')

Unnamed: 0,label,group,y_pred
1122,1.0,363.0,0.0
1113,2.0,363.0,0.0
1134,1.0,363.0,0.0
1133,1.0,363.0,1.0
1132,1.0,363.0,1.0
1131,1.0,363.0,1.0
1130,1.0,363.0,1.0
1129,1.0,363.0,1.0
1128,2.0,363.0,1.0
1126,0.0,363.0,1.0


In [None]:
# Pairwise predictions for group 363 ordered by true label, to compare against y_pred.
y_val_df_eval.loc[y_val_df_eval.group == 363, :].sort_values(by = 'label')

Unnamed: 0,label,group,y_pred
1110,0.0,363.0,24
1126,0.0,363.0,8
1120,0.0,363.0,14
1121,0.0,363.0,25
1109,1.0,363.0,0
1133,1.0,363.0,1
1132,1.0,363.0,2
1131,1.0,363.0,3
1130,1.0,363.0,4
1129,1.0,363.0,5


In [None]:
# Documents per group in the last validation fold, largest first.
y_val_df_eval.group.value_counts()

group
363.0    27
71.0     25
231.0    24
130.0    23
34.0     23
         ..
18.0      9
308.0     8
11.0      8
81.0      7
95.0      4
Name: count, Length: 80, dtype: int64

In [None]:
# Distribution of the true labels in the last validation fold.
y_val_df_eval.label.value_counts()

label
1.0    489
2.0    405
0.0    231
3.0     83
4.0     27
Name: count, dtype: int64

In [None]:
# Distribution of the pairwise model's predictions in the last validation fold.
y_val_df_eval.y_pred.value_counts()

y_pred
0     80
1     80
2     80
3     80
4     79
5     79
6     79
7     78
8     76
9     72
10    69
11    65
12    60
13    52
14    46
15    39
16    29
17    24
18    21
19    15
20    11
21     9
22     5
23     3
24     2
25     1
26     1
Name: count, dtype: int64

In [None]:
# NOTE(review): duplicate of the previous cell (same expression, same recorded output) -- candidate for removal.
y_val_df_eval.y_pred.value_counts()

y_pred
0     80
1     80
2     80
3     80
4     79
5     79
6     79
7     78
8     76
9     72
10    69
11    65
12    60
13    52
14    46
15    39
16    29
17    24
18    21
19    15
20    11
21     9
22     5
23     3
24     2
25     1
26     1
Name: count, dtype: int64

In [None]:
# NOTE(review): same expression as the two preceding cells, but the recorded output
# shows label-like values (0.0-4.0) instead of rank-like integers, so y_val_df_eval
# presumably held different predictions when this ran -- re-run top to bottom to confirm.
y_val_df_eval.y_pred.value_counts()

y_pred
1.0    470
2.0    390
0.0    259
3.0     73
4.0     43
Name: count, dtype: int64