In [1]:
import itertools

import numpy as np
import pandas as pd
# Ranking module also shared as a gist: https://gist.github.com/ceroper/58675b2ac2c73b66f24f63c32e837af2
from ranking import *
from ranksvm_utils import *
from sklearn import svm
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
from sklearn.datasets import load_svmlight_file

In [2]:
# Downloaded Learn to Rank dataset here: https://huggingface.co/datasets/YahooResearch/Yahoo-Learning-to-Rank-Challenge
# Found "convert" function source here: https://colab.research.google.com/github/finardi/tutos/blob/master/learning_to_rank.ipynb/#scrollTo=vmpCW_oepNCZ
## Note that we convert the original raw data into a pure LIBSVM format.
## For more details, please refer to: https://github.com/guolinke/boosting_tree_benchmarks/tree/master/data

def convert(input_filename, out_data_filename, out_query_filename, out_query_filename2):
    """Split a ranking file with ``qid`` columns into a feature file and group-size files.

    Each non-blank input line is expected to look like
    ``<label> qid:<int> <feature> <feature> ...`` (space separated). Consecutive
    lines that share a qid form one query group.

    Args:
        input_filename: Path of the raw input file to read.
        out_data_filename: Path that receives one ``<label> <features...>`` line
            per input line (the qid token is dropped).
        out_query_filename: Path that receives the size of each query group,
            one integer per line, in input order.
        out_query_filename2: Second path that receives the same group sizes.

    Raises:
        IndexError, ValueError: If a non-blank line has no parsable ``qid:<int>``
            second token.

    Notes:
        All files are opened in a single ``with`` statement so they are closed
        even when a malformed line raises. Blank lines are skipped, and an input
        without any data lines produces empty group files rather than a
        spurious group of size 0.
    """
    cur_doc_cnt = 0
    last_qid = None  # None never equals a parsed qid, so the first line opens a group
    with open(input_filename, "r") as source, \
            open(out_data_filename, "w") as output_feature, \
            open(out_query_filename, "w") as output_query, \
            open(out_query_filename2, "w") as output_query2:
        for line in source:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            tokens = line.split(' ')
            tokens[-1] = tokens[-1].strip()
            label = tokens[0]
            qid = int(tokens[1].split(':')[1])
            if qid != last_qid:
                # A new query starts: flush the size of the group that just ended.
                if cur_doc_cnt > 0:
                    output_query.write(str(cur_doc_cnt) + '\n')
                    output_query2.write(str(cur_doc_cnt) + '\n')
                cur_doc_cnt = 0
                last_qid = qid
            cur_doc_cnt += 1
            output_feature.write(label + ' ')
            output_feature.write(' '.join(tokens[2:]) + '\n')
        # Flush the final group, which has no following qid change to trigger it.
        if cur_doc_cnt > 0:
            output_query.write(str(cur_doc_cnt) + '\n')
            output_query2.write(str(cur_doc_cnt) + '\n')

# Convert both Yahoo LTR splits; each tuple is
# (raw input, feature output, group-size output, second group-size output).
for _split_paths in (
    ("Learning to Rank Challenge/ltrc_yahoo/set1.train.txt", "yahoo.train", "yahoo.train.query", "yahoo.train.group"),
    ("Learning to Rank Challenge/ltrc_yahoo/set1.test.txt", "yahoo.test", "yahoo.test.query", "yahoo.test.group"),
):
    convert(*_split_paths)

In [3]:
# Load the training features and labels from 'yahoo.train', the LIBSVM-format
# feature file written by convert() above.
X, y = load_svmlight_file('yahoo.train')


In [4]:
# Load the per-query group sizes written by convert() ('yahoo.train.group' holds
# one document count per line).
# NOTE(review): this is not real LIBSVM data; using load_svmlight_file here
# presumably relies on each count being parsed as a row label with no features —
# confirm, or read the file with a plain integer loader instead.
g = load_svmlight_file('yahoo.train.group')
groups = g[1]  # second element of the returned tuple, used below as the group sizes

In [5]:
# Expand group sizes into one 1-based group id per document:
# a size list like [2, 1] becomes [1, 1, 2].
group_ids = [
    group_num
    for group_num, doc_count in enumerate(map(int, groups), start=1)
    for _ in range(doc_count)
]

In [6]:
# Largest group id kept by the cells below (rows with group id <= max_groups are retained).
max_groups = 400

In [7]:
# Number of documents whose group id is within the max_groups cutoff.
n = sum(1 for gid in group_ids if gid <= max_groups)

In [8]:
# Keep only the first n rows (the documents of the retained groups) and make
# the feature matrix dense.
# The hasattr guard keeps this cell safe to re-run: after the first execution
# X is already a dense ndarray, which has no .toarray() method.
X = X[:n]
if hasattr(X, "toarray"):
    X = X.toarray()
y = y[:n]

In [9]:
# Drop the group ids that fall beyond the max_groups cutoff.
group_ids = list(filter(lambda gid: gid <= max_groups, group_ids))

In [10]:
# Two-column target: column 0 holds the label, column 1 the group id of each row.
y_2 = np.column_stack((y, np.asarray(group_ids)))

In [11]:
# Sanity check: expect (n, 2) — one row per retained document, columns (label, group id).
y_2.shape

(6176, 2)

In [12]:
# 3-fold splitter; the grid search below passes group_ids as the grouping key
# so that folds are formed from whole groups.
gkf = GroupKFold(n_splits=3)


# Hyperparameter values tried for RankSVM in the grid search below.
param_grid = dict(
    C=[0.1, 10],
    max_iter=[1000, 5000],
    tol=[1e-4],
    penalty=['l2'],
)

In [13]:
# Grid search over param_grid, scoring each combination with grouped 3-fold CV.
# `results` collects one dict per combination: its parameters, the per-fold
# NDCG scores and their mean.
results = []
scores = []  # not filled by this cell; kept so any later reference still resolves

# Loop invariants, computed once instead of per fold / per combination.
# GroupKFold (without shuffling) yields the same splits on every call, so the
# folds can be materialised up front and reused for each hyperparameter setting.
group_id_arr = np.array(group_ids)
cv_splits = list(gkf.split(X, y_2, group_ids))

# Same iteration order as nesting the loops penalty -> C -> max_iter -> tol.
# NOTE(review): the recorded output shows identical fold scores for every
# (C, max_iter) combination — verify that RankSVM actually uses these settings.
search_space = itertools.product(
    param_grid['penalty'], param_grid['C'], param_grid['max_iter'], param_grid['tol']
)

for penalty, c, max_iter, tol in search_space:
    fold_scores = []
    for fold_idx, (train_idx, val_idx) in enumerate(cv_splits):

        # Training / validation arrays for this fold. Indexing with an index
        # array already returns fresh copies, so no explicit .copy() is needed.
        X_train_fold = X[train_idx]
        y_train_fold = y_2[train_idx]
        # The validation features get the group id appended as a last column;
        # presumably RankSVM.predict expects it there — confirm against the ranking module.
        X_val_fold = np.concatenate(
            (X[val_idx], group_id_arr[val_idx][:, np.newaxis]),
            axis=1
        )
        y_val_fold = y_2[val_idx]

        # DataFrame holding label, group id and (below) prediction per validation row.
        y_val_df_eval = pd.DataFrame({
            'label': y_val_fold[:, 0],
            'group': y_val_fold[:, 1]
        })

        # Fit on the training fold, then score the validation fold with calculate_ranking_ndcg.
        fold_model = RankSVM(C=c, tol=tol, max_iter=max_iter, penalty=penalty, dual=False, random_state=42)
        fold_model.fit(X_train_fold, y_train_fold)
        preds = fold_model.predict(X_val_fold)
        y_val_df_eval['y_pred'] = preds
        fold_score = calculate_ranking_ndcg(y_val_df_eval.label,
                                            y_val_df_eval.y_pred,
                                            get_group_sizes(y_val_df_eval))

        fold_scores.append(fold_score)
        print(f"Fold {fold_idx + 1} NDCG: {fold_score:.4f}")
    res = {'C': c, 'tol': tol, 'max_iter': max_iter, 'penalty':penalty, 'scores':fold_scores, 'mean_score':np.mean(fold_scores)}
    results.append(res)
    print('folds complete', res)

Fold 1 NDCG: 0.8717
Fold 2 NDCG: 0.8056
Fold 3 NDCG: 0.7147
folds complete {'C': 0.1, 'tol': 0.0001, 'max_iter': 1000, 'penalty': 'l2', 'scores': [np.float64(0.8717100107511292), np.float64(0.8056127579149828), np.float64(0.7146759081994981)], 'mean_score': np.float64(0.7973328922885367)}
Fold 1 NDCG: 0.8717
Fold 2 NDCG: 0.8056
Fold 3 NDCG: 0.7147
folds complete {'C': 0.1, 'tol': 0.0001, 'max_iter': 5000, 'penalty': 'l2', 'scores': [np.float64(0.8717100107511292), np.float64(0.8056127579149828), np.float64(0.7146759081994981)], 'mean_score': np.float64(0.7973328922885367)}
Fold 1 NDCG: 0.8717
Fold 2 NDCG: 0.8056
Fold 3 NDCG: 0.7147
folds complete {'C': 10, 'tol': 0.0001, 'max_iter': 1000, 'penalty': 'l2', 'scores': [np.float64(0.8717100107511292), np.float64(0.8056127579149828), np.float64(0.7146759081994981)], 'mean_score': np.float64(0.7973328922885367)}
Fold 1 NDCG: 0.8717
Fold 2 NDCG: 0.8056
Fold 3 NDCG: 0.7147
folds complete {'C': 10, 'tol': 0.0001, 'max_iter': 5000, 'penalty': 'l