In [9]:
import pandas as pd

In [14]:
def preprocess_features():
    '''This function processes and creates our feature columns descriptions'''
    # Read in the features file
    features = pd.read_csv('features.csv')
    # Create new header and replace spaces with underscore
    new_header = features.iloc[0].str.replace(' ','_')
    # Remove the first row which is now the new header
    features = features[1:]
    # Set new headers
    features.columns = new_header
    # Only the first cell for each category is filled. Using forward will
    # will allow me to map each category to their sub-categories located
    # in the stream column 
    features['feature_description'] = features['feature_description'].ffill()
    # Replacing characters to allign with TensorFlows regex requirements
    character_removal = [' ', '(', ')', '*']
    for char in character_removal:
        features['feature_description'] = features['feature_description'].str.replace(char, '_')
        features['stream'] = features['stream'].astype(str).str.replace(char, '_')
    # Setting column type to string for mapping within the load_rename_save function
    features['feature_id'] = features['feature_id'].astype(str)
    # Creating new column to map features to existing dataset
    features['cols'] = 'string'
    # Looping over all features and creating new column name
    for idx in range(len(features)):
        if str(features.iloc[idx]['stream']) != 'nan':
            features['cols'].iloc[idx] = features['feature_description'].iloc[idx] + '_' + features['stream'].iloc[idx]
        else:
            features['cols'].iloc[idx] = features['feature_description'].iloc[idx]
    return features

In [15]:
def label_columns(df):
    
    '''This function labels the columns by descriptions
       found on the microsoft research page'''    
        
    for col in df.columns:
        if col == 0:
            df.rename({col : 'relevance_label'}, axis=1, inplace=True)
        elif col == 1:
            df.rename({col : 'query_id'}, axis=1, inplace=True)
        else:
            df.rename({col : f'feature_{col - 1}'}, axis=1, inplace=True)
            
    return df

In [18]:
def load_rename_save(folder_num):
    '''This function reads in all data located in folder n,
       labels the columns, removes uneeded elements from the cells (i.e. 'qid:1' the qid is uneeded),
       and saves the files as a parquet within folder n'''
    
    for folder in folder_num:
        # Load data
        df_train = pd.read_csv(f'MSLR-WEB10K/Fold{folder}/train.txt', sep=' ', header=None)
        df_test = pd.read_csv(f'MSLR-WEB10K/Fold{folder}/test.txt', sep=' ', header=None)
        df_val = pd.read_csv(f'MSLR-WEB10K/Fold{folder}/vali.txt', sep=' ', header=None)
        
        # Label the columns
        df_train = label_columns(df_train)
        df_test = label_columns(df_test)
        df_val = label_columns(df_val)
        
        # Remove 'n:' from each column. The dataset assigned each feature number
        # to the cells value which needs to be removed to get the data into int/float format
        dataframes = {'train': df_train, 'test': df_test, 'val': df_val}
        for k, df in dataframes.items():
            for i in range(1,len(df.columns)-1):
                df[f'feature_{i}'].replace(f'{i}:', '', regex=True, inplace=True)          
            
        # Only query_id was different than all of the other columns when assigning 
        # the prefix to the values. Here we remove 'qid:' from each cell
            df['query_id'].replace('qid:', '', regex=True, inplace=True)

        # Rename the feature columns from the given descriptions on Microsofts webiste   
        features = preprocess_features()
        
        for k, df in dataframes.items():
            for idx in range(len(features)):
                id_ = features.iloc[idx]['feature_id']
                for col in df.columns:
                    if str(id_) == col.lstrip('feature_'):
                        df.rename({col: features.iloc[idx]['cols']}, axis=1, inplace=True)
        
        # Save the cleaned dataset as a csv
        df_train.to_csv(f'MSLR-WEB10K/Fold{folder}/df_train.csv', index=False)
        df_test.to_csv(f'MSLR-WEB10K/Fold{folder}/df_test.csv', index=False)
        df_val.to_csv(f'MSLR-WEB10K/Fold{folder}/df_val.csv', index=False)

In [20]:
load_rename_save([1])

  features['feature_description'] = features['feature_description'].str.replace(char, '_')
  features['stream'] = features['stream'].astype(str).str.replace(char, '_')


In [24]:
df_train = pd.read_csv(f'MSLR-WEB10K/Fold1/df_train.csv')
df_train.head()

Unnamed: 0,relevance_label,query_id,covered_query_term_number_body,covered_query_term_number_anchor,covered_query_term_number_title,covered_query_term_number_url,covered_query_term_number_whole_document,covered_query_term_ratio_body,covered_query_term_ratio_anchor,covered_query_term_ratio_title,...,Inlink_number,Outlink_number,PageRank,SiteRank,QualityScore,QualityScore2,Query-url_click_count,url_click_count,url_dwell_time,feature_137
0,2,1,3,3,0,0,3,1.0,1.0,0.0,...,11089534,2,116,64034,13,3,0,0,0.0,
1,2,1,3,0,3,0,3,1.0,0.0,1.0,...,11089534,2,124,64034,1,2,0,0,0.0,
2,0,1,3,0,2,0,3,1.0,0.0,0.666667,...,3,1,124,3344,14,67,0,0,0.0,
3,2,1,3,0,3,0,3,1.0,0.0,1.0,...,11089534,13,123,63933,1,3,0,0,0.0,
4,1,1,3,0,3,0,3,1.0,0.0,1.0,...,5,7,256,49697,1,13,0,0,0.0,


###  Gradient Boost

In [27]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the MSLR-WEB10K dataset
data = pd.read_csv("MSLR-WEB10K/Fold1/df_train.csv")

# Split the data into training, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Extract the features and labels from the data
train_features = train_data.iloc[:, 2:].values
train_labels = train_data.iloc[:, 1].values

val_features = val_data.iloc[:, 2:].values
val_labels = val_data.iloc[:, 1].values

test_features = test_data.iloc[:, 2:].values
test_labels = test_data.iloc[:, 1].values

# Define the evaluation metric as NDCG@10
def ndcg(y_true, y_pred, k=10):
    score = ndcg_score(np.array([y_true]), np.array([y_pred]), k=k)
    return score

# Define the gradient boosting algorithm for LTR
class GradientBoostingLTR():
    def __init__(self, num_trees=100, learning_rate=0.1, max_depth=6):
        self.num_trees = num_trees
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        
    def fit(self, X, y, qid):
        self.trees = []
        unique_qid = np.unique(qid)
        
        for q in unique_qid:
            mask = qid == q
            Xq = X[mask]
            yq = y[mask]
            n = len(yq)
            weights = np.ones(n) / n
            
            for i in range(self.num_trees):
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(Xq, yq, sample_weight=weights)
                
                predictions = tree.predict(Xq)
                gradient = yq - predictions
                
                weights = weights * np.exp(-self.learning_rate * gradient)
                weights = weights / np.sum(weights)
                
                self.trees.append(tree)
                
    def predict(self, X, qid):
        predictions = np.zeros(len(X))
        unique_qid = np.unique(qid)
        
        for q in unique_qid:
            mask = qid == q
            Xq = X[mask]
            n = len(Xq)
            
            if n == 0:
                continue
                
            tree_predictions = np.zeros(n)
            
            for tree in self.trees:
                tree_predictions += self.learning_rate * tree.predict(Xq)
                
            predictions[mask] = tree_predictions
            
        return predictions

# Train the gradient boosting LTR algorithm on the training set
ltr = GradientBoostingLTR(num_trees=100, learning_rate=0.1, max_depth=6)
ltr.fit(train_features, train_labels, train_data['query_id'].values)

# Evaluate the LTR algorithm on the test set using NDCG@10
predictions = ltr.predict(test_features, test_data['query_id'].values)
test_ndcg = ndcg(test_labels, predictions, k=10)

print("Test NDCG@10:", test_ndcg)


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [28]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score

# Load the MSLR-WEB10K dataset
data = pd.read_csv("MSLR-WEB10K/Fold1/df_train.csv")

# Define the necessary functions
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def ranknet_loss(preds, labels):
    n = preds.shape[0]
    S = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if labels[i] > labels[j]:
                S[i][j] = 1
            elif labels[i] < labels[j]:
                S[i][j] = -1
    
    P = sigmoid(np.subtract.outer(preds, preds))
    P_diff = np.subtract.outer(P, P)
    S_diff = np.subtract.outer(S, S)
    return -np.sum(S_diff * P_diff) / (n * (n - 1))

def ndcg_at_k(preds, labels, k):
    return ndcg_score(np.array([labels]), np.array([preds]), k=k)

# Split the data into training, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Extract the features and labels from the data
train_features = train_data.iloc[:, 2:].values
train_labels = train_data.iloc[:, 1].values

val_features = val_data.iloc[:, 2:].values
val_labels = val_data.iloc[:, 1].values

test_features = test_data.iloc[:, 2:].values
test_labels = test_data.iloc[:, 1].values

# Normalize the features
train_features = (train_features - np.mean(train_features, axis=0)) / np.std(train_features, axis=0)
val_features = (val_features - np.mean(train_features, axis=0)) / np.std(train_features, axis=0)
test_features = (test_features - np.mean(train_features, axis=0)) / np.std(train_features, axis=0)

# Define the hyperparameters for the LTR algorithm
num_features = train_features.shape[1]
learning_rate = 0.01
num_epochs = 100
batch_size = 32

# Train the LTR algorithm using mini-batch gradient descent
W = np.zeros((num_features,))
for epoch in range(num_epochs):
    for i in range(0, len(train_features), batch_size):
        batch_features = train_features[i:i+batch_size]
        batch_labels = train_labels[i:i+batch_size]
        
        batch_preds = np.dot(batch_features, W)
        batch_loss = ranknet_loss(batch_preds, batch_labels)
        
        grad = np.zeros((num_features,))
        for j in range(batch_size):
            for k in range(batch_size):
                if batch_labels[j] > batch_labels[k]:
                    grad += (sigmoid(batch_preds[j] - batch_preds[k]) * (batch_features[j] - batch_features[k]))
                elif batch_labels[j] < batch_labels[k]:
                    grad += (sigmoid(batch_preds[k] - batch_preds[j]) * (batch_features[k] - batch_features[j]))
        
        W -= learning_rate * grad / batch_size
        
    val_preds = np.dot(val_features, W)
    val_ndcg = ndcg_at_k(val_preds, val_labels, k=10)
    
    print(f"Epoch {epoch}")


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Light GBM

In [32]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, make_scorer

# Load the MSLR-WEB10K dataset
data = pd.read_csv("MSLR-WEB10K/Fold1/df_train.csv")

# Split the data into training, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Extract the features and labels from the data
train_features = train_data.iloc[:, 2:]
train_labels = train_data.iloc[:, 1]

val_features = val_data.iloc[:, 2:]
val_labels = val_data.iloc[:, 1]

test_features = test_data.iloc[:, 2:]
test_labels = test_data.iloc[:, 1]

# Define a custom NDCG scorer
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=10)

# Train a LightGBM model on the training set
params = {'objective': 'lambdarank', 'metric': 'ndcg', 'ndcg_eval_at': 10, 'learning_rate': 0.1,
          'max_depth': 6, 'num_leaves': 64, 'verbose': 1}
train_dataset = lgb.Dataset(train_features, label=train_labels)
val_dataset = lgb.Dataset(val_features, label=val_labels, reference=train_dataset)
model = lgb.train(params, train_dataset, num_boost_round=200, valid_sets=[train_dataset, val_dataset],
                  early_stopping_rounds=10, verbose_eval=10, feval=ndcg_scorer)

# Evaluate the model on the test set using ndcg_score
predictions = model.predict(test_features)
test_ndcg = ndcg_score(test_labels, predictions, k=10)

print("Test NDCG@10:", test_ndcg)


[LightGBM] [Fatal] Ranking tasks require query information


LightGBMError: Ranking tasks require query information

In [34]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# Load the MSLR-WEB10K dataset
data = pd.read_csv("data.csv")

# Split the data into training, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Extract the features and labels from the data
train_features = train_data.iloc[:, 2:]
train_labels = train_data.iloc[:, 1]

val_features = val_data.iloc[:, 2:]
val_labels = val_data.iloc[:, 1]

test_features = test_data.iloc[:, 2:]
test_labels = test_data.iloc[:, 1]

# Define a function to compute pairwise differences between labels
def pairwise_diff(labels):
    n = len(labels)
    diff_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            diff_matrix[i][j] = max(0, 1 - (labels[i] - labels[j]))
    return diff_matrix

# Train a Learning to Rank model on the training set
n_features = train_features.shape[1]
weights = np.ones(n_features)
eta = 0.01
n_iterations = 100
for iteration in range(n_iterations):
    gradient = np.zeros(n_features)
    for i in range(len(train_labels)):
        xi = train_features.iloc[i]
        yi = train_labels.iloc[i]
        diff_matrix = pairwise_diff(train_labels)
        s_i = np.dot(xi, weights)
        gradient += np.sum([((diff_matrix[i][j] - diff_matrix[j][i]) / (1 + np.exp(s_i - np.dot(xi, weights) + np.dot(train_features.iloc[j], weights)))) * (xi - train_features.iloc[j]) for j in range(len(train_labels))], axis=0)
    weights -= eta * gradient

# Evaluate the model on the test set using ndcg_score
predictions = np.dot(test_features, weights)
test_ndcg = ndcg_score([test_labels], [predictions], k=10)

print("Test NDCG@10:", test_ndcg)


KeyError: 0