In [22]:
import pandas as pd
import numpy as np
import ast

In [277]:
# Convert a LETOR row such as "2 qid:13 1:2 2:0 3:2 ..." into the dict
# {'rel': 2, 'qid': 13, 1: 2, 2: 0, 3: 2, ...}, then build a DataFrame from those dicts.
def split_fea_name_val(df: pd.DataFrame):
    """Expand whitespace-split LETOR rows into a numeric DataFrame.

    Every input row is read positionally: the first cell is the relevance
    label, the second a ``qid:<id>`` token, and each remaining cell a
    ``<feature>:<value>`` token.

    The tokens are parsed directly instead of being pasted into Python source
    text and evaluated, so values that are not valid Python literals (for
    example ``007`` or ``nan``) no longer abort the conversion.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame as produced by ``pd.read_csv(..., sep=r'\\s+', header=None)``.

    Returns
    -------
    pd.DataFrame
        Frame with the same index as ``df`` and the columns ``'rel'``,
        ``'qid'`` and one integer-named column per feature id.

    Raises
    ------
    ValueError
        If a feature id is not an integer or a value is not numeric.
    """
    def _parse_number(text):
        # Keep integers as integers; anything else must be a valid float.
        text = str(text).strip()
        try:
            return int(text)
        except ValueError:
            return float(text)

    def _row_to_record(row):
        record = {
            'rel': _parse_number(row[0]),
            'qid': _parse_number(row[1].split(':', 1)[1]),
        }
        for cell in row[2:]:
            fe, val = cell.split(':', 1)
            record[int(fe)] = _parse_number(val)
        return record

    # Plain tuples are much cheaper to iterate than the per-row Series that
    # DataFrame.apply(axis=1) would build.
    records = [_row_to_record(row) for row in df.itertuples(index=False, name=None)]
    return pd.DataFrame(index=df.index, data=records)

def adjust_features(df: pd.DataFrame, cols: list):
    """Renumber the ``<id>:`` prefixes of the kept columns, in place.

    ``cols`` lists the column labels that were kept. The first entry is
    skipped. For every later entry at position ``p`` with label ``c``, the
    text ``"<c-1>:"`` in column ``c`` is replaced with ``"<p-1>:"``.

    Returns the same (mutated) ``df`` object.
    """
    for position, column in list(enumerate(cols))[1:]:
        old_prefix = str(column - 1) + ':'
        new_prefix = str(position - 1) + ':'
        df[column] = df[column].str.replace(old_prefix, new_prefix)
    return df

def sort_qid(df: pd.DataFrame):
    """Sort ``df`` in place by the integer query id found in column ``1``.

    Column ``1`` holds ``qid:<id>`` tokens. Two scratch columns are added to
    expose the id as an integer sort key and are dropped again afterwards, so
    the set of columns is unchanged. Nothing is returned.
    """
    scratch = ['qid-str', 'qid']
    # 'qid:13' -> ('qid', '13'); only the second part is used as the key.
    df[scratch] = df[1].str.split(':', expand=True)
    df['qid'] = df['qid'].astype(int)

    df.sort_values(by='qid', inplace=True)
    df.drop(columns=scratch, inplace=True)
    
def split_X_y_qid(df: pd.DataFrame):
    """Parse raw LETOR rows and split them into features, labels and query ids.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame accepted by ``split_fea_name_val``.

    Returns
    -------
    tuple
        ``(X, y, qid)`` where ``X`` is a DataFrame holding every feature
        column, ``y`` is the ``'rel'`` column and ``qid`` the ``'qid'`` column.
    """
    parsed = split_fea_name_val(df)
    # Everything except the label and the query id is a feature. Selecting by
    # exclusion keeps the last feature column (the former
    # range(1, columns[-1]) stopped one short of it) and also works when the
    # feature ids are not contiguous.
    feature_columns = [col for col in parsed.columns if col not in ('rel', 'qid')]
    X = parsed[feature_columns]
    y = parsed['rel']
    qid = parsed['qid']
    return X, y, qid

# MSLR-WEB data preparation - Sort data based on qid

In [286]:
%%time

# Convert the train/vali/test files of every fold into one NumPy archive per
# fold, holding the arrays X_<split>, y_<split> and qid_<split>.

# output_postfix = 'reduced'
bp, dirname = '/data/khodadaa/mslr/MSLR-WEB10K/', 'mslr-web10k'
# bp, dirname = '/data/khodadaa/mslr/MSLR-WEB30K/', 'mslr-web30k'
# bp, dirname = '/data/khodadaa/mslr/Gov/Feature_min/2003_hp_dataset/', '2003_hp_dataset'

# None keeps every column of the input files.
cols = None
# cols = list(range(1,6)) #+ list(range(11,26)) + list(range(71,76)) + list(range(105, 132))
# cols = [0, 1] + list(map(lambda x: x+1, keep))

for fn in range(1, 6):
    fold = 'Fold' + str(fn)
    npz_dict = {}
    for gr in ['train', 'vali', 'test']:
        # Raw string: '\s' is not a valid escape sequence in a normal string
        # literal and triggers a warning on recent Python versions.
        df = pd.read_csv(bp + '{}/{}.txt'.format(fold, gr), sep=r'\s+', header=None, usecols=cols)
        print(fold, gr, df.shape)

        X, y, qid = split_X_y_qid(df)
        print('Split done!')
        npz_dict['X_' + gr] = X.values
        npz_dict['y_' + gr] = y.values
        npz_dict['qid_' + gr] = qid.values
    # One archive per fold, written next to the fold directories.
    np.savez(file=bp + dirname + fold, **npz_dict)
        
        
#         sort_qid(df)
#         df = adjust_features(df, cols)
#         df.to_csv(bp + '{}/{}_{}.csv'.format(fold, gr, output_postfix), index=False, sep=' ', header=False)

Fold1 train (723412, 138)
Split done!
Fold1 vali (235259, 138)
Split done!
Fold1 test (241521, 138)
Split done!
Fold2 train (716683, 138)


KeyboardInterrupt: 

# MSLR data analysis

In [2]:
%%time
# Load the three Fold1 splits of MSLR-WEB30K as whitespace-separated tokens.
# The separator is a raw string because '\s' is not a valid escape sequence
# in a normal string literal.
fold1_dir = '/data/khodadaa/mslr/MSLR-WEB30K/Fold1/'
train_df, valid_df, test_df = [
    pd.read_csv(fold1_dir + split_file, header=None, sep=r'\s+')
    for split_file in ('train.txt', 'vali.txt', 'test.txt')
]
print(train_df.shape, valid_df.shape, test_df.shape)
print(train_df.shape[0] + valid_df.shape[0] + test_df.shape[0])

(2270296, 138) (747218, 138) (753611, 138)
3771125
CPU times: user 2min 38s, sys: 10.8 s, total: 2min 49s
Wall time: 2min 34s


In [3]:
def map_column_to_score():
    """Return a dict mapping feature ids 1..136 to readable names.

    The first 125 ids enumerate every metric crossed with the five text
    fields, with the field varying fastest and written as ``metric(field)``.
    The last 11 ids are the stand-alone features, in the listed order.
    """
    fields = ['bdy', 'anc', 'ttl', 'url', 'wdc']
    measures = [
        'CovQTNum', 'CovQTR', 'StrmLen', 'idf',
        'sum-tf', 'min-tf', 'max-tf', 'mean-tf', 'var-tf',
        'sum-StrLen-norm-tf', 'min-StrLen-norm-tf', 'max-StrLen-norm-tf',
        'mean-StrLen-norm-tf', 'var-StrLen-norm-tf',
        'sum-tfidf', 'min-tfidf', 'max-tfidf', 'mean-tfidf', 'var-tfidf',
        'boolModel', 'VecSpacMod', 'BM25',
        'LMIR.ABS', 'LMIR.DIR', 'LMIR.JM',
    ]
    standalone = [
        'NumSlshURL', 'url-len', 'inlink-num', 'outlink-num', 'PageRank',
        'SiteRank', 'QualScore', 'QualScore2', 'QUrlCliCnt', 'UrlCliCnt',
        'UrlDwellTime',
    ]

    names = [measure + '({})'.format(field) for measure in measures for field in fields]
    names.extend(standalone)
    # Feature ids are 1-based.
    return dict(enumerate(names, start=1))

In [4]:
%%time
# Stack the three splits into one frame with a fresh 0..n-1 index.
# pd.concat replaces the chained DataFrame.append calls: append is deprecated
# and no longer exists in pandas 2.x, and a single concat avoids copying the
# training rows twice.
df_orig = pd.concat([train_df, valid_df, test_df], ignore_index=True)
df_orig.shape

CPU times: user 47.2 s, sys: 10.6 s, total: 57.8 s
Wall time: 31.1 s


In [None]:
%%time
# Parse the raw tokens into numeric columns, then order the rows by query id
# and, within a query, by relevance label.
df = split_fea_name_val(df_orig).sort_values(by=['qid', 'rel'])

In [8]:
%%time
# Write the parsed data twice: once with the numeric feature ids as column
# names, once with the readable names from map_column_to_score().
df.to_csv('/data/khodadaa/mslr/MSLR-WEB30K/mslr-web30k.csv')
score_names = map_column_to_score()
df_rn = df.rename(columns=score_names)
df_rn.to_csv('/data/khodadaa/mslr/MSLR-WEB30K/mslr-web30k-colname.csv')

CPU times: user 16min 23s, sys: 46.8 s, total: 17min 10s
Wall time: 16min 28s


# Learn to rank

In [26]:
import pyltr

# Directory holding train.txt / vali.txt / test.txt for the fold used below.
# NOTE: this rebinds `bp`, which the data-preparation cell sets to the dataset
# root (with a trailing slash); here it has no trailing slash.
bp = '/data/khodadaa/mslr/MSLR-WEB10K/Fold1'
# Passed as `stop_after` to the validation monitor and embedded in the name of
# the predictions CSV.
stp_after = 10

In [27]:
%%time
# Read the train / validation / test files of the fold in `bp` with pyltr's
# LETOR reader. All three files are opened up front and closed together when
# the block exits.
with open(bp + '/train.txt') as trainfile, \
     open(bp + '/vali.txt') as valifile, \
     open(bp + '/test.txt') as evalfile:
        # read_dataset returns four values; the fourth is discarded. The first
        # three are kept as features (X), labels (y) and query ids (qids) --
        # naming inferred from how they are used further down.
        TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
        VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
        EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

CPU times: user 3min 44s, sys: 3.46 s, total: 3min 48s
Wall time: 3min 48s


In [28]:
# Ranking metric shared by the validation monitor and the model.
metric = pyltr.metrics.NDCG(k=10)

# Monitor evaluated on the validation split during training.
monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=stp_after)

# LambdaMART hyper-parameters, collected in one place.
lambdamart_params = {
    'metric': metric,
    'n_estimators': 1000,
    'learning_rate': 0.02,
    'max_features': 0.5,
    'query_subsample': 0.5,
    'max_leaf_nodes': 10,
    'min_samples_leaf': 64,
    'verbose': 1,
}
model = pyltr.models.LambdaMART(**lambdamart_params)

In [None]:
%%time
# Fit LambdaMART on the training split. The validation monitor is passed in so
# that training can be tracked on the validation data (presumably stopping
# early after `stp_after` rounds without improvement -- confirm in pyltr docs).
model.fit(TX, Ty, Tqids, monitor=monitor)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.0348       0.0294      570.62m      C:      0.0343 B:      0.0343 S:  0
    2       0.0834       0.0469      562.10m      C:      0.0874 B:      0.0874 S:  0
    3       0.2343       0.1494      558.47m      C:      0.2357 B:      0.2357 S:  0
    4       0.2736       0.0429      559.88m      C:      0.2755 B:      0.2755 S:  0
    5       0.2828       0.0088      555.48m      C:      0.2838 B:      0.2838 S:  0
    6       0.3029       0.0225      554.74m      C:      0.3028 B:      0.3028 S:  0
    7       0.3316       0.0219      555.91m      C:      0.3317 B:      0.3317 S:  0
    8       0.3369       0.0025      557.38m      C:      0.3340 B:      0.3340 S:  0
    9       0.3381       0.0010      555.81m      C:      0.3353 B:      0.3353 S:  0
   10       0.3320      -0.0003      554.89m      C:      0.3358 B:      0.3358 S:  0
   15       0.3431       0.0026      553.52m      C: 

In [158]:
def get_frequency(path=None):
    """Return the URL click count of every row of a LETOR test file.

    Column 136 of the whitespace-split file holds feature 135 as a
    ``135:<count>`` token; the prefix is stripped and the rest cast to int.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``bp + '/test.txt'``, using the notebook
        global ``bp``.

    Returns
    -------
    numpy.ndarray
        One integer click count per row, in file order.
    """
    if path is None:
        path = bp + '/test.txt'
    # Only one column is needed, so the other 137 are not parsed. The
    # separator is a raw string because '\s' is not a valid escape sequence.
    tokens = pd.read_csv(path, sep=r'\s+', header=None, usecols=[136])[136]
    url_click_counts = tokens.str.replace('135:', '', regex=False).astype(int)
    return url_click_counts.values
    
# Score the evaluation split, attach each row's click count, and save the
# query id / click count / label / prediction table next to the fold data.
Epred = model.predict(EX)
Efreq = get_frequency()
pred_columns = ['Eqids', 'Efreq', 'Ey', 'Epred']
pred_table = pd.DataFrame(
    {'Eqids': Eqids, 'Ey': Ey, 'Epred': Epred, 'Efreq': Efreq},
    columns=pred_columns)
pred_table.to_csv('{}/lambda_preds_{}.csv'.format(bp, stp_after), index=False)

In [233]:
# Compare the mean metric value of a random ordering with that of the model's
# predictions on the evaluation split.
random_score = metric.calc_mean_random(Eqids, Ey)
print('Random ranking:', random_score)
model_score = metric.calc_mean(Eqids, Ey, Epred)
print('Our model:', model_score)

Random ranking: 0.18912831330223595
Our model: 0.435418749785479


# Cache size effect on test scores

In [262]:
def get_cache_score_distrib(df: pd.DataFrame, metric_type='NDCG', local=False, k=10):
    """Score the top-i% most frequently clicked rows for i = 100, 99, ..., 1.

    Rows are ranked by ``Efreq`` (descending). For each percentage ``i`` the
    first ``floor(i * n / 100)`` rows are kept, re-sorted by ``Eqids`` and
    scored with the chosen metric.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``Efreq``, ``Eqids``, ``Ey`` and ``Epred``.
    metric_type : {'NDCG', 'DCG'}
        Which pyltr metric to use.
    local : bool
        If False, one metric object is reused for all percentages. If True, a
        fresh metric object is created for every percentage, so nothing the
        object may remember from larger subsets carries over.
    k : int
        Cut-off passed to the metric.

    Returns
    -------
    pd.Series
        Mean metric value indexed by percentage, in the order 100 .. 1.

    Raises
    ------
    ValueError
        If ``metric_type`` is not one of the supported names.
    """
    def get_metric(m_type, k):
        if m_type == 'NDCG':
            return pyltr.metrics.NDCG(k=k)
        if m_type == 'DCG':
            return pyltr.metrics.DCG(k=k)
        # Fail here instead of returning None and crashing later on
        # None.calc_mean with an unrelated-looking AttributeError.
        raise ValueError(
            "metric_type must be 'NDCG' or 'DCG', got {!r}".format(m_type))

    metric = get_metric(metric_type, k)
    by_freq = df.sort_values(by='Efreq', ascending=False)
    n_rows = by_freq.shape[0]

    scores = {}
    for pct in range(100, 0, -1):
        if local:
            metric = get_metric(metric_type, k)
        # Integer arithmetic: int((pct / 100) * n_rows) loses a row whenever
        # the float product lands just below a whole number (e.g. 29% of 100).
        size = pct * n_rows // 100
        subset = by_freq.iloc[:size].sort_values(by='Eqids')
        scores[pct] = metric.calc_mean(
            qids=subset['Eqids'].values,
            targets=subset['Ey'].values,
            preds=subset['Epred'].values)
    # Build the Series once rather than growing it entry by entry.
    return pd.Series(scores, dtype=float)

In [192]:
# Reload the saved predictions with explicit column types; query ids are kept
# as strings.
pred_dtypes = {'Eqids': str, 'Efreq': int, 'Ey': float, 'Epred': float}
pred_df = pd.read_csv(
    '/data/khodadaa/mslr/MSLR-WEB10K/Fold1/lambda_preds_10.csv',
    dtype=pred_dtypes)

In [264]:
# One row per cache size (percentage 1..100) and one column per combination of
# metric and scoring mode; the returned Series are aligned on the percentage.
mslr_cache_distribs = pd.DataFrame(index=list(range(1, 101)))
for metric_name in ('NDCG', 'DCG'):
    for mode_label, use_local in (('global', False), ('local', True)):
        column_name = '{}-{}'.format(metric_name, mode_label)
        mslr_cache_distribs[column_name] = get_cache_score_distrib(
            pred_df, metric_type=metric_name, local=use_local)
mslr_cache_distribs.to_csv('/data/khodadaa/mslr/MSLR-WEB10K/Fold1/cache_scoring_distributions.csv')