In [1]:
import numpy as np
import pandas as pd
import lightgbm

In [139]:
df = pd.read_csv('X_train.csv')
nbr_of_rows = len(df)
split_point = int(nbr_of_rows / 10 * 8)
train_df = df[:split_point]  # first 80%
validation_df = df[split_point:]  # remaining 20%

In [140]:
qids_train = train_df.groupby("q_id")["q_id"].count().to_numpy()
X_train = train_df.drop(["q_id", "rel"], axis=1)
y_train = train_df["rel"]
qids_validation = validation_df.groupby("q_id")["q_id"].count().to_numpy()
X_validation = validation_df.drop(["q_id", "rel"], axis=1)
y_validation = validation_df["rel"]

In [141]:
model = lightgbm.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 3, 5, 10, 100],
    n_estimators=35
)

model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    eval_set=[(X_validation, y_validation)],
    eval_group=[qids_validation],
    eval_at=1,
    verbose=1,
)



[1]	valid_0's ndcg@1: 0.06	valid_0's ndcg@3: 0.107737	valid_0's ndcg@5: 0.12646	valid_0's ndcg@10: 0.151718	valid_0's ndcg@100: 0.295656
[2]	valid_0's ndcg@1: 0.135	valid_0's ndcg@3: 0.215474	valid_0's ndcg@5: 0.246022	valid_0's ndcg@10: 0.270475	valid_0's ndcg@100: 0.380876
[3]	valid_0's ndcg@1: 0.125	valid_0's ndcg@3: 0.207974	valid_0's ndcg@5: 0.240456	valid_0's ndcg@10: 0.27734	valid_0's ndcg@100: 0.380454
[4]	valid_0's ndcg@1: 0.13	valid_0's ndcg@3: 0.20351	valid_0's ndcg@5: 0.242234	valid_0's ndcg@10: 0.282075	valid_0's ndcg@100: 0.384046
[5]	valid_0's ndcg@1: 0.115	valid_0's ndcg@3: 0.199284	valid_0's ndcg@5: 0.244029	valid_0's ndcg@10: 0.282971	valid_0's ndcg@100: 0.382279
[6]	valid_0's ndcg@1: 0.115	valid_0's ndcg@3: 0.200474	valid_0's ndcg@5: 0.245658	valid_0's ndcg@10: 0.277816	valid_0's ndcg@100: 0.381212
[7]	valid_0's ndcg@1: 0.105	valid_0's ndcg@3: 0.192974	valid_0's ndcg@5: 0.234727	valid_0's ndcg@10: 0.268764	valid_0's ndcg@100: 0.375343
[8]	valid_0's ndcg@1: 0.12	valid

LGBMRanker(metric='ndcg', n_estimators=35, ndcg_eval_at=[1, 3, 5, 10, 100],
           objective='lambdarank')

In [142]:
dev_queries = pd.read_csv('dev-bm25-features.csv')
only_features = dev_queries.drop(['q_id', 'doc_id'], axis=1).to_numpy()
outputs = model.predict(only_features)
dev_queries['score'] = outputs
result = dev_queries.sort_values(['q_id', 'score'], ascending=False)
result

Unnamed: 0,1,2,3,4,5,6,7,8,q_id,doc_id,score
137203,10.273640,0.736925,0.192308,11.653727,649,52,3,22,1102400,D677570,1.202770
137220,9.955198,0.403051,0.126126,11.653727,1703,59,4,22,1102400,D251163,0.817490
137210,10.236705,0.364816,0.099359,11.653727,2572,49,5,22,1102400,D251165,0.799442
137201,10.961761,0.424772,0.129568,11.653727,2514,47,4,22,1102400,D677568,0.758341
137202,10.544532,0.665221,0.134884,11.653727,1843,62,4,22,1102400,D1450527,0.602159
...,...,...,...,...,...,...,...,...,...,...,...
239742,8.787430,0.415093,0.066667,13.244507,2768,78,4,25,2,D1809949,-2.036755
239759,8.758108,0.352020,0.056452,13.244507,3329,66,4,25,2,D2512105,-2.117368
239781,9.599329,0.088585,0.016251,13.244507,8095,31,3,25,2,D2001023,-2.126583
239784,8.842834,0.141128,0.029240,13.244507,3264,61,6,25,2,D247176,-2.168612


In [143]:
result = result.reset_index()
result.loc[0]

index        137203
1          10.27364
2          0.736925
3          0.192308
4         11.653727
5               649
6                52
7                 3
8                22
q_id        1102400
doc_id      D677570
score       1.20277
Name: 0, dtype: object

In [144]:
result.loc[0].score

1.202770362863172

In [145]:
with open('dev-bm25-reranked_k_1000.trec', 'w') as fout:
    for i in range(len(result)):
        row = result.loc[i]
        fout.write(f'{row.q_id} Q0 {row.doc_id} {i % 100} {row.score} reranked\n')

In [146]:
test_queries = pd.read_csv('test-bm25-features.csv')
only_features = test_queries.drop(['q_id', 'doc_id'], axis=1).to_numpy()
outputs = model.predict(only_features)
test_queries['score'] = outputs
result = test_queries.sort_values(['q_id', 'score'], ascending=False)

In [147]:
with open('test-bm25-reranked_k_1000.trec', 'w') as fout:
    for i in range(len(result)):
        row = result.loc[i]
        fout.write(f'{row.q_id} Q0 {row.doc_id} {i % 100} {row.score} reranked\n')