In [118]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [119]:
df = pd.read_csv('X_train.csv')
nbr_of_rows = len(df)
split_point = int(nbr_of_rows / 10 * 8)
train_df = df[:split_point]  # first 80%
validation_df = df[split_point:]  # remaining 20%

In [120]:
X_train = train_df.drop(["q_id", "rel"], axis=1)
y_train = train_df["rel"]

X_validation = validation_df.drop(["q_id", "rel"], axis=1)
y_validation = validation_df["rel"]


In [121]:
X_train

Unnamed: 0,1,2,3,4,5,6,7,8
0,12.416857,0.433013,0.112903,13.782650,2827,81,4,50
1,13.119300,0.635571,0.168337,13.782650,4859,73,4,50
2,13.035340,0.366829,0.100407,13.782650,6470,70,4,50
3,12.894505,0.700222,0.197279,13.782650,2693,133,7,50
4,7.818214,0.371969,0.089655,15.364447,2254,70,4,46
...,...,...,...,...,...,...,...,...
3700,19.494394,1.197688,0.534772,20.729592,3453,63,4,74
3701,19.536617,0.949154,0.455782,20.729592,3733,87,4,74
3702,18.287802,1.002268,0.477876,20.729592,1843,73,4,74
3703,18.144333,0.841441,0.355634,20.729592,2423,88,4,74


In [122]:
model = LogisticRegression(max_iter=5000)

model.fit(
    X=X_train,
    y=y_train
)

LogisticRegression(max_iter=5000)

In [123]:
score = model.score(X_validation, y_validation)
score

0.778856526429342

In [124]:
y_predicted = model.predict(X_validation)
f1_score(y_validation, y_predicted)

0.20233463035019456

In [125]:
# By now the model is trained - we evaluate on the test collection
dev_results = pd.read_csv('test-bm25-features.csv')
dev_results

Unnamed: 0,1,2,3,4,5,6,7,8,q_id,doc_id
0,7.003504,0.210713,0.129187,7.729339,4602,37,4,41,1104031,D2401591
1,7.177691,0.104355,0.069088,7.729339,9953,56,5,41,1104031,D3065828
2,6.467798,0.190917,0.100604,7.729339,4761,69,4,41,1104031,D772274
3,6.730981,0.362539,0.205607,7.729339,5246,56,4,41,1104031,D3335222
4,6.730981,0.362539,0.205607,7.729339,5245,55,4,41,1104031,D67568
...,...,...,...,...,...,...,...,...,...,...
19995,5.935711,0.079121,0.042961,8.578480,16495,68,5,24,634428,D1138539
19996,5.539033,0.168392,0.043974,8.578480,4351,61,6,24,634428,D1464474
19997,5.357733,0.128297,0.035461,8.578480,7336,63,7,24,634428,D537515
19998,6.852679,0.069790,0.017391,8.578480,23751,61,6,24,634428,D228927


In [126]:
feature_rows = dev_results.drop(["q_id", "doc_id"], axis=1)
feature_rows

Unnamed: 0,1,2,3,4,5,6,7,8
0,7.003504,0.210713,0.129187,7.729339,4602,37,4,41
1,7.177691,0.104355,0.069088,7.729339,9953,56,5,41
2,6.467798,0.190917,0.100604,7.729339,4761,69,4,41
3,6.730981,0.362539,0.205607,7.729339,5246,56,4,41
4,6.730981,0.362539,0.205607,7.729339,5245,55,4,41
...,...,...,...,...,...,...,...,...
19995,5.935711,0.079121,0.042961,8.578480,16495,68,5,24
19996,5.539033,0.168392,0.043974,8.578480,4351,61,6,24
19997,5.357733,0.128297,0.035461,8.578480,7336,63,7,24
19998,6.852679,0.069790,0.017391,8.578480,23751,61,6,24


In [127]:
# feature_rows['relevant']
dev_results['relevance'] = model.predict(feature_rows)
dev_results

Unnamed: 0,1,2,3,4,5,6,7,8,q_id,doc_id,relevance
0,7.003504,0.210713,0.129187,7.729339,4602,37,4,41,1104031,D2401591,0
1,7.177691,0.104355,0.069088,7.729339,9953,56,5,41,1104031,D3065828,0
2,6.467798,0.190917,0.100604,7.729339,4761,69,4,41,1104031,D772274,0
3,6.730981,0.362539,0.205607,7.729339,5246,56,4,41,1104031,D3335222,0
4,6.730981,0.362539,0.205607,7.729339,5245,55,4,41,1104031,D67568,0
...,...,...,...,...,...,...,...,...,...,...,...
19995,5.935711,0.079121,0.042961,8.578480,16495,68,5,24,634428,D1138539,0
19996,5.539033,0.168392,0.043974,8.578480,4351,61,6,24,634428,D1464474,0
19997,5.357733,0.128297,0.035461,8.578480,7336,63,7,24,634428,D537515,0
19998,6.852679,0.069790,0.017391,8.578480,23751,61,6,24,634428,D228927,0


In [128]:
# output = dev_results.groupby('q_id').apply(lambda x: x.sort_values(by="relevance", ascending=False))
output = dev_results.sort_values(['q_id', 'relevance'], ascending=False)
output

Unnamed: 0,1,2,3,4,5,6,7,8,q_id,doc_id,relevance
6423,5.310766,0.085765,0.026732,7.097959,61766,46,4,25,1136427,D1238297,1
6425,5.592947,0.141030,0.038084,7.097959,60141,37,4,25,1136427,D1238298,1
6455,5.177349,0.136375,0.036301,7.097959,57059,52,4,25,1136427,D717552,1
6483,5.343661,0.134607,0.035132,7.097959,78045,33,4,25,1136427,D1005548,1
6484,5.343661,0.134519,0.035109,7.097959,78096,48,4,25,1136427,D1326552,1
...,...,...,...,...,...,...,...,...,...,...,...
5466,15.143692,0.114327,0.039451,24.491484,4599,110,7,42,11096,D2861657,0
5467,17.286839,0.138549,0.026405,24.491484,29348,53,4,42,11096,D705298,0
5470,15.549438,0.164178,0.025352,24.491484,19267,88,7,42,11096,D950225,0
5484,14.233579,0.119082,0.021397,24.491484,10854,44,4,42,11096,D3528060,0


In [129]:
output.reset_index(inplace=True)

In [130]:
import time

start = time.time()

print()
with open('l2r_baseline_output.trec', 'w') as fout:
    for i in range(output['q_id'].count()):
        score = 1.0 / int(i+1)
        fout.write(f'{output.loc[i]["q_id"]} Q0 {output.loc[i]["doc_id"]} {i%100} {score} anserini\n')

print(f'Took {time.time() - start}s to finish')


Took 7.008204221725464s to finish
