In [65]:
import pandas as pd
import pyterrier as pt #pip install python-terrier
import os

# Initialise PyTerrier only when pt.started() reports it has not been
# initialised in this kernel yet, so re-running the cell is harmless.
terrier_running = pt.started()
if not terrier_running:
    pt.init()


In [66]:
# Load the product catalogue. product_id is read as a string and reused
# verbatim as the document identifier ("docno").
products = pd.read_csv("products.csv", dtype={"product_id": str})
products["docno"] = products["product_id"]

# Build the title index only when it is missing, so the notebook also runs on
# a fresh checkout while an existing index is reused untouched.
# A Terrier index directory contains a data.properties file, which is used
# here as the "index already built" marker.
if not os.path.exists(os.path.join("./pd_index", "data.properties")):
    pd_indexer = pt.DFIndexer("./pd_index")
    indexref = pd_indexer.index(products["title"], products["docno"])

In [67]:
## Build the topics table (one row per distinct query).
pairs = pd.read_csv("pairs.csv", dtype={"product_id": str})
# Give every distinct query string its own group number as qid.
pairs["qid"] = pairs.groupby(["query"]).ngroup()
topics = (
    pairs.loc[:, ["qid", "query"]]
    .drop_duplicates("qid")
    .astype({"qid": str})
)
topics

Unnamed: 0,qid,query
0,83,Convite Padrinhos Batismo
1,92,Decoracao De Casamento
2,230,Toalha De Lavabo
3,59,Calendario 2023 Editavel
4,96,Ecobag
...,...,...
1521,131,Lembrancinha Barata De Natal
1550,166,Minha Vida É Uma Viagem
1648,193,Presente De Formatura Infantil
3875,174,Musicas Nacionais Romanticas Anos 80


In [68]:
## Build the qrels source table.
# Expected qrels format: either a path to a qrels file or a pandas DataFrame
# with columns ['qid', 'docno', 'label'].
joint_table = pd.merge(products, pairs, on="product_id")
joint_table.head()

Unnamed: 0,product_id,title,tags,creation_date,price,weight,express_delivery,category,minimum_quantity,print_count_product,...,order_count_product,docno,pair_id,query,search_position,print_count_query,view_count_query,cart_count_query,order_count_query,qid
0,101,Jogo Banheiro de Crochê de 3 Peças,"['#jogobanheiro #croche #tapetes', 'decoração'...",2022-09-25 13:43:36,110.0,1.0,1,Técnicas de Artesanato,1,11,...,0,101,1520418423149,Necessarie,281,143,5,0,0,178
1,101,Jogo Banheiro de Crochê de 3 Peças,"['#jogobanheiro #croche #tapetes', 'decoração'...",2022-09-25 13:43:36,110.0,1.0,1,Técnicas de Artesanato,1,11,...,0,101,55,Necessaire,281,104,3,0,0,176
2,106,Guardanapos de Tecido - 100 unidades,"['guardanapos de tecido', 'guradanapo', 'festa...",2014-12-26 18:47:48,269.5,0.0,0,Casa,1,62,...,0,106,278,Bebê,86,346,7,1,1,21
3,47,Toalha Papai Noel,"['natal', 'toalha de natal', 'toalha de mesa',...",2013-11-06 20:43:27,291.1,0.0,0,Casa,1,423,...,0,47,27,Lembrancinha Copa Do Mundo,160,798,1,0,0,135
4,8589941942,Caixa para 1 bis feliz natal cliente como você...,"['lembrança', 'personalizados', 'festa', 'caix...",2021-11-22 15:02:30,45.0,0.0,0,Lembrancinhas,30,2746,...,2,8589941942,1606317768786,Decoracao De Natal,344,785,4,0,0,93


In [69]:
# Keep only the columns needed for qrels, expose search_position under the
# name "label", and retain the rows whose label is below 40.
qrels = (
    joint_table.loc[:, ["qid", "docno", "search_position"]]
    .rename(columns={"search_position": "label"})
    .query("label < 40")
)
qrels

Unnamed: 0,qid,docno,label
11,54,17179869287,32
12,49,17179869287,7
27,118,25769822426,26
29,67,25769822413,13
47,87,54,30
...,...,...,...
89757,206,25769812260,6
89776,243,34359738414,9
89778,123,8589936991,10
89796,96,17179880662,14


In [70]:
# Binarise relevance: label is 1 when the pair's search_position is below
# RELEVANT_POSITION_CUTOFF, otherwise 0.
# The label is always derived from joint_table's untouched search_position
# (aligned on the row index that qrels inherits from joint_table), so
# re-running this cell cannot collapse already-binarised 0/1 labels into all
# ones, which is what recomputing from qrels["label"] itself would do.
RELEVANT_POSITION_CUTOFF = 10
search_positions = joint_table.loc[qrels.index, "search_position"]
qrels = qrels.assign(
    label=(search_positions < RELEVANT_POSITION_CUTOFF).astype("int64")
).astype({"qid": str})
qrels

Unnamed: 0,qid,docno,label
11,54,17179869287,0
12,49,17179869287,1
27,118,25769822426,0
29,67,25769822413,0
47,87,54,0
...,...,...,...
89757,206,25769812260,1
89776,243,34359738414,1
89778,123,8589936991,0
89796,96,17179880662,0


In [71]:
# Inspect the column dtypes of the qrels table (qid is cast to str beforehand).
qrels.dtypes

qid      object
docno    object
label     int64
dtype: object

In [72]:
# Inspect the column dtypes of the topics table (qid is cast to str beforehand).
topics.dtypes

qid      object
query    object
dtype: object

## Experimentos

In [78]:
# Open the on-disk index and define the three retrieval systems to compare.
index = pt.IndexFactory.of("./pd_index")

# num_results=10 already limits TF_IDF to ten results per query, so wrapping
# the retriever in an additional `% 10` rank cutoff would be redundant.
# NOTE(review): BM25 and PL2 below keep their default result depth, so the
# TF_IDF MAP is not computed over the same ranking depth as the other two —
# confirm this asymmetry is intended before comparing the numbers.
# NOTE(review): 'tf_idf.k_1' and 'c' are passed through as controls unchanged;
# verify that Terrier honours both for the TF_IDF weighting model.
tf_idf = pt.BatchRetrieve(
    index,
    wmodel="TF_IDF",
    verbose=True,
    num_results=10,
    controls={"tf_idf.k_1": 1.2, "c": 0.1},
)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
pl2 = pt.BatchRetrieve(index, wmodel="PL2")


In [79]:
# Evaluate the three retrievers on the same topics and qrels, reporting MAP.
df = pt.Experiment(
    retr_systems=[tf_idf, bm25, pl2],
    topics=topics,
    qrels=qrels,
    eval_metrics=["map"],
)

BR(TF_IDF): 100%|█████████████████████████████████████████████████████| 249/249 [00:00<00:00, 401.49q/s]


In [80]:
# Show the per-system MAP table returned by pt.Experiment.
# NOTE(review): every MAP value shown is close to zero — worth confirming that
# the docnos stored in ./pd_index match products["docno"] and that the qrels
# labels mean what is intended before drawing conclusions.
df

Unnamed: 0,name,map
0,"RankCutoff(BR(TF_IDF), 10)",0.0
1,BR(BM25),0.000216
2,BR(PL2),0.000233
