In [16]:
#!pip install python-terrier

import numpy as np
import pandas as pd
import pyterrier as pt
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64/"
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])



# Chargement des fichiers

### QA

On génére les fichiers :
- <code>relevant</code> : dataframe des passages pertinents pour chaque requete
- <code>train_queries</code>, <code>dev_queries</code>, <code>test_queries</code> : les dataframe de reqête

Les docno sont sous la forme {numéro doc},{numéro passage}

In [50]:
def string_to_list (s):
    """
    Converti un string de liste en une liste d'entier
    ex "21,22"->[21,22]
    """
    l=[]
    for i in s.split(',') :
        l.append(int(i))
    return l

In [137]:
relevant = pd.DataFrame({'qid' : pd.Series(dtype='int'), 'docno' : pd.Series(dtype='str'), 'iteration' : pd.Series(dtype='float')})

def load_tsv (fname) :
    global relevant
    
    queries = pd.DataFrame({'qid' : pd.Series(dtype='int'), 'query' : pd.Series(dtype='str')})
    
    data = pd.read_csv(fname+'.tsv', sep='\t')
    for i in range(len(data)) :
        #ajout dans queries
        tupl=pd.DataFrame({'qid' : [data['QID'][i]],
                                      'query' : [data['Question'][i]]})
        queries=pd.concat([queries,tupl])
        
        #ajout dans relevant
        passages = string_to_list(data['RelevantPassages'][i])
        tupl=pd.DataFrame({'qid' : [data['QID'][i] for i in passages],
                           'docno' : [f"{data['DocumentID'][i]},{p}" for p in passages],
                           'iteration' : [1 for i in passages]})
        relevant=pd.concat([relevant,tupl])
    return queries



train_queries=load_tsv("data/train").reset_index().drop(columns=['index'])
dev_queries=load_tsv("data/dev").reset_index().drop(columns=['index'])
test_queries=load_tsv("data/test").reset_index().drop(columns=['index'])

relevant=relevant.reset_index().drop(columns=['index'])

In [139]:
relevant

Unnamed: 0,qid,docno,iteration
0,956,6724,1.0
1,195,3591,1.0
2,557,3592,1.0
3,956,2854,1.0
4,2610,57914,1.0
...,...,...,...
6962,782,3932,1.0
6963,3738,8294,1.0
6964,844,8295,1.0
6965,1140,3641,1.0


In [140]:
train_queries

Unnamed: 0,qid,query
0,3086,What is the role of conversionism in Evangelic...
1,195,How did the assault on the Bastille the first ...
2,557,What is the prehistory of Albania?
3,1508,What significance did Bulgaria have in the end...
4,956,What is the rationale of support of the Common...
...,...,...
3327,3700,How Do We Separate The Gray Wolf Or Grey Wolf ...
3328,1588,What is Greenland's self governing act?
3329,1971,How did conquering of Alexander the great effe...
3330,1153,What is Amnesty International?


### Documents

On charge le json dans un fichier pyterrier index

In [121]:
pt_index_path = './passageQA'
if not os.path.exists(pt_index_path + "/data.properties") :
    # création de l'index
    indexer = pt.DFIndexer(pt_index_path, overwrite=True) 

    # chargement du json en dataframe
    with open('data/document_passages.json', 'r') as f:
        data_row = json.load(f)
    data={"docno" : [k0+','+k1 for k0,v0 in data_row.items() for k1,v1 in v0.items()],
          "text"  : [v1 for k0,v0 in data_row.items() for k1,v1 in v0.items()]}
    docs_df = pd.DataFrame(data)
    
    index_ref = indexer.index(docs_df["text"], docs_df["docno"])

else:
    # dans le cas où l'index existe déjà
    index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")
index = pt.IndexFactory.of(index_ref)

In [122]:
print(index.getCollectionStatistics().toString())

Number of documents: 50612
Number of terms: 103283
Number of postings: 3056988
Number of fields: 0
Number of tokens: 3906884
Field names: []
Positions:   false

