In [1]:
#!pip install python-terrier

import numpy as np
import pandas as pd
import pyterrier as pt
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64/"
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])



PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
#from google.colab import drive
#drive.mount('/content/drive')
#%cd /content/drive/MyDrive/M1_S2/RITAL/projet2
import json
import os
import pandas as pd

In [3]:
#!pip install python-terrier
import pandas as pd

import pyterrier as pt
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

# 1 - Chargement des fichiers

### 1a) Chargement documents

On charge le json dans un fichier pyterrier index

In [5]:
# chargement du json en dataframe
with open('WikiPassageQA/document_passages.json', 'r') as f:
        data_row = json.load(f)
data={"DocumentID" : [int(k0) for k0,v0 in data_row.items() for k1,v1 in v0.items()],
    "docno" : [k0+','+k1 for k0,v0 in data_row.items() for k1,v1 in v0.items()],
      "text"  : [v1 for k0,v0 in data_row.items() for k1,v1 in v0.items()]}
docs_df = pd.DataFrame(data)

In [6]:
import json
pt_index_path = './passageQA'
if not os.path.exists(pt_index_path + "/data.properties") :
    # création de l'index
    indexer = pt.DFIndexer(pt_index_path, overwrite=True) 
    index_ref = indexer.index(docs_df["text"], docs_df["docno"])

else:
    # dans le cas où l'index existe déjà
    index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")
index = pt.IndexFactory.of(index_ref)

In [7]:
print(index.getCollectionStatistics().toString())

Number of documents: 50612
Number of terms: 103283
Number of postings: 3056988
Number of fields: 0
Number of tokens: 3906884
Field names: []
Positions:   false



### 1b) Chargement queries

In [8]:
df_train = pd.read_csv("WikiPassageQA/train.tsv",sep='\t')
df_test = pd.read_csv("WikiPassageQA/test.tsv",sep='\t')
df_dev = pd.read_csv("WikiPassageQA/dev.tsv",sep='\t')

In [27]:
queries_train=df_train[["QID", "Question"]]
queries_train.columns = ['qid', 'query']
queries_test=df_test[["QID", "Question"]]
queries_test.columns = ['qid', 'query']
queries_dev=df_dev[["QID", "Question"]]
queries_dev.columns = ['qid', 'query']

queries_test

Unnamed: 0,qid,query
0,449,What is Iraq's role in political unstabilization?
1,1140,What have been the warmest years of global war...
2,782,How were the Olympic games broadcasted?
3,1421,What was the election turnout for 2008?
4,3738,How has secularism been misinterpreted in Amer...
...,...,...
411,2522,What does the Church mean by Fallen Nature?
412,2851,What is the history of Eurasia?
413,3705,What are the nicknames of Missouri?
414,799,How do endoglossic and exoglossic languages co...


### 1c) Création qrels

In [10]:
df_query = pd.concat([df_train, df_test, df_dev])
df_complete = df_query.merge(docs_df, on =["DocumentID"])
df_complete

Unnamed: 0,QID,Question,DocumentID,DocumentName,RelevantPassages,docno,text
0,3086,What is the role of conversionism in Evangelic...,672,Evangelicalism.html,4,67242,They claimed major credit for the election of ...
1,3086,What is the role of conversionism in Evangelic...,672,Evangelicalism.html,4,67248,Evangelicalism was a major force in the Anglic...
2,3086,What is the role of conversionism in Evangelic...,672,Evangelicalism.html,4,67243,The rich and the poor remained traditional Cat...
3,3086,What is the role of conversionism in Evangelic...,672,Evangelicalism.html,4,67249,"The Evangelical Alliance, formed in 1846, was ..."
4,3086,What is the role of conversionism in Evangelic...,672,Evangelicalism.html,4,67224,"By 1737, Whitefield had become a national cele..."
...,...,...,...,...,...,...,...
244131,2570,How was the kalmar Union formed?,553,Norway.html,18192021,55372,The budgetary rule is to spend no more than 4%...
244132,2570,How was the kalmar Union formed?,553,Norway.html,18192021,55371,Norway is the fifth-largest oil exporter and t...
244133,2570,How was the kalmar Union formed?,553,Norway.html,18192021,55370,"In 2011, 28% of state revenues were generated ..."
244134,2570,How was the kalmar Union formed?,553,Norway.html,18192021,55379,The two most prominent are the E6 going north-...


In [11]:
# cree list relevant
relevant = []
for row in df_complete.iterrows():
  passage = row[1]['RelevantPassages'].split(",")
  doc_passage = row[1]['docno'].split(",")[1] # doc,pass -> [1]
  if doc_passage in passage:
    relevant.append(1)
  else:
    relevant.append(0)

df_complete["label"] = relevant
qrel = df_complete[["QID", "docno","label"]]
qrel.columns = ['qid', 'docno', 'label']
qrel = qrel.astype({"qid": str, "docno": object, "label":object}) #qid en string pour pt.Experiment
qrel

Unnamed: 0,qid,docno,label
0,3086,67242,0
1,3086,67248,0
2,3086,67243,0
3,3086,67249,0
4,3086,67224,0
...,...,...,...
244131,2570,55372,0
244132,2570,55371,0
244133,2570,55370,0
244134,2570,55379,0


# 2 - Analyse des données

# 3 - Mesure de performances

### 3a) Préprétraitement

In [12]:
import string
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def prepretraiter(query):
    """
    Préprétraite une query
    """
    #wn_lemmatizer = nltk.stem.WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    stopwords_en = stopwords.words('english')
    punc = string.punctuation
    q_traite = []
    for q in query:
        q = q[0].lower()
        q = q.translate(str.maketrans(punc, ' '*len(punc)))
        q = word_tokenize(q)
        q = ' '.join([w for w in q if w not in stopwords_en])
        #q = ' '.join(map(lambda x: wn_lemmatizer.lemmatize(x), word_tokenize(q)))
        q = ' '.join(map(lambda x: stemmer.stem(x), word_tokenize(q)))
        q_traite.append(q)
    return q_traite

[nltk_data] Downloading package punkt to /home/admpc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/admpc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/admpc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def applique_pretraite (df) :
    """
    Applique le préprétraitement de query à un dataframe
    """
    liste_query=df[['query']].values.tolist()
    liste_query=prepretraiter(liste_query)
    df['query']=liste_query
    df = df.astype({"qid": str, "query": object})
    
    return df

In [14]:
queries_train=applique_pretraite(queries_train)
queries_test=applique_pretraite(queries_test)
queries_dev=applique_pretraite(queries_dev)

queries_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['query']=liste_query
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['query']=liste_query
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['query']=liste_query


Unnamed: 0,qid,query
0,449,iraq role polit unstabil
1,1140,warmest year global warm
2,782,olymp game broadcast
3,1421,elect turnout 2008
4,3738,secular misinterpret america
...,...,...
411,2522,church mean fallen natur
412,2851,histori eurasia
413,3705,nicknam missouri
414,799,endogloss exogloss languag coexist nigeria


### 3b) Apprentissage


BM25 :

In [25]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls={"bm25.b" : 0.75, "bm25.k_1": 0.75, "bm25.k_3": 0.75})
pt.GridSearch(
    bm25,
    {bm25: {"bm25.b"  : [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1 ],
            "bm25.k_1": [0.3, 0.6, 0.9, 1.2, 1.4, 1.6, 2],
            "bm25.k_3": [0.5, 2, 4, 6, 8, 10, 12, 14, 20]
    }},
    queries_train,
    qrel,
    "map")

  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)


KeyboardInterrupt: 

### 3c) Mesures

In [None]:
wc = pt.BatchRetrieve(index, wmodel="Tf")
wc_idf = pt.BatchRetrieve(index, wmodel="TF_IDF", norm = None)
tf_idf = pt.BatchRetrieve(index, wmodel="TF_IDF")
ql = pt.BatchRetrieve(index, wmodel="DirichletLM")


pt.Experiment(
    [wc, wc_idf, tf_idf, bm25, ql],
    queries_test,
    qrel,
    eval_metrics=["map", "recip_rank", "P_5","P_10","ndcg", "recall_5", "recall_10", "recall_20" ],
    names = ["WC", "WC.IDF", "VSM", "BM25", "QL"]
)