In [149]:
import json
import numpy as np
import re
import math

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

# Fetch the NLTK data packages used by this notebook: the 'stopwords' corpus
# (read through stopwords.words below) and 'punkt' (tokenizer data, presumably
# what word_tokenize needs -- confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexisprodel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexisprodel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Relative paths of the BoolQ JSON lines files read by load_jsonl below.
train_path = "./BoolQ dataset/train.jsonl"
val_path = "./BoolQ dataset/dev.jsonl"

In [5]:
def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON lines file (one JSON document per line).

    Args:
        data: iterable of JSON-serialisable objects. Any iterable is accepted,
            including generators, because records are counted while writing
            instead of calling len() on the input.
        output_path: path of the file to write.
        append: when True, add the records to the end of an existing file;
            otherwise the file is overwritten.
    """
    mode = 'a' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Blank (or whitespace-only) lines are skipped, so a trailing empty line
    does not abort the load with a JSONDecodeError.

    Args:
        input_path: path of the UTF-8 encoded JSON lines file.

    Returns:
        The parsed records, in file order.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip whitespace only; the record text itself is never altered.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [13]:
# Parse both files into lists of records (one JSON object per line).
train = load_jsonl(train_path)
validation = load_jsonl(val_path)

Loaded 9427 records from ./BoolQ dataset/train.jsonl
Loaded 3270 records from ./BoolQ dataset/dev.jsonl


<h1>Preprocessing</h1>

In [101]:
def convert_lower_case(data):
    """Lower-case a string or, element-wise, an array of strings (NumPy result)."""
    lowered = np.char.lower(data)
    return lowered

def remove_foreign_chars(data):
    """
    Delete every character that is not an ASCII letter, a digit or whitespace.

    Args:
        data: the text to clean (a single string).

    Returns:
        The text with all other characters removed (nothing is inserted in
        their place).
    """
    # Raw string: "\s" inside a normal literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r"[^a-zA-Z0-9\s]+", "", data)

def remove_stop_words(data):
    """
    Drop English stop words and single-character tokens from a text.

    Args:
        data: the text to filter; it is converted with str() before tokenizing.

    Returns:
        The kept tokens, each preceded by one space (so a non-empty result
        starts with a space, and an empty one is "").
    """
    # A set makes each membership test O(1); the list returned by NLTK forced
    # a linear scan for every token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(str(data))
            if len(w) > 1 and w not in stop_words]
    # Same output format as before (leading space per token), built in one pass.
    return "".join(" " + w for w in kept)

def remove_punctuation(data):
    """
    Replace punctuation with spaces and delete commas and apostrophes.

    Args:
        data: a string or an array of strings (processed element-wise by the
            np.char routines).

    Returns:
        The NumPy result of the replacements. Each listed symbol (and the
        newline) becomes a space, with one "double space -> single space"
        pass after every symbol; commas and apostrophes are removed outright.
    """
    # "\\]" spells the backslash explicitly: a bare "\]" is an invalid escape
    # sequence in a normal string literal. The character set is unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    data = np.char.replace(data, "'", "")
    return data

def stemming(data):
    """
    Apply the Porter stemmer to every token of a text.

    The input is converted with str() and tokenized with word_tokenize; the
    result is the stems, each preceded by one space.
    """
    porter = PorterStemmer()
    stems = (porter.stem(token) for token in word_tokenize(str(data)))
    return "".join(" " + stem for stem in stems)

def convert_numbers(data):
    """
    Spell out integer tokens in words (e.g. via num2words) and keep the rest.

    Args:
        data: the text to convert; it is converted with str() before tokenizing.

    Returns:
        The NumPy result of np.char.replace on the rebuilt text: every token
        preceded by one space, with hyphens replaced by spaces.
    """
    words = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # Not an integer literal, or too large to spell out: keep the
            # token unchanged. Other exceptions are no longer swallowed.
            pass
        words.append(w)
    new_text = "".join(" " + w for w in words)
    return np.char.replace(new_text, "-", " ")

In [102]:
def preprocess(data):
    """
    Run the text-cleaning pipeline over an array of strings.

    Steps, in order: lower-casing, punctuation removal, removal of characters
    outside [a-zA-Z0-9 and whitespace], stop-word removal, number spelling,
    stemming, and a final punctuation removal.
    """
    cleaned = remove_punctuation(convert_lower_case(data))
    # These steps take one text at a time, so each is mapped over the array.
    per_text_steps = (remove_foreign_chars, remove_stop_words,
                      convert_numbers, stemming)
    for step in per_text_steps:
        cleaned = np.array([step(text) for text in cleaned])
    return remove_punctuation(cleaned)

In [190]:
# Passages and questions of the training records, plus the cleaned passages
# that the TF-IDF cells below are built from.
train_contexts = np.array([record["passage"] for record in train])
train_questions = np.array([record["question"] for record in train])
train_preprocessed = preprocess(train_contexts)

<h1>Similarity approach: TF-IDF & cosine similarity</h1>

L'approche retenue pour sélectionner le contexte correspondant à une question consiste à calculer un score de similarité entre la question et chacun des contextes du dataset. Le contexte retenu est celui qui obtient le meilleur score. On utilise ici la similarité cosinus (cosine similarity) entre le vecteur TF-IDF de la question et celui de chaque contexte.

In [175]:
# Document frequency: for every word, the number of preprocessed passages
# that contain it. Empty strings produced by split(" ") are ignored.
doc_ids_by_word = {}

for i, passage in enumerate(train_preprocessed):
    for word in passage.split(" "):
        if word != "":
            # setdefault replaces the former bare try/except insertion.
            doc_ids_by_word.setdefault(word, set()).add(i)

DF = {word: len(doc_ids) for word, doc_ids in doc_ids_by_word.items()}
del doc_ids_by_word  # the per-word sets are no longer needed

total_vocab_size = len(DF)
total_vocab = list(DF)

In [176]:
def doc_freq(word):
    """
    Return the document frequency recorded for `word` in DF.

    Returns 0 when the word is not a key of DF. Unlike the former bare
    `except`, unrelated errors (e.g. DF not being defined yet) are not hidden.
    """
    return DF.get(word, 0)

In [177]:
# TF-IDF weight of every (document index, token) pair of the preprocessed
# training passages.
tf_idf = {}
n_docs = len(train_preprocessed)

for doc, passage in enumerate(train_preprocessed):
    # split(" ") yields "" for leading or repeated spaces. Those are not
    # words: the document-frequency cell skips them, so they have no
    # vocabulary entry, and counting them would inflate the TF denominator.
    tokens = [token for token in passage.split(" ") if token != ""]

    counter = Counter(tokens)
    words_count = len(tokens)

    for token, count in counter.items():
        tf = count / words_count
        df = doc_freq(token)
        # Smoothed IDF: the +1 terms avoid a division by zero.
        idf = np.log((n_docs + 1) / (df + 1))

        tf_idf[doc, token] = tf * idf

In [179]:
def cosine_sim(a, b):
    """
    Cosine similarity between two vectors.

    Args:
        a, b: 1-D numeric arrays of the same length.

    Returns:
        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm.
        The zero-norm case previously produced NaN (0/0), which is not a
        usable score for ranking.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [180]:
# Dense document-term matrix: row = passage index, column = position of the
# token in total_vocab, value = TF-IDF weight.
D = np.zeros((len(train_preprocessed), total_vocab_size))

# One dict lookup per entry instead of a linear total_vocab.index() scan.
vocab_index = {word: col for col, word in enumerate(total_vocab)}

for (doc_id, token), weight in tf_idf.items():
    col = vocab_index.get(token)
    if col is None:
        # Token without a vocabulary column (e.g. an empty-string token):
        # skip it instead of failing with ValueError.
        continue
    D[doc_id, col] = weight

In [184]:
def gen_vector(tokens):
    """
    Build the TF-IDF vector of a tokenized query over total_vocab.

    Args:
        tokens: list of (already preprocessed) query tokens.

    Returns:
        A 1-D array of length len(total_vocab). Tokens that are not in
        total_vocab contribute nothing, so a query with no known token
        gives an all-zero vector.
    """
    Q = np.zeros((len(total_vocab)))

    counter = Counter(tokens)
    words_count = len(tokens)
    n_docs = len(train_contexts)

    for token, count in counter.items():
        try:
            ind = total_vocab.index(token)
        except ValueError:
            # Out-of-vocabulary token: no column to fill.
            continue

        tf = count / words_count
        # Same smoothed IDF as the one used for the passages.
        idf = math.log((n_docs + 1) / (doc_freq(token) + 1))
        Q[ind] = tf * idf
    return Q

In [187]:
def cosine_similarity(k, query):
    """
    Return the row indices of D of the k passages most similar to `query`.

    The query goes through preprocess, is tokenized and turned into a TF-IDF
    vector, then scored against every row of D with cosine_sim. Indices are
    returned by decreasing score.
    """
    cleaned_query = preprocess(np.array([query]))[0]
    query_vector = gen_vector(word_tokenize(str(cleaned_query)))

    scores = np.array([cosine_sim(query_vector, doc_vector) for doc_vector in D])

    # argsort is ascending: keep the last k entries and reverse them.
    ranking = scores.argsort()
    return ranking[-k:][::-1]
    

In [209]:
def result_mat(queries, k):
    """
    Retrieve the top-k passage indices for every query.

    Returns a (len(queries), k) array whose row i holds the result of
    cosine_similarity for queries[i]. The query index is printed every
    100 queries as a progress indicator.
    """
    n_queries = len(queries)
    res = np.zeros((n_queries, k))

    for idx in range(n_queries):
        if not idx % 100:
            print(idx)
        res[idx] = cosine_similarity(k, queries[idx])

    return res


In [210]:
def compute_k_accuracy(queries, k):
    """
    Fraction of queries whose own index appears among their top-k results.

    Query i counts as a hit when i is one of the k indices that result_mat
    returns for it.
    """
    ranked = result_mat(queries, k)
    hits = sum(1 for idx, row in enumerate(ranked) if idx in row)
    return hits / len(ranked)
    

In [None]:
# Top-1 accuracy on the training questions: share of questions whose
# best-scoring passage index equals the question's own index.
compute_k_accuracy(train_questions, 1)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
